Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for libc
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_libc.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include <limits.h>
15 : #include <wctype.h>
16 :
17 : #include "access/htup_details.h"
18 : #include "catalog/pg_database.h"
19 : #include "catalog/pg_collation.h"
20 : #include "mb/pg_wchar.h"
21 : #include "miscadmin.h"
22 : #include "utils/builtins.h"
23 : #include "utils/formatting.h"
24 : #include "utils/memutils.h"
25 : #include "utils/pg_locale.h"
26 : #include "utils/syscache.h"
27 :
28 : #ifdef __GLIBC__
29 : #include <gnu/libc-version.h>
30 : #endif
31 :
32 : #ifdef WIN32
33 : #include <shlwapi.h>
34 : #endif
35 :
36 : /*
37 : * For the libc provider, to provide as much functionality as possible on a
38 : * variety of platforms without going so far as to implement everything from
39 : * scratch, we use several implementation strategies depending on the
40 : * situation:
41 : *
42 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 : * collations don't give a fig about multibyte characters.
45 : *
46 : * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 : * This assumes that every platform uses Unicode codepoints directly
48 : * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption
49 : * even for non-UTF8 encodings, which may be a problem.) On some platforms
50 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
51 : *
52 : * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
53 : * values up to 255, and punt for values above that. This is 100% correct
54 : * only in single-byte encodings such as LATINn. However, non-Unicode
55 : * multibyte encodings are mostly Far Eastern character sets for which the
56 : * properties being tested here aren't very relevant for higher code values
57 : * anyway. The difficulty with using the <wctype.h> functions with
58 : * non-Unicode multibyte encodings is that we can have no certainty that
59 : * the platform's wchar_t representation matches what we do in pg_wchar
60 : * conversions.
61 : *
62 : * As a special case, in the "default" collation, (2) and (3) force ASCII
63 : * letters to follow ASCII upcase/downcase rules, while in a non-default
64 : * collation we just let the library functions do what they will. The case
65 : * where this matters is treatment of I/i in Turkish, and the behavior is
66 : * meant to match the upper()/lower() SQL functions.
67 : *
68 : * We store the active collation setting in static variables. In principle
69 : * it could be passed down to here via the regex library's "struct vars" data
70 : * structure; but that would require somewhat invasive changes in the regex
71 : * library, and right now there's no real benefit to be gained from that.
72 : *
73 : * NB: the coding here assumes pg_wchar is an unsigned type.
74 : */
75 :
76 : /*
77 : * Size of stack buffer to use for string transformations, used to avoid heap
78 : * allocations in typical cases. This should be large enough that most strings
79 : * will fit, but small enough that we feel comfortable putting it on the
80 : * stack.
81 : */
82 : #define TEXTBUFLEN 1024
83 :
84 : extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
85 :
86 : static int strncoll_libc(const char *arg1, ssize_t len1,
87 : const char *arg2, ssize_t len2,
88 : pg_locale_t locale);
89 : static size_t strnxfrm_libc(char *dest, size_t destsize,
90 : const char *src, ssize_t srclen,
91 : pg_locale_t locale);
92 : extern char *get_collation_actual_version_libc(const char *collcollate);
93 : static locale_t make_libc_collator(const char *collate,
94 : const char *ctype);
95 :
96 : #ifdef WIN32
97 : static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
98 : const char *arg2, ssize_t len2,
99 : pg_locale_t locale);
100 : #endif
101 :
102 : static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
103 : size_t fromlen, locale_t loc);
104 :
105 : static size_t strlower_libc_sb(char *dest, size_t destsize,
106 : const char *src, ssize_t srclen,
107 : pg_locale_t locale);
108 : static size_t strlower_libc_mb(char *dest, size_t destsize,
109 : const char *src, ssize_t srclen,
110 : pg_locale_t locale);
111 : static size_t strtitle_libc_sb(char *dest, size_t destsize,
112 : const char *src, ssize_t srclen,
113 : pg_locale_t locale);
114 : static size_t strtitle_libc_mb(char *dest, size_t destsize,
115 : const char *src, ssize_t srclen,
116 : pg_locale_t locale);
117 : static size_t strupper_libc_sb(char *dest, size_t destsize,
118 : const char *src, ssize_t srclen,
119 : pg_locale_t locale);
120 : static size_t strupper_libc_mb(char *dest, size_t destsize,
121 : const char *src, ssize_t srclen,
122 : pg_locale_t locale);
123 :
124 : static bool
125 0 : wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
126 : {
127 0 : return isdigit_l((unsigned char) wc, locale->lt);
128 : }
129 :
130 : static bool
131 0 : wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
132 : {
133 0 : return isalpha_l((unsigned char) wc, locale->lt);
134 : }
135 :
136 : static bool
137 0 : wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
138 : {
139 0 : return isalnum_l((unsigned char) wc, locale->lt);
140 : }
141 :
142 : static bool
143 0 : wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
144 : {
145 0 : return isupper_l((unsigned char) wc, locale->lt);
146 : }
147 :
148 : static bool
149 0 : wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
150 : {
151 0 : return islower_l((unsigned char) wc, locale->lt);
152 : }
153 :
154 : static bool
155 0 : wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
156 : {
157 0 : return isgraph_l((unsigned char) wc, locale->lt);
158 : }
159 :
160 : static bool
161 0 : wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
162 : {
163 0 : return isprint_l((unsigned char) wc, locale->lt);
164 : }
165 :
166 : static bool
167 0 : wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
168 : {
169 0 : return ispunct_l((unsigned char) wc, locale->lt);
170 : }
171 :
172 : static bool
173 0 : wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
174 : {
175 0 : return isspace_l((unsigned char) wc, locale->lt);
176 : }
177 :
178 : static bool
179 0 : wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
180 : {
181 : #ifndef WIN32
182 0 : return isxdigit_l((unsigned char) wc, locale->lt);
183 : #else
184 : return _isxdigit_l((unsigned char) wc, locale->lt);
185 : #endif
186 : }
187 :
188 : static bool
189 131608 : wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
190 : {
191 131608 : return iswdigit_l((wint_t) wc, locale->lt);
192 : }
193 :
194 : static bool
195 81148 : wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
196 : {
197 81148 : return iswalpha_l((wint_t) wc, locale->lt);
198 : }
199 :
200 : static bool
201 2845650 : wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
202 : {
203 2845650 : return iswalnum_l((wint_t) wc, locale->lt);
204 : }
205 :
206 : static bool
207 4112 : wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
208 : {
209 4112 : return iswupper_l((wint_t) wc, locale->lt);
210 : }
211 :
212 : static bool
213 4102 : wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
214 : {
215 4102 : return iswlower_l((wint_t) wc, locale->lt);
216 : }
217 :
218 : static bool
219 4102 : wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
220 : {
221 4102 : return iswgraph_l((wint_t) wc, locale->lt);
222 : }
223 :
224 : static bool
225 4102 : wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
226 : {
227 4102 : return iswprint_l((wint_t) wc, locale->lt);
228 : }
229 :
230 : static bool
231 4102 : wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
232 : {
233 4102 : return iswpunct_l((wint_t) wc, locale->lt);
234 : }
235 :
236 : static bool
237 48152 : wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
238 : {
239 48152 : return iswspace_l((wint_t) wc, locale->lt);
240 : }
241 :
242 : static bool
243 12 : wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
244 : {
245 : #ifndef WIN32
246 12 : return iswxdigit_l((wint_t) wc, locale->lt);
247 : #else
248 : return _iswxdigit_l((wint_t) wc, locale->lt);
249 : #endif
250 : }
251 :
252 : static char
253 0 : char_tolower_libc(unsigned char ch, pg_locale_t locale)
254 : {
255 : Assert(pg_database_encoding_max_length() == 1);
256 0 : return tolower_l(ch, locale->lt);
257 : }
258 :
259 : static bool
260 0 : char_is_cased_libc(char ch, pg_locale_t locale)
261 : {
262 0 : bool is_multibyte = pg_database_encoding_max_length() > 1;
263 :
264 0 : if (is_multibyte && IS_HIGHBIT_SET(ch))
265 0 : return true;
266 : else
267 0 : return isalpha_l((unsigned char) ch, locale->lt);
268 : }
269 :
270 : static pg_wchar
271 0 : toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
272 : {
273 : Assert(GetDatabaseEncoding() != PG_UTF8);
274 :
275 : /* force C behavior for ASCII characters, per comments above */
276 0 : if (locale->is_default && wc <= (pg_wchar) 127)
277 0 : return pg_ascii_toupper((unsigned char) wc);
278 0 : if (wc <= (pg_wchar) UCHAR_MAX)
279 0 : return toupper_l((unsigned char) wc, locale->lt);
280 : else
281 0 : return wc;
282 : }
283 :
284 : static pg_wchar
285 9088 : toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
286 : {
287 : Assert(GetDatabaseEncoding() == PG_UTF8);
288 :
289 : /* force C behavior for ASCII characters, per comments above */
290 9088 : if (locale->is_default && wc <= (pg_wchar) 127)
291 892 : return pg_ascii_toupper((unsigned char) wc);
292 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
293 8196 : return towupper_l((wint_t) wc, locale->lt);
294 : else
295 : return wc;
296 : }
297 :
298 : static pg_wchar
299 0 : tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
300 : {
301 : Assert(GetDatabaseEncoding() != PG_UTF8);
302 :
303 : /* force C behavior for ASCII characters, per comments above */
304 0 : if (locale->is_default && wc <= (pg_wchar) 127)
305 0 : return pg_ascii_tolower((unsigned char) wc);
306 0 : if (wc <= (pg_wchar) UCHAR_MAX)
307 0 : return tolower_l((unsigned char) wc, locale->lt);
308 : else
309 0 : return wc;
310 : }
311 :
312 : static pg_wchar
313 9092 : tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
314 : {
315 : Assert(GetDatabaseEncoding() == PG_UTF8);
316 :
317 : /* force C behavior for ASCII characters, per comments above */
318 9092 : if (locale->is_default && wc <= (pg_wchar) 127)
319 896 : return pg_ascii_tolower((unsigned char) wc);
320 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
321 8196 : return towlower_l((wint_t) wc, locale->lt);
322 : else
323 : return wc;
324 : }
325 :
326 : static const struct ctype_methods ctype_methods_libc_sb = {
327 : .strlower = strlower_libc_sb,
328 : .strtitle = strtitle_libc_sb,
329 : .strupper = strupper_libc_sb,
330 : .wc_isdigit = wc_isdigit_libc_sb,
331 : .wc_isalpha = wc_isalpha_libc_sb,
332 : .wc_isalnum = wc_isalnum_libc_sb,
333 : .wc_isupper = wc_isupper_libc_sb,
334 : .wc_islower = wc_islower_libc_sb,
335 : .wc_isgraph = wc_isgraph_libc_sb,
336 : .wc_isprint = wc_isprint_libc_sb,
337 : .wc_ispunct = wc_ispunct_libc_sb,
338 : .wc_isspace = wc_isspace_libc_sb,
339 : .wc_isxdigit = wc_isxdigit_libc_sb,
340 : .char_is_cased = char_is_cased_libc,
341 : .char_tolower = char_tolower_libc,
342 : .wc_toupper = toupper_libc_sb,
343 : .wc_tolower = tolower_libc_sb,
344 : .max_chr = UCHAR_MAX,
345 : };
346 :
347 : /*
348 : * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
349 : * single-byte semantics for pattern matching.
350 : */
351 : static const struct ctype_methods ctype_methods_libc_other_mb = {
352 : .strlower = strlower_libc_mb,
353 : .strtitle = strtitle_libc_mb,
354 : .strupper = strupper_libc_mb,
355 : .wc_isdigit = wc_isdigit_libc_sb,
356 : .wc_isalpha = wc_isalpha_libc_sb,
357 : .wc_isalnum = wc_isalnum_libc_sb,
358 : .wc_isupper = wc_isupper_libc_sb,
359 : .wc_islower = wc_islower_libc_sb,
360 : .wc_isgraph = wc_isgraph_libc_sb,
361 : .wc_isprint = wc_isprint_libc_sb,
362 : .wc_ispunct = wc_ispunct_libc_sb,
363 : .wc_isspace = wc_isspace_libc_sb,
364 : .wc_isxdigit = wc_isxdigit_libc_sb,
365 : .char_is_cased = char_is_cased_libc,
366 : .char_tolower = char_tolower_libc,
367 : .wc_toupper = toupper_libc_sb,
368 : .wc_tolower = tolower_libc_sb,
369 : .max_chr = UCHAR_MAX,
370 : };
371 :
372 : static const struct ctype_methods ctype_methods_libc_utf8 = {
373 : .strlower = strlower_libc_mb,
374 : .strtitle = strtitle_libc_mb,
375 : .strupper = strupper_libc_mb,
376 : .wc_isdigit = wc_isdigit_libc_mb,
377 : .wc_isalpha = wc_isalpha_libc_mb,
378 : .wc_isalnum = wc_isalnum_libc_mb,
379 : .wc_isupper = wc_isupper_libc_mb,
380 : .wc_islower = wc_islower_libc_mb,
381 : .wc_isgraph = wc_isgraph_libc_mb,
382 : .wc_isprint = wc_isprint_libc_mb,
383 : .wc_ispunct = wc_ispunct_libc_mb,
384 : .wc_isspace = wc_isspace_libc_mb,
385 : .wc_isxdigit = wc_isxdigit_libc_mb,
386 : .char_is_cased = char_is_cased_libc,
387 : .char_tolower = char_tolower_libc,
388 : .wc_toupper = toupper_libc_mb,
389 : .wc_tolower = tolower_libc_mb,
390 : };
391 :
392 : static const struct collate_methods collate_methods_libc = {
393 : .strncoll = strncoll_libc,
394 : .strnxfrm = strnxfrm_libc,
395 : .strnxfrm_prefix = NULL,
396 :
397 : /*
398 : * Unfortunately, it seems that strxfrm() for non-C collations is broken
399 : * on many common platforms; testing of multiple versions of glibc reveals
400 : * that, for many locales, strcoll() and strxfrm() do not return
401 : * consistent results. While no other libc other than Cygwin has so far
402 : * been shown to have a problem, we take the conservative course of action
403 : * for right now and disable this categorically. (Users who are certain
404 : * this isn't a problem on their system can define TRUST_STRXFRM.)
405 : */
406 : #ifdef TRUST_STRXFRM
407 : .strxfrm_is_safe = true,
408 : #else
409 : .strxfrm_is_safe = false,
410 : #endif
411 : };
412 :
#ifdef WIN32
/*
 * Collation method table for Windows with a UTF8 database encoding: same as
 * the generic libc table except that comparison goes through
 * strncoll_libc_win32_utf8().
 */
static const struct collate_methods collate_methods_libc_win32_utf8 = {
#ifdef TRUST_STRXFRM
	.strxfrm_is_safe = true,
#else
	.strxfrm_is_safe = false,
#endif
	.strnxfrm_prefix = NULL,
	.strnxfrm = strnxfrm_libc,
	.strncoll = strncoll_libc_win32_utf8,
};
#endif
425 :
426 : static size_t
427 0 : strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
428 : pg_locale_t locale)
429 : {
430 0 : if (srclen < 0)
431 0 : srclen = strlen(src);
432 :
433 0 : if (srclen + 1 <= destsize)
434 : {
435 0 : locale_t loc = locale->lt;
436 : char *p;
437 :
438 0 : if (srclen + 1 > destsize)
439 0 : return srclen;
440 :
441 0 : memcpy(dest, src, srclen);
442 0 : dest[srclen] = '\0';
443 :
444 : /*
445 : * Note: we assume that tolower_l() will not be so broken as to need
446 : * an isupper_l() guard test. When using the default collation, we
447 : * apply the traditional Postgres behavior that forces ASCII-style
448 : * treatment of I/i, but in non-default collations you get exactly
449 : * what the collation says.
450 : */
451 0 : for (p = dest; *p; p++)
452 : {
453 0 : if (locale->is_default)
454 0 : *p = pg_tolower((unsigned char) *p);
455 : else
456 0 : *p = tolower_l((unsigned char) *p, loc);
457 : }
458 : }
459 :
460 0 : return srclen;
461 : }
462 :
463 : static size_t
464 424990 : strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
465 : pg_locale_t locale)
466 : {
467 424990 : locale_t loc = locale->lt;
468 : size_t result_size;
469 : wchar_t *workspace;
470 : char *result;
471 : size_t curr_char;
472 : size_t max_size;
473 :
474 424990 : if (srclen < 0)
475 0 : srclen = strlen(src);
476 :
477 : /* Overflow paranoia */
478 424990 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
479 0 : ereport(ERROR,
480 : (errcode(ERRCODE_OUT_OF_MEMORY),
481 : errmsg("out of memory")));
482 :
483 : /* Output workspace cannot have more codes than input bytes */
484 424990 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
485 :
486 424990 : char2wchar(workspace, srclen + 1, src, srclen, loc);
487 :
488 3669304 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
489 3244314 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
490 :
491 : /*
492 : * Make result large enough; case change might change number of bytes
493 : */
494 424990 : max_size = curr_char * pg_database_encoding_max_length();
495 424990 : result = palloc(max_size + 1);
496 :
497 424990 : result_size = wchar2char(result, workspace, max_size + 1, loc);
498 :
499 424990 : if (result_size + 1 > destsize)
500 0 : return result_size;
501 :
502 424990 : memcpy(dest, result, result_size);
503 424990 : dest[result_size] = '\0';
504 :
505 424990 : pfree(workspace);
506 424990 : pfree(result);
507 :
508 424990 : return result_size;
509 : }
510 :
511 : static size_t
512 0 : strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
513 : pg_locale_t locale)
514 : {
515 0 : if (srclen < 0)
516 0 : srclen = strlen(src);
517 :
518 0 : if (srclen + 1 <= destsize)
519 : {
520 0 : locale_t loc = locale->lt;
521 0 : int wasalnum = false;
522 : char *p;
523 :
524 0 : memcpy(dest, src, srclen);
525 0 : dest[srclen] = '\0';
526 :
527 : /*
528 : * Note: we assume that toupper_l()/tolower_l() will not be so broken
529 : * as to need guard tests. When using the default collation, we apply
530 : * the traditional Postgres behavior that forces ASCII-style treatment
531 : * of I/i, but in non-default collations you get exactly what the
532 : * collation says.
533 : */
534 0 : for (p = dest; *p; p++)
535 : {
536 0 : if (locale->is_default)
537 : {
538 0 : if (wasalnum)
539 0 : *p = pg_tolower((unsigned char) *p);
540 : else
541 0 : *p = pg_toupper((unsigned char) *p);
542 : }
543 : else
544 : {
545 0 : if (wasalnum)
546 0 : *p = tolower_l((unsigned char) *p, loc);
547 : else
548 0 : *p = toupper_l((unsigned char) *p, loc);
549 : }
550 0 : wasalnum = isalnum_l((unsigned char) *p, loc);
551 : }
552 : }
553 :
554 0 : return srclen;
555 : }
556 :
557 : static size_t
558 8 : strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
559 : pg_locale_t locale)
560 : {
561 8 : locale_t loc = locale->lt;
562 8 : int wasalnum = false;
563 : size_t result_size;
564 : wchar_t *workspace;
565 : char *result;
566 : size_t curr_char;
567 : size_t max_size;
568 :
569 8 : if (srclen < 0)
570 0 : srclen = strlen(src);
571 :
572 : /* Overflow paranoia */
573 8 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
574 0 : ereport(ERROR,
575 : (errcode(ERRCODE_OUT_OF_MEMORY),
576 : errmsg("out of memory")));
577 :
578 : /* Output workspace cannot have more codes than input bytes */
579 8 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
580 :
581 8 : char2wchar(workspace, srclen + 1, src, srclen, loc);
582 :
583 80 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
584 : {
585 72 : if (wasalnum)
586 56 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
587 : else
588 16 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
589 72 : wasalnum = iswalnum_l(workspace[curr_char], loc);
590 : }
591 :
592 : /*
593 : * Make result large enough; case change might change number of bytes
594 : */
595 8 : max_size = curr_char * pg_database_encoding_max_length();
596 8 : result = palloc(max_size + 1);
597 :
598 8 : result_size = wchar2char(result, workspace, max_size + 1, loc);
599 :
600 8 : if (result_size + 1 > destsize)
601 0 : return result_size;
602 :
603 8 : memcpy(dest, result, result_size);
604 8 : dest[result_size] = '\0';
605 :
606 8 : pfree(workspace);
607 8 : pfree(result);
608 :
609 8 : return result_size;
610 : }
611 :
612 : static size_t
613 0 : strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
614 : pg_locale_t locale)
615 : {
616 0 : if (srclen < 0)
617 0 : srclen = strlen(src);
618 :
619 0 : if (srclen + 1 <= destsize)
620 : {
621 0 : locale_t loc = locale->lt;
622 : char *p;
623 :
624 0 : memcpy(dest, src, srclen);
625 0 : dest[srclen] = '\0';
626 :
627 : /*
628 : * Note: we assume that toupper_l() will not be so broken as to need
629 : * an islower_l() guard test. When using the default collation, we
630 : * apply the traditional Postgres behavior that forces ASCII-style
631 : * treatment of I/i, but in non-default collations you get exactly
632 : * what the collation says.
633 : */
634 0 : for (p = dest; *p; p++)
635 : {
636 0 : if (locale->is_default)
637 0 : *p = pg_toupper((unsigned char) *p);
638 : else
639 0 : *p = toupper_l((unsigned char) *p, loc);
640 : }
641 : }
642 :
643 0 : return srclen;
644 : }
645 :
646 : static size_t
647 719232 : strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
648 : pg_locale_t locale)
649 : {
650 719232 : locale_t loc = locale->lt;
651 : size_t result_size;
652 : wchar_t *workspace;
653 : char *result;
654 : size_t curr_char;
655 : size_t max_size;
656 :
657 719232 : if (srclen < 0)
658 0 : srclen = strlen(src);
659 :
660 : /* Overflow paranoia */
661 719232 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
662 0 : ereport(ERROR,
663 : (errcode(ERRCODE_OUT_OF_MEMORY),
664 : errmsg("out of memory")));
665 :
666 : /* Output workspace cannot have more codes than input bytes */
667 719232 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
668 :
669 719232 : char2wchar(workspace, srclen + 1, src, srclen, loc);
670 :
671 2367270 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
672 1648038 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
673 :
674 : /*
675 : * Make result large enough; case change might change number of bytes
676 : */
677 719232 : max_size = curr_char * pg_database_encoding_max_length();
678 719232 : result = palloc(max_size + 1);
679 :
680 719232 : result_size = wchar2char(result, workspace, max_size + 1, loc);
681 :
682 719232 : if (result_size + 1 > destsize)
683 0 : return result_size;
684 :
685 719232 : memcpy(dest, result, result_size);
686 719232 : dest[result_size] = '\0';
687 :
688 719232 : pfree(workspace);
689 719232 : pfree(result);
690 :
691 719232 : return result_size;
692 : }
693 :
694 : pg_locale_t
695 35304 : create_pg_locale_libc(Oid collid, MemoryContext context)
696 : {
697 : const char *collate;
698 : const char *ctype;
699 : locale_t loc;
700 : pg_locale_t result;
701 :
702 35304 : if (collid == DEFAULT_COLLATION_OID)
703 : {
704 : HeapTuple tp;
705 : Datum datum;
706 :
707 31256 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
708 31256 : if (!HeapTupleIsValid(tp))
709 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
710 31256 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
711 : Anum_pg_database_datcollate);
712 31256 : collate = TextDatumGetCString(datum);
713 31256 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
714 : Anum_pg_database_datctype);
715 31256 : ctype = TextDatumGetCString(datum);
716 :
717 31256 : ReleaseSysCache(tp);
718 : }
719 : else
720 : {
721 : HeapTuple tp;
722 : Datum datum;
723 :
724 4048 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
725 4048 : if (!HeapTupleIsValid(tp))
726 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
727 :
728 4048 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
729 : Anum_pg_collation_collcollate);
730 4048 : collate = TextDatumGetCString(datum);
731 4048 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
732 : Anum_pg_collation_collctype);
733 4048 : ctype = TextDatumGetCString(datum);
734 :
735 4048 : ReleaseSysCache(tp);
736 : }
737 :
738 :
739 35304 : loc = make_libc_collator(collate, ctype);
740 :
741 35304 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
742 35304 : result->deterministic = true;
743 65520 : result->collate_is_c = (strcmp(collate, "C") == 0) ||
744 30216 : (strcmp(collate, "POSIX") == 0);
745 65520 : result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
746 30216 : (strcmp(ctype, "POSIX") == 0);
747 35304 : result->lt = loc;
748 35304 : if (!result->collate_is_c)
749 : {
750 : #ifdef WIN32
751 : if (GetDatabaseEncoding() == PG_UTF8)
752 : result->collate = &collate_methods_libc_win32_utf8;
753 : else
754 : #endif
755 30152 : result->collate = &collate_methods_libc;
756 : }
757 35304 : if (!result->ctype_is_c)
758 : {
759 30152 : if (GetDatabaseEncoding() == PG_UTF8)
760 30088 : result->ctype = &ctype_methods_libc_utf8;
761 64 : else if (pg_database_encoding_max_length() > 1)
762 0 : result->ctype = &ctype_methods_libc_other_mb;
763 : else
764 64 : result->ctype = &ctype_methods_libc_sb;
765 : }
766 :
767 35304 : return result;
768 : }
769 :
770 : /*
771 : * Create a locale_t with the given collation and ctype.
772 : *
773 : * The "C" and "POSIX" locales are not actually handled by libc, so return
774 : * NULL.
775 : *
776 : * Ensure that no path leaks a locale_t.
777 : */
778 : static locale_t
779 35304 : make_libc_collator(const char *collate, const char *ctype)
780 : {
781 35304 : locale_t loc = 0;
782 :
783 35304 : if (strcmp(collate, ctype) == 0)
784 : {
785 35304 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
786 : {
787 : /* Normal case where they're the same */
788 30152 : errno = 0;
789 : #ifndef WIN32
790 30152 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
791 : NULL);
792 : #else
793 : loc = _create_locale(LC_ALL, collate);
794 : #endif
795 30152 : if (!loc)
796 0 : report_newlocale_failure(collate);
797 : }
798 : }
799 : else
800 : {
801 : #ifndef WIN32
802 : /* We need two newlocale() steps */
803 0 : locale_t loc1 = 0;
804 :
805 0 : if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
806 : {
807 0 : errno = 0;
808 0 : loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
809 0 : if (!loc1)
810 0 : report_newlocale_failure(collate);
811 : }
812 :
813 0 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
814 : {
815 0 : errno = 0;
816 0 : loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
817 0 : if (!loc)
818 : {
819 0 : if (loc1)
820 0 : freelocale(loc1);
821 0 : report_newlocale_failure(ctype);
822 : }
823 : }
824 : else
825 0 : loc = loc1;
826 : #else
827 :
828 : /*
829 : * XXX The _create_locale() API doesn't appear to support this. Could
830 : * perhaps be worked around by changing pg_locale_t to contain two
831 : * separate fields.
832 : */
833 : ereport(ERROR,
834 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
835 : errmsg("collations with different collate and ctype values are not supported on this platform")));
836 : #endif
837 : }
838 :
839 35304 : return loc;
840 : }
841 :
842 : /*
843 : * strncoll_libc
844 : *
845 : * NUL-terminate arguments, if necessary, and pass to strcoll_l().
846 : *
847 : * An input string length of -1 means that it's already NUL-terminated.
848 : */
849 : int
850 30192280 : strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
851 : pg_locale_t locale)
852 : {
853 : char sbuf[TEXTBUFLEN];
854 30192280 : char *buf = sbuf;
855 30192280 : size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
856 30192280 : size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
857 : const char *arg1n;
858 : const char *arg2n;
859 : int result;
860 :
861 30192280 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
862 568 : buf = palloc(bufsize1 + bufsize2);
863 :
864 : /* nul-terminate arguments if necessary */
865 30192280 : if (len1 == -1)
866 : {
867 25768894 : arg1n = arg1;
868 : }
869 : else
870 : {
871 4423386 : char *buf1 = buf;
872 :
873 4423386 : memcpy(buf1, arg1, len1);
874 4423386 : buf1[len1] = '\0';
875 4423386 : arg1n = buf1;
876 : }
877 :
878 30192280 : if (len2 == -1)
879 : {
880 25768894 : arg2n = arg2;
881 : }
882 : else
883 : {
884 4423386 : char *buf2 = buf + bufsize1;
885 :
886 4423386 : memcpy(buf2, arg2, len2);
887 4423386 : buf2[len2] = '\0';
888 4423386 : arg2n = buf2;
889 : }
890 :
891 30192280 : result = strcoll_l(arg1n, arg2n, locale->lt);
892 :
893 30192280 : if (buf != sbuf)
894 568 : pfree(buf);
895 :
896 30192280 : return result;
897 : }
898 :
899 : /*
900 : * strnxfrm_libc
901 : *
902 : * NUL-terminate src, if necessary, and pass to strxfrm_l().
903 : *
904 : * A source length of -1 means that it's already NUL-terminated.
905 : */
906 : size_t
907 144 : strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
908 : pg_locale_t locale)
909 : {
910 : char sbuf[TEXTBUFLEN];
911 144 : char *buf = sbuf;
912 144 : size_t bufsize = srclen + 1;
913 : size_t result;
914 :
915 144 : if (srclen == -1)
916 144 : return strxfrm_l(dest, src, destsize, locale->lt);
917 :
918 0 : if (bufsize > TEXTBUFLEN)
919 0 : buf = palloc(bufsize);
920 :
921 : /* nul-terminate argument */
922 0 : memcpy(buf, src, srclen);
923 0 : buf[srclen] = '\0';
924 :
925 0 : result = strxfrm_l(dest, buf, destsize, locale->lt);
926 :
927 0 : if (buf != sbuf)
928 0 : pfree(buf);
929 :
930 : /* if dest is defined, it should be nul-terminated */
931 : Assert(result >= destsize || dest[result] == '\0');
932 :
933 0 : return result;
934 : }
935 :
/*
 * get_collation_actual_version_libc
 *
 * Return a palloc'd version string for the given libc collation name, or
 * NULL if none is available.
 *
 * "C", "POSIX" and names starting with "C." (all compared
 * case-insensitively) never get a version.  Otherwise the source of the
 * version depends on the platform: the glibc version, the FreeBSD collation
 * version from querylocale(), or the Windows NLS version.  On platforms
 * matching none of those, NULL is returned.
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *collversion = NULL;

	/* Nothing to report for C-like locales */
	if (pg_strcasecmp("C", collcollate) == 0 ||
		pg_strncasecmp("C.", collcollate, 2) == 0 ||
		pg_strcasecmp("POSIX", collcollate) == 0)
		return collversion;

#if defined(__GLIBC__)
	/* Use the glibc version because we don't have anything better. */
	collversion = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
	{
		locale_t	loc;

		/* Look up FreeBSD collation version. */
		loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
		if (!loc)
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));

		collversion =
			pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
		freelocale(loc);
	}
#elif defined(WIN32)
	{
		/*
		 * If we are targeting Windows Vista and above, we can ask for a name
		 * given a collation name (earlier versions required a location code
		 * that we don't have).
		 */
		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];

		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
							LOCALE_NAME_MAX_LENGTH);
		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
		{
			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until those
			 * values can be prevented from entering the system, or 100%
			 * reliably converted to the more useful tag format, tolerate the
			 * resulting error and report that we have no version data.
			 */
			if (GetLastError() == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							GetLastError())));
		}
		collversion = psprintf("%lu.%lu,%lu.%lu",
							   (version.dwNLSVersion >> 8) & 0xFFFF,
							   version.dwNLSVersion & 0xFF,
							   (version.dwDefinedVersion >> 8) & 0xFFFF,
							   version.dwDefinedVersion & 0xFF);
	}
#endif

	return collversion;
}
1000 :
/*
 * strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8.  Convert UTF8 arguments to wide characters with
 * MultiByteToWideChar() and compare them with wcscoll_l().
 *
 * An input string length of -1 means that it's NUL-terminated.
 *
 * Both converted strings share one scratch buffer (two bytes per input byte
 * plus a terminator each), taken from the stack when it fits in TEXTBUFLEN
 * bytes and palloc'd otherwise.
 */
#ifdef WIN32
static int
strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
						 ssize_t len2, pg_locale_t locale)
{
	char		stackbuf[TEXTBUFLEN];
	char	   *workbuf = stackbuf;
	LPWSTR		wide1;
	LPWSTR		wide2;
	int			bytes1;
	int			bytes2;
	int			nchars;
	int			cmp;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	if (len1 == -1)
		len1 = strlen(arg1);
	if (len2 == -1)
		len2 = strlen(arg2);

	bytes1 = len1 * 2 + 2;
	bytes2 = len2 * 2 + 2;

	if (bytes1 + bytes2 > TEXTBUFLEN)
		workbuf = palloc(bytes1 + bytes2);

	wide1 = (LPWSTR) workbuf;
	wide2 = (LPWSTR) (workbuf + bytes1);

	/* API does not work for zero-length input */
	nchars = 0;
	if (len1 != 0)
	{
		nchars = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
									 wide1, bytes1 / 2);
		if (!nchars)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	wide1[nchars] = 0;

	nchars = 0;
	if (len2 != 0)
	{
		nchars = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
									 wide2, bytes2 / 2);
		if (!nchars)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	wide2[nchars] = 0;

	errno = 0;
	cmp = wcscoll_l(wide1, wide2, locale->lt);
	if (cmp == 2147483647)		/* _NLSCMPERROR; missing from mingw headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (workbuf != stackbuf)
		pfree(workbuf);

	return cmp;
}
#endif							/* WIN32 */
1078 :
1079 : /* simple subroutine for reporting errors from newlocale() */
1080 : void
1081 0 : report_newlocale_failure(const char *localename)
1082 : {
1083 : int save_errno;
1084 :
1085 : /*
1086 : * Windows doesn't provide any useful error indication from
1087 : * _create_locale(), and BSD-derived platforms don't seem to feel they
1088 : * need to set errno either (even though POSIX is pretty clear that
1089 : * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1090 : * is what to report.
1091 : */
1092 0 : if (errno == 0)
1093 0 : errno = ENOENT;
1094 :
1095 : /*
1096 : * ENOENT means "no such locale", not "no such file", so clarify that
1097 : * errno with an errdetail message.
1098 : */
1099 0 : save_errno = errno; /* auxiliary funcs might change errno */
1100 0 : ereport(ERROR,
1101 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1102 : errmsg("could not create locale \"%s\": %m",
1103 : localename),
1104 : (save_errno == ENOENT ?
1105 : errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1106 : localename) : 0)));
1107 : }
1108 :
1109 : /*
1110 : * POSIX doesn't define _l-variants of these functions, but several systems
1111 : * have them. We provide our own replacements here.
1112 : */
1113 : #ifndef HAVE_MBSTOWCS_L
/*
 * Replacement for mbstowcs_l(): convert the multibyte string "src" into at
 * most "n" wide characters at "dest", using locale "loc".
 *
 * On Windows this forwards to _mbstowcs_l().  Elsewhere it installs "loc"
 * with uselocale(), calls mbstowcs(), and then reinstalls the locale that
 * uselocale() returned.  The return value is that of the underlying
 * conversion function.
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifndef WIN32
	locale_t	prev_locale;
	size_t		nconverted;

	prev_locale = uselocale(loc);
	nconverted = mbstowcs(dest, src, n);
	uselocale(prev_locale);

	return nconverted;
#else
	return _mbstowcs_l(dest, src, n, loc);
#endif
}
1128 : #endif
1129 : #ifndef HAVE_WCSTOMBS_L
/*
 * Replacement for wcstombs_l(): convert the wide string "src" into at most
 * "n" bytes of multibyte output at "dest", using locale "loc".
 *
 * On Windows this forwards to _wcstombs_l().  Elsewhere it installs "loc"
 * with uselocale(), calls wcstombs(), and then reinstalls the locale that
 * uselocale() returned.  The return value is that of the underlying
 * conversion function.
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifndef WIN32
	locale_t	prev_locale;
	size_t		nbytes;

	prev_locale = uselocale(loc);
	nbytes = wcstombs(dest, src, n);
	uselocale(prev_locale);

	return nbytes;
#else
	return _wcstombs_l(dest, src, n, loc);
#endif
}
1144 : #endif
1145 :
/*
 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
 * Therefore we keep them here rather than with the mbutils code.
 */
1150 :
/*
 * wchar2char --- convert wide characters to multibyte format
 *
 * The calling convention follows the standard wcstombs_l() function: "from"
 * must be zero-terminated, and "tolen" is the maximum number of bytes that
 * may be stored at "to".  The output is zero-terminated only if there is
 * room for the terminator.  A "tolen" of zero returns 0 without touching
 * anything.
 *
 * Returns the number of bytes stored, not counting any terminator, or
 * (size_t) -1 on conversion failure.
 */
size_t
wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
{
	if (tolen == 0)
		return 0;

#ifdef WIN32

	/*
	 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
	 * for some reason mbstowcs and wcstombs won't do this for us, so we use
	 * WideCharToMultiByte().
	 */
	if (GetDatabaseEncoding() == PG_UTF8)
	{
		size_t		nbytes;

		nbytes = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
									 NULL, NULL);

		/* A zero return is failure */
		if (nbytes == 0)
			return (size_t) -1;

		Assert(nbytes <= tolen);

		/* Microsoft counts the zero terminator in the result */
		return nbytes - 1;
	}
#endif							/* WIN32 */

	/* Use wcstombs directly for the default locale */
	if (loc == (locale_t) 0)
		return wcstombs(to, from, tolen);

	/* Use wcstombs_l for nondefault locales */
	return wcstombs_l(to, from, tolen, loc);
}
1202 :
1203 : /*
1204 : * char2wchar --- convert multibyte characters to wide characters
1205 : *
1206 : * This has almost the API of mbstowcs_l(), except that *from need not be
1207 : * null-terminated; instead, the number of input bytes is specified as
1208 : * fromlen. Also, we ereport() rather than returning -1 for invalid
1209 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1210 : * The output will be zero-terminated iff there is room.
1211 : */
1212 : static size_t
1213 1144230 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1214 : locale_t loc)
1215 : {
1216 : size_t result;
1217 :
1218 1144230 : if (tolen == 0)
1219 0 : return 0;
1220 :
1221 : #ifdef WIN32
1222 : /* See WIN32 "Unicode" comment above */
1223 : if (GetDatabaseEncoding() == PG_UTF8)
1224 : {
1225 : /* Win32 API does not work for zero-length input */
1226 : if (fromlen == 0)
1227 : result = 0;
1228 : else
1229 : {
1230 : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1231 : /* A zero return is failure */
1232 : if (result == 0)
1233 : result = -1;
1234 : }
1235 :
1236 : if (result != -1)
1237 : {
1238 : Assert(result < tolen);
1239 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
1240 : to[result] = 0;
1241 : }
1242 : }
1243 : else
1244 : #endif /* WIN32 */
1245 : {
1246 : /* mbstowcs requires ending '\0' */
1247 1144230 : char *str = pnstrdup(from, fromlen);
1248 :
1249 1144230 : if (loc == (locale_t) 0)
1250 : {
1251 : /* Use mbstowcs directly for the default locale */
1252 0 : result = mbstowcs(to, str, tolen);
1253 : }
1254 : else
1255 : {
1256 : /* Use mbstowcs_l for nondefault locales */
1257 1144230 : result = mbstowcs_l(to, str, tolen, loc);
1258 : }
1259 :
1260 1144230 : pfree(str);
1261 : }
1262 :
1263 1144230 : if (result == -1)
1264 : {
1265 : /*
1266 : * Invalid multibyte character encountered. We try to give a useful
1267 : * error message by letting pg_verifymbstr check the string. But it's
1268 : * possible that the string is OK to us, and not OK to mbstowcs ---
1269 : * this suggests that the LC_CTYPE locale is different from the
1270 : * database encoding. Give a generic error message if pg_verifymbstr
1271 : * can't find anything wrong.
1272 : */
1273 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
1274 : /* but if it does ... */
1275 0 : ereport(ERROR,
1276 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1277 : errmsg("invalid multibyte character for locale"),
1278 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1279 : }
1280 :
1281 1144230 : return result;
1282 : }
|