Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for libc
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_libc.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include <limits.h>
15 : #include <wctype.h>
16 :
17 : #include "access/htup_details.h"
18 : #include "catalog/pg_database.h"
19 : #include "catalog/pg_collation.h"
20 : #include "mb/pg_wchar.h"
21 : #include "miscadmin.h"
22 : #include "utils/builtins.h"
23 : #include "utils/formatting.h"
24 : #include "utils/memutils.h"
25 : #include "utils/pg_locale.h"
26 : #include "utils/syscache.h"
27 :
28 : #ifdef __GLIBC__
29 : #include <gnu/libc-version.h>
30 : #endif
31 :
32 : #ifdef WIN32
33 : #include <shlwapi.h>
34 : #endif
35 :
36 : /*
37 : * For the libc provider, to provide as much functionality as possible on a
38 : * variety of platforms without going so far as to implement everything from
39 : * scratch, we use several implementation strategies depending on the
40 : * situation:
41 : *
42 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 : * collations don't give a fig about multibyte characters.
45 : *
46 : * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 : * This assumes that every platform uses Unicode codepoints directly
48 : * as the wchar_t representation of Unicode. On some platforms
49 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
50 : *
51 : * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
52 : * values up to 255, and punt for values above that. This is 100% correct
53 : * only in single-byte encodings such as LATINn. However, non-Unicode
54 : * multibyte encodings are mostly Far Eastern character sets for which the
55 : * properties being tested here aren't very relevant for higher code values
56 : * anyway. The difficulty with using the <wctype.h> functions with
57 : * non-Unicode multibyte encodings is that we can have no certainty that
58 : * the platform's wchar_t representation matches what we do in pg_wchar
59 : * conversions.
60 : *
61 : * As a special case, in the "default" collation, (2) and (3) force ASCII
62 : * letters to follow ASCII upcase/downcase rules, while in a non-default
63 : * collation we just let the library functions do what they will. The case
64 : * where this matters is treatment of I/i in Turkish, and the behavior is
65 : * meant to match the upper()/lower() SQL functions.
66 : *
67 : * We store the active collation setting in static variables. In principle
68 : * it could be passed down to here via the regex library's "struct vars" data
69 : * structure; but that would require somewhat invasive changes in the regex
70 : * library, and right now there's no real benefit to be gained from that.
71 : *
72 : * NB: the coding here assumes pg_wchar is an unsigned type.
73 : */
74 :
75 : /*
76 : * Size of stack buffer to use for string transformations, used to avoid heap
77 : * allocations in typical cases. This should be large enough that most strings
78 : * will fit, but small enough that we feel comfortable putting it on the
79 : * stack.
80 : */
81 : #define TEXTBUFLEN 1024
82 :
83 : extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
84 :
85 : static int strncoll_libc(const char *arg1, ssize_t len1,
86 : const char *arg2, ssize_t len2,
87 : pg_locale_t locale);
88 : static size_t strnxfrm_libc(char *dest, size_t destsize,
89 : const char *src, ssize_t srclen,
90 : pg_locale_t locale);
91 : extern char *get_collation_actual_version_libc(const char *collcollate);
92 : static locale_t make_libc_collator(const char *collate,
93 : const char *ctype);
94 :
95 : #ifdef WIN32
96 : static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
97 : const char *arg2, ssize_t len2,
98 : pg_locale_t locale);
99 : #endif
100 :
101 : static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
102 : size_t fromlen, locale_t loc);
103 :
104 : static size_t strlower_libc_sb(char *dest, size_t destsize,
105 : const char *src, ssize_t srclen,
106 : pg_locale_t locale);
107 : static size_t strlower_libc_mb(char *dest, size_t destsize,
108 : const char *src, ssize_t srclen,
109 : pg_locale_t locale);
110 : static size_t strtitle_libc_sb(char *dest, size_t destsize,
111 : const char *src, ssize_t srclen,
112 : pg_locale_t locale);
113 : static size_t strtitle_libc_mb(char *dest, size_t destsize,
114 : const char *src, ssize_t srclen,
115 : pg_locale_t locale);
116 : static size_t strupper_libc_sb(char *dest, size_t destsize,
117 : const char *src, ssize_t srclen,
118 : pg_locale_t locale);
119 : static size_t strupper_libc_mb(char *dest, size_t destsize,
120 : const char *src, ssize_t srclen,
121 : pg_locale_t locale);
122 :
123 : static bool
124 0 : wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
125 : {
126 0 : return isdigit_l((unsigned char) wc, locale->lt);
127 : }
128 :
129 : static bool
130 0 : wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
131 : {
132 0 : return isalpha_l((unsigned char) wc, locale->lt);
133 : }
134 :
135 : static bool
136 0 : wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
137 : {
138 0 : return isalnum_l((unsigned char) wc, locale->lt);
139 : }
140 :
141 : static bool
142 0 : wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
143 : {
144 0 : return isupper_l((unsigned char) wc, locale->lt);
145 : }
146 :
147 : static bool
148 0 : wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
149 : {
150 0 : return islower_l((unsigned char) wc, locale->lt);
151 : }
152 :
153 : static bool
154 0 : wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
155 : {
156 0 : return isgraph_l((unsigned char) wc, locale->lt);
157 : }
158 :
159 : static bool
160 0 : wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
161 : {
162 0 : return isprint_l((unsigned char) wc, locale->lt);
163 : }
164 :
165 : static bool
166 0 : wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
167 : {
168 0 : return ispunct_l((unsigned char) wc, locale->lt);
169 : }
170 :
171 : static bool
172 0 : wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
173 : {
174 0 : return isspace_l((unsigned char) wc, locale->lt);
175 : }
176 :
177 : static bool
178 0 : wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
179 : {
180 : #ifndef WIN32
181 0 : return isxdigit_l((unsigned char) wc, locale->lt);
182 : #else
183 : return _isxdigit_l((unsigned char) wc, locale->lt);
184 : #endif
185 : }
186 :
187 : static bool
188 131608 : wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
189 : {
190 131608 : return iswdigit_l((wint_t) wc, locale->lt);
191 : }
192 :
193 : static bool
194 81148 : wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
195 : {
196 81148 : return iswalpha_l((wint_t) wc, locale->lt);
197 : }
198 :
199 : static bool
200 2845670 : wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
201 : {
202 2845670 : return iswalnum_l((wint_t) wc, locale->lt);
203 : }
204 :
205 : static bool
206 4112 : wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
207 : {
208 4112 : return iswupper_l((wint_t) wc, locale->lt);
209 : }
210 :
211 : static bool
212 4102 : wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
213 : {
214 4102 : return iswlower_l((wint_t) wc, locale->lt);
215 : }
216 :
217 : static bool
218 4102 : wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
219 : {
220 4102 : return iswgraph_l((wint_t) wc, locale->lt);
221 : }
222 :
223 : static bool
224 4102 : wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
225 : {
226 4102 : return iswprint_l((wint_t) wc, locale->lt);
227 : }
228 :
229 : static bool
230 4102 : wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
231 : {
232 4102 : return iswpunct_l((wint_t) wc, locale->lt);
233 : }
234 :
235 : static bool
236 48152 : wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
237 : {
238 48152 : return iswspace_l((wint_t) wc, locale->lt);
239 : }
240 :
241 : static bool
242 12 : wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
243 : {
244 : #ifndef WIN32
245 12 : return iswxdigit_l((wint_t) wc, locale->lt);
246 : #else
247 : return _iswxdigit_l((wint_t) wc, locale->lt);
248 : #endif
249 : }
250 :
251 : static char
252 0 : char_tolower_libc(unsigned char ch, pg_locale_t locale)
253 : {
254 : Assert(pg_database_encoding_max_length() == 1);
255 0 : return tolower_l(ch, locale->lt);
256 : }
257 :
258 : static bool
259 0 : char_is_cased_libc(char ch, pg_locale_t locale)
260 : {
261 0 : bool is_multibyte = pg_database_encoding_max_length() > 1;
262 :
263 0 : if (is_multibyte && IS_HIGHBIT_SET(ch))
264 0 : return true;
265 : else
266 0 : return isalpha_l((unsigned char) ch, locale->lt);
267 : }
268 :
269 : static pg_wchar
270 0 : toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
271 : {
272 : Assert(GetDatabaseEncoding() != PG_UTF8);
273 :
274 : /* force C behavior for ASCII characters, per comments above */
275 0 : if (locale->is_default && wc <= (pg_wchar) 127)
276 0 : return pg_ascii_toupper((unsigned char) wc);
277 0 : if (wc <= (pg_wchar) UCHAR_MAX)
278 0 : return toupper_l((unsigned char) wc, locale->lt);
279 : else
280 0 : return wc;
281 : }
282 :
283 : static pg_wchar
284 9088 : toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
285 : {
286 : Assert(GetDatabaseEncoding() == PG_UTF8);
287 :
288 : /* force C behavior for ASCII characters, per comments above */
289 9088 : if (locale->is_default && wc <= (pg_wchar) 127)
290 892 : return pg_ascii_toupper((unsigned char) wc);
291 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
292 8196 : return towupper_l((wint_t) wc, locale->lt);
293 : else
294 : return wc;
295 : }
296 :
297 : static pg_wchar
298 0 : tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
299 : {
300 : Assert(GetDatabaseEncoding() != PG_UTF8);
301 :
302 : /* force C behavior for ASCII characters, per comments above */
303 0 : if (locale->is_default && wc <= (pg_wchar) 127)
304 0 : return pg_ascii_tolower((unsigned char) wc);
305 0 : if (wc <= (pg_wchar) UCHAR_MAX)
306 0 : return tolower_l((unsigned char) wc, locale->lt);
307 : else
308 0 : return wc;
309 : }
310 :
311 : static pg_wchar
312 9092 : tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
313 : {
314 : Assert(GetDatabaseEncoding() == PG_UTF8);
315 :
316 : /* force C behavior for ASCII characters, per comments above */
317 9092 : if (locale->is_default && wc <= (pg_wchar) 127)
318 896 : return pg_ascii_tolower((unsigned char) wc);
319 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
320 8196 : return towlower_l((wint_t) wc, locale->lt);
321 : else
322 : return wc;
323 : }
324 :
325 : static const struct ctype_methods ctype_methods_libc_sb = {
326 : .strlower = strlower_libc_sb,
327 : .strtitle = strtitle_libc_sb,
328 : .strupper = strupper_libc_sb,
329 : .wc_isdigit = wc_isdigit_libc_sb,
330 : .wc_isalpha = wc_isalpha_libc_sb,
331 : .wc_isalnum = wc_isalnum_libc_sb,
332 : .wc_isupper = wc_isupper_libc_sb,
333 : .wc_islower = wc_islower_libc_sb,
334 : .wc_isgraph = wc_isgraph_libc_sb,
335 : .wc_isprint = wc_isprint_libc_sb,
336 : .wc_ispunct = wc_ispunct_libc_sb,
337 : .wc_isspace = wc_isspace_libc_sb,
338 : .wc_isxdigit = wc_isxdigit_libc_sb,
339 : .char_is_cased = char_is_cased_libc,
340 : .char_tolower = char_tolower_libc,
341 : .wc_toupper = toupper_libc_sb,
342 : .wc_tolower = tolower_libc_sb,
343 : .max_chr = UCHAR_MAX,
344 : };
345 :
346 : /*
347 : * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
348 : * single-byte semantics for pattern matching.
349 : */
350 : static const struct ctype_methods ctype_methods_libc_other_mb = {
351 : .strlower = strlower_libc_mb,
352 : .strtitle = strtitle_libc_mb,
353 : .strupper = strupper_libc_mb,
354 : .wc_isdigit = wc_isdigit_libc_sb,
355 : .wc_isalpha = wc_isalpha_libc_sb,
356 : .wc_isalnum = wc_isalnum_libc_sb,
357 : .wc_isupper = wc_isupper_libc_sb,
358 : .wc_islower = wc_islower_libc_sb,
359 : .wc_isgraph = wc_isgraph_libc_sb,
360 : .wc_isprint = wc_isprint_libc_sb,
361 : .wc_ispunct = wc_ispunct_libc_sb,
362 : .wc_isspace = wc_isspace_libc_sb,
363 : .wc_isxdigit = wc_isxdigit_libc_sb,
364 : .char_is_cased = char_is_cased_libc,
365 : .char_tolower = char_tolower_libc,
366 : .wc_toupper = toupper_libc_sb,
367 : .wc_tolower = tolower_libc_sb,
368 : .max_chr = UCHAR_MAX,
369 : };
370 :
371 : static const struct ctype_methods ctype_methods_libc_utf8 = {
372 : .strlower = strlower_libc_mb,
373 : .strtitle = strtitle_libc_mb,
374 : .strupper = strupper_libc_mb,
375 : .wc_isdigit = wc_isdigit_libc_mb,
376 : .wc_isalpha = wc_isalpha_libc_mb,
377 : .wc_isalnum = wc_isalnum_libc_mb,
378 : .wc_isupper = wc_isupper_libc_mb,
379 : .wc_islower = wc_islower_libc_mb,
380 : .wc_isgraph = wc_isgraph_libc_mb,
381 : .wc_isprint = wc_isprint_libc_mb,
382 : .wc_ispunct = wc_ispunct_libc_mb,
383 : .wc_isspace = wc_isspace_libc_mb,
384 : .wc_isxdigit = wc_isxdigit_libc_mb,
385 : .char_is_cased = char_is_cased_libc,
386 : .char_tolower = char_tolower_libc,
387 : .wc_toupper = toupper_libc_mb,
388 : .wc_tolower = tolower_libc_mb,
389 : };
390 :
391 : static const struct collate_methods collate_methods_libc = {
392 : .strncoll = strncoll_libc,
393 : .strnxfrm = strnxfrm_libc,
394 : .strnxfrm_prefix = NULL,
395 :
396 : /*
397 : * Unfortunately, it seems that strxfrm() for non-C collations is broken
398 : * on many common platforms; testing of multiple versions of glibc reveals
399 : * that, for many locales, strcoll() and strxfrm() do not return
400 : * consistent results. While no other libc other than Cygwin has so far
401 : * been shown to have a problem, we take the conservative course of action
402 : * for right now and disable this categorically. (Users who are certain
403 : * this isn't a problem on their system can define TRUST_STRXFRM.)
404 : */
405 : #ifdef TRUST_STRXFRM
406 : .strxfrm_is_safe = true,
407 : #else
408 : .strxfrm_is_safe = false,
409 : #endif
410 : };
411 :
412 : #ifdef WIN32
413 : static const struct collate_methods collate_methods_libc_win32_utf8 = {
414 : .strncoll = strncoll_libc_win32_utf8,
415 : .strnxfrm = strnxfrm_libc,
416 : .strnxfrm_prefix = NULL,
417 : #ifdef TRUST_STRXFRM
418 : .strxfrm_is_safe = true,
419 : #else
420 : .strxfrm_is_safe = false,
421 : #endif
422 : };
423 : #endif
424 :
425 : static size_t
426 0 : strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
427 : pg_locale_t locale)
428 : {
429 0 : if (srclen < 0)
430 0 : srclen = strlen(src);
431 :
432 0 : if (srclen + 1 <= destsize)
433 : {
434 0 : locale_t loc = locale->lt;
435 : char *p;
436 :
437 0 : if (srclen + 1 > destsize)
438 0 : return srclen;
439 :
440 0 : memcpy(dest, src, srclen);
441 0 : dest[srclen] = '\0';
442 :
443 : /*
444 : * Note: we assume that tolower_l() will not be so broken as to need
445 : * an isupper_l() guard test. When using the default collation, we
446 : * apply the traditional Postgres behavior that forces ASCII-style
447 : * treatment of I/i, but in non-default collations you get exactly
448 : * what the collation says.
449 : */
450 0 : for (p = dest; *p; p++)
451 : {
452 0 : if (locale->is_default)
453 0 : *p = pg_tolower((unsigned char) *p);
454 : else
455 0 : *p = tolower_l((unsigned char) *p, loc);
456 : }
457 : }
458 :
459 0 : return srclen;
460 : }
461 :
462 : static size_t
463 424990 : strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
464 : pg_locale_t locale)
465 : {
466 424990 : locale_t loc = locale->lt;
467 : size_t result_size;
468 : wchar_t *workspace;
469 : char *result;
470 : size_t curr_char;
471 : size_t max_size;
472 :
473 424990 : if (srclen < 0)
474 0 : srclen = strlen(src);
475 :
476 : /* Overflow paranoia */
477 424990 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
478 0 : ereport(ERROR,
479 : (errcode(ERRCODE_OUT_OF_MEMORY),
480 : errmsg("out of memory")));
481 :
482 : /* Output workspace cannot have more codes than input bytes */
483 424990 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
484 :
485 424990 : char2wchar(workspace, srclen + 1, src, srclen, loc);
486 :
487 3669294 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
488 3244304 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
489 :
490 : /*
491 : * Make result large enough; case change might change number of bytes
492 : */
493 424990 : max_size = curr_char * pg_database_encoding_max_length();
494 424990 : result = palloc(max_size + 1);
495 :
496 424990 : result_size = wchar2char(result, workspace, max_size + 1, loc);
497 :
498 424990 : if (result_size + 1 > destsize)
499 0 : return result_size;
500 :
501 424990 : memcpy(dest, result, result_size);
502 424990 : dest[result_size] = '\0';
503 :
504 424990 : pfree(workspace);
505 424990 : pfree(result);
506 :
507 424990 : return result_size;
508 : }
509 :
510 : static size_t
511 0 : strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
512 : pg_locale_t locale)
513 : {
514 0 : if (srclen < 0)
515 0 : srclen = strlen(src);
516 :
517 0 : if (srclen + 1 <= destsize)
518 : {
519 0 : locale_t loc = locale->lt;
520 0 : int wasalnum = false;
521 : char *p;
522 :
523 0 : memcpy(dest, src, srclen);
524 0 : dest[srclen] = '\0';
525 :
526 : /*
527 : * Note: we assume that toupper_l()/tolower_l() will not be so broken
528 : * as to need guard tests. When using the default collation, we apply
529 : * the traditional Postgres behavior that forces ASCII-style treatment
530 : * of I/i, but in non-default collations you get exactly what the
531 : * collation says.
532 : */
533 0 : for (p = dest; *p; p++)
534 : {
535 0 : if (locale->is_default)
536 : {
537 0 : if (wasalnum)
538 0 : *p = pg_tolower((unsigned char) *p);
539 : else
540 0 : *p = pg_toupper((unsigned char) *p);
541 : }
542 : else
543 : {
544 0 : if (wasalnum)
545 0 : *p = tolower_l((unsigned char) *p, loc);
546 : else
547 0 : *p = toupper_l((unsigned char) *p, loc);
548 : }
549 0 : wasalnum = isalnum_l((unsigned char) *p, loc);
550 : }
551 : }
552 :
553 0 : return srclen;
554 : }
555 :
556 : static size_t
557 8 : strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
558 : pg_locale_t locale)
559 : {
560 8 : locale_t loc = locale->lt;
561 8 : int wasalnum = false;
562 : size_t result_size;
563 : wchar_t *workspace;
564 : char *result;
565 : size_t curr_char;
566 : size_t max_size;
567 :
568 8 : if (srclen < 0)
569 0 : srclen = strlen(src);
570 :
571 : /* Overflow paranoia */
572 8 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
573 0 : ereport(ERROR,
574 : (errcode(ERRCODE_OUT_OF_MEMORY),
575 : errmsg("out of memory")));
576 :
577 : /* Output workspace cannot have more codes than input bytes */
578 8 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
579 :
580 8 : char2wchar(workspace, srclen + 1, src, srclen, loc);
581 :
582 80 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
583 : {
584 72 : if (wasalnum)
585 56 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
586 : else
587 16 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
588 72 : wasalnum = iswalnum_l(workspace[curr_char], loc);
589 : }
590 :
591 : /*
592 : * Make result large enough; case change might change number of bytes
593 : */
594 8 : max_size = curr_char * pg_database_encoding_max_length();
595 8 : result = palloc(max_size + 1);
596 :
597 8 : result_size = wchar2char(result, workspace, max_size + 1, loc);
598 :
599 8 : if (result_size + 1 > destsize)
600 0 : return result_size;
601 :
602 8 : memcpy(dest, result, result_size);
603 8 : dest[result_size] = '\0';
604 :
605 8 : pfree(workspace);
606 8 : pfree(result);
607 :
608 8 : return result_size;
609 : }
610 :
611 : static size_t
612 0 : strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
613 : pg_locale_t locale)
614 : {
615 0 : if (srclen < 0)
616 0 : srclen = strlen(src);
617 :
618 0 : if (srclen + 1 <= destsize)
619 : {
620 0 : locale_t loc = locale->lt;
621 : char *p;
622 :
623 0 : memcpy(dest, src, srclen);
624 0 : dest[srclen] = '\0';
625 :
626 : /*
627 : * Note: we assume that toupper_l() will not be so broken as to need
628 : * an islower_l() guard test. When using the default collation, we
629 : * apply the traditional Postgres behavior that forces ASCII-style
630 : * treatment of I/i, but in non-default collations you get exactly
631 : * what the collation says.
632 : */
633 0 : for (p = dest; *p; p++)
634 : {
635 0 : if (locale->is_default)
636 0 : *p = pg_toupper((unsigned char) *p);
637 : else
638 0 : *p = toupper_l((unsigned char) *p, loc);
639 : }
640 : }
641 :
642 0 : return srclen;
643 : }
644 :
645 : static size_t
646 719780 : strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
647 : pg_locale_t locale)
648 : {
649 719780 : locale_t loc = locale->lt;
650 : size_t result_size;
651 : wchar_t *workspace;
652 : char *result;
653 : size_t curr_char;
654 : size_t max_size;
655 :
656 719780 : if (srclen < 0)
657 0 : srclen = strlen(src);
658 :
659 : /* Overflow paranoia */
660 719780 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
661 0 : ereport(ERROR,
662 : (errcode(ERRCODE_OUT_OF_MEMORY),
663 : errmsg("out of memory")));
664 :
665 : /* Output workspace cannot have more codes than input bytes */
666 719780 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
667 :
668 719780 : char2wchar(workspace, srclen + 1, src, srclen, loc);
669 :
670 2371034 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
671 1651254 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
672 :
673 : /*
674 : * Make result large enough; case change might change number of bytes
675 : */
676 719780 : max_size = curr_char * pg_database_encoding_max_length();
677 719780 : result = palloc(max_size + 1);
678 :
679 719780 : result_size = wchar2char(result, workspace, max_size + 1, loc);
680 :
681 719780 : if (result_size + 1 > destsize)
682 0 : return result_size;
683 :
684 719780 : memcpy(dest, result, result_size);
685 719780 : dest[result_size] = '\0';
686 :
687 719780 : pfree(workspace);
688 719780 : pfree(result);
689 :
690 719780 : return result_size;
691 : }
692 :
693 : pg_locale_t
694 31746 : create_pg_locale_libc(Oid collid, MemoryContext context)
695 : {
696 : const char *collate;
697 : const char *ctype;
698 : locale_t loc;
699 : pg_locale_t result;
700 :
701 31746 : if (collid == DEFAULT_COLLATION_OID)
702 : {
703 : HeapTuple tp;
704 : Datum datum;
705 :
706 31656 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
707 31656 : if (!HeapTupleIsValid(tp))
708 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
709 31656 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
710 : Anum_pg_database_datcollate);
711 31656 : collate = TextDatumGetCString(datum);
712 31656 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
713 : Anum_pg_database_datctype);
714 31656 : ctype = TextDatumGetCString(datum);
715 :
716 31656 : ReleaseSysCache(tp);
717 : }
718 : else
719 : {
720 : HeapTuple tp;
721 : Datum datum;
722 :
723 90 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
724 90 : if (!HeapTupleIsValid(tp))
725 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
726 :
727 90 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
728 : Anum_pg_collation_collcollate);
729 90 : collate = TextDatumGetCString(datum);
730 90 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
731 : Anum_pg_collation_collctype);
732 90 : ctype = TextDatumGetCString(datum);
733 :
734 90 : ReleaseSysCache(tp);
735 : }
736 :
737 :
738 31746 : loc = make_libc_collator(collate, ctype);
739 :
740 31746 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
741 31746 : result->deterministic = true;
742 62358 : result->collate_is_c = (strcmp(collate, "C") == 0) ||
743 30612 : (strcmp(collate, "POSIX") == 0);
744 62358 : result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
745 30612 : (strcmp(ctype, "POSIX") == 0);
746 31746 : result->lt = loc;
747 31746 : if (!result->collate_is_c)
748 : {
749 : #ifdef WIN32
750 : if (GetDatabaseEncoding() == PG_UTF8)
751 : result->collate = &collate_methods_libc_win32_utf8;
752 : else
753 : #endif
754 30548 : result->collate = &collate_methods_libc;
755 : }
756 31746 : if (!result->ctype_is_c)
757 : {
758 30548 : if (GetDatabaseEncoding() == PG_UTF8)
759 30484 : result->ctype = &ctype_methods_libc_utf8;
760 64 : else if (pg_database_encoding_max_length() > 1)
761 0 : result->ctype = &ctype_methods_libc_other_mb;
762 : else
763 64 : result->ctype = &ctype_methods_libc_sb;
764 : }
765 :
766 31746 : return result;
767 : }
768 :
769 : /*
770 : * Create a locale_t with the given collation and ctype.
771 : *
772 : * The "C" and "POSIX" locales are not actually handled by libc, so return
773 : * NULL.
774 : *
775 : * Ensure that no path leaks a locale_t.
776 : */
777 : static locale_t
778 31746 : make_libc_collator(const char *collate, const char *ctype)
779 : {
780 31746 : locale_t loc = 0;
781 :
782 31746 : if (strcmp(collate, ctype) == 0)
783 : {
784 31746 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
785 : {
786 : /* Normal case where they're the same */
787 30548 : errno = 0;
788 : #ifndef WIN32
789 30548 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
790 : NULL);
791 : #else
792 : loc = _create_locale(LC_ALL, collate);
793 : #endif
794 30548 : if (!loc)
795 0 : report_newlocale_failure(collate);
796 : }
797 : }
798 : else
799 : {
800 : #ifndef WIN32
801 : /* We need two newlocale() steps */
802 0 : locale_t loc1 = 0;
803 :
804 0 : if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
805 : {
806 0 : errno = 0;
807 0 : loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
808 0 : if (!loc1)
809 0 : report_newlocale_failure(collate);
810 : }
811 :
812 0 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
813 : {
814 0 : errno = 0;
815 0 : loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
816 0 : if (!loc)
817 : {
818 0 : if (loc1)
819 0 : freelocale(loc1);
820 0 : report_newlocale_failure(ctype);
821 : }
822 : }
823 : else
824 0 : loc = loc1;
825 : #else
826 :
827 : /*
828 : * XXX The _create_locale() API doesn't appear to support this. Could
829 : * perhaps be worked around by changing pg_locale_t to contain two
830 : * separate fields.
831 : */
832 : ereport(ERROR,
833 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
834 : errmsg("collations with different collate and ctype values are not supported on this platform")));
835 : #endif
836 : }
837 :
838 31746 : return loc;
839 : }
840 :
841 : /*
842 : * strncoll_libc
843 : *
844 : * NUL-terminate arguments, if necessary, and pass to strcoll_l().
845 : *
846 : * An input string length of -1 means that it's already NUL-terminated.
847 : */
848 : int
849 29640266 : strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
850 : pg_locale_t locale)
851 : {
852 : char sbuf[TEXTBUFLEN];
853 29640266 : char *buf = sbuf;
854 29640266 : size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
855 29640266 : size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
856 : const char *arg1n;
857 : const char *arg2n;
858 : int result;
859 :
860 29640266 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
861 568 : buf = palloc(bufsize1 + bufsize2);
862 :
863 : /* nul-terminate arguments if necessary */
864 29640266 : if (len1 == -1)
865 : {
866 25232144 : arg1n = arg1;
867 : }
868 : else
869 : {
870 4408122 : char *buf1 = buf;
871 :
872 4408122 : memcpy(buf1, arg1, len1);
873 4408122 : buf1[len1] = '\0';
874 4408122 : arg1n = buf1;
875 : }
876 :
877 29640266 : if (len2 == -1)
878 : {
879 25232144 : arg2n = arg2;
880 : }
881 : else
882 : {
883 4408122 : char *buf2 = buf + bufsize1;
884 :
885 4408122 : memcpy(buf2, arg2, len2);
886 4408122 : buf2[len2] = '\0';
887 4408122 : arg2n = buf2;
888 : }
889 :
890 29640266 : result = strcoll_l(arg1n, arg2n, locale->lt);
891 :
892 29640266 : if (buf != sbuf)
893 568 : pfree(buf);
894 :
895 29640266 : return result;
896 : }
897 :
898 : /*
899 : * strnxfrm_libc
900 : *
901 : * NUL-terminate src, if necessary, and pass to strxfrm_l().
902 : *
903 : * A source length of -1 means that it's already NUL-terminated.
904 : */
905 : size_t
906 144 : strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
907 : pg_locale_t locale)
908 : {
909 : char sbuf[TEXTBUFLEN];
910 144 : char *buf = sbuf;
911 144 : size_t bufsize = srclen + 1;
912 : size_t result;
913 :
914 144 : if (srclen == -1)
915 144 : return strxfrm_l(dest, src, destsize, locale->lt);
916 :
917 0 : if (bufsize > TEXTBUFLEN)
918 0 : buf = palloc(bufsize);
919 :
920 : /* nul-terminate argument */
921 0 : memcpy(buf, src, srclen);
922 0 : buf[srclen] = '\0';
923 :
924 0 : result = strxfrm_l(dest, buf, destsize, locale->lt);
925 :
926 0 : if (buf != sbuf)
927 0 : pfree(buf);
928 :
929 : /* if dest is defined, it should be nul-terminated */
930 : Assert(result >= destsize || dest[result] == '\0');
931 :
932 0 : return result;
933 : }
934 :
/*
 * get_collation_actual_version_libc
 *
 * Return a palloc'd version string for the libc collation named by
 * collcollate, or NULL when no version is available.
 *
 * "C", anything starting with "C.", and "POSIX" (all compared
 * case-insensitively) have no version.  For other names the source of the
 * version depends on the platform: the glibc version string, the FreeBSD
 * querylocale() collation version, or the Windows NLS version.  On other
 * platforms NULL is returned.
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *collversion = NULL;

	/* C-like locales are never versioned */
	if (pg_strcasecmp("C", collcollate) == 0 ||
		pg_strncasecmp("C.", collcollate, 2) == 0 ||
		pg_strcasecmp("POSIX", collcollate) == 0)
		return NULL;

#if defined(__GLIBC__)
	/* Use the glibc version because we don't have anything better. */
	collversion = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
	{
		/* Look up FreeBSD collation version. */
		locale_t	loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);

		if (!loc)
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));

		collversion =
			pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
		freelocale(loc);
	}
#elif defined(WIN32)
	{
		/*
		 * If we are targeting Windows Vista and above, we can ask for a name
		 * given a collation name (earlier versions required a location code
		 * that we don't have).
		 */
		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];

		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
							LOCALE_NAME_MAX_LENGTH);
		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
		{
			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until those
			 * values can be prevented from entering the system, or 100%
			 * reliably converted to the more useful tag format, tolerate the
			 * resulting error and report that we have no version data.
			 */
			if (GetLastError() == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							GetLastError())));
		}
		collversion = psprintf("%lu.%lu,%lu.%lu",
							   (version.dwNLSVersion >> 8) & 0xFFFF,
							   version.dwNLSVersion & 0xFF,
							   (version.dwDefinedVersion >> 8) & 0xFFFF,
							   version.dwDefinedVersion & 0xFF);
	}
#endif

	return collversion;
}
999 :
/*
 * strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8.  Convert UTF8 arguments to wide characters and
 * invoke wcscoll_l().
 *
 * An input string length of -1 means that it's NUL-terminated.
 *
 * Returns the value produced by wcscoll_l().  Conversion failures, a
 * comparison failure, and inputs too long to be converted are all reported
 * with ereport(ERROR).
 */
#ifdef WIN32
static int
strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
						 ssize_t len2, pg_locale_t locale)
{
	char		sbuf[TEXTBUFLEN];
	char	   *buf = sbuf;
	char	   *a1p,
			   *a2p;
	size_t		a1len;
	size_t		a2len;
	int			r;
	int			result;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	if (len1 == -1)
		len1 = strlen(arg1);
	if (len2 == -1)
		len2 = strlen(arg2);

	/*
	 * MultiByteToWideChar() takes int lengths, and the buffer sizes computed
	 * below are twice the input length plus a terminator.  Reject inputs for
	 * which that would not fit in an int, so that none of the size
	 * arithmetic below can wrap around.
	 */
	if (len1 > INT_MAX / 2 - 1 || len2 > INT_MAX / 2 - 1)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("string is too long for conversion to UTF-16")));

	/*
	 * Compute the sizes in size_t.  With int arithmetic the sum could
	 * overflow, making the comparison against TEXTBUFLEN false and letting
	 * the conversions below write past the end of the stack buffer.
	 */
	a1len = (size_t) len1 * 2 + 2;
	a2len = (size_t) len2 * 2 + 2;

	if (a1len + a2len > TEXTBUFLEN)
		buf = palloc(a1len + a2len);

	a1p = buf;
	a2p = buf + a1len;

	/* API does not work for zero-length input */
	if (len1 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg1, (int) len1,
								(LPWSTR) a1p, (int) (a1len / 2));
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	/* MultiByteToWideChar() was given no terminator, so add one */
	((LPWSTR) a1p)[r] = 0;

	if (len2 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg2, (int) len2,
								(LPWSTR) a2p, (int) (a2len / 2));
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a2p)[r] = 0;

	/* clear errno so that %m below reflects only the comparison itself */
	errno = 0;
	result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->lt);
	if (result == INT_MAX)		/* _NLSCMPERROR; missing from mingw headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (buf != sbuf)
		pfree(buf);

	return result;
}
#endif							/* WIN32 */
1077 :
1078 : /* simple subroutine for reporting errors from newlocale() */
1079 : void
1080 0 : report_newlocale_failure(const char *localename)
1081 : {
1082 : int save_errno;
1083 :
1084 : /*
1085 : * Windows doesn't provide any useful error indication from
1086 : * _create_locale(), and BSD-derived platforms don't seem to feel they
1087 : * need to set errno either (even though POSIX is pretty clear that
1088 : * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1089 : * is what to report.
1090 : */
1091 0 : if (errno == 0)
1092 0 : errno = ENOENT;
1093 :
1094 : /*
1095 : * ENOENT means "no such locale", not "no such file", so clarify that
1096 : * errno with an errdetail message.
1097 : */
1098 0 : save_errno = errno; /* auxiliary funcs might change errno */
1099 0 : ereport(ERROR,
1100 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1101 : errmsg("could not create locale \"%s\": %m",
1102 : localename),
1103 : (save_errno == ENOENT ?
1104 : errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1105 : localename) : 0)));
1106 : }
1107 :
1108 : /*
1109 : * POSIX doesn't define _l-variants of these functions, but several systems
1110 : * have them. We provide our own replacements here.
1111 : */
1112 : #ifndef HAVE_MBSTOWCS_L
1113 : static size_t
1114 1144778 : mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
1115 : {
1116 : #ifdef WIN32
1117 : return _mbstowcs_l(dest, src, n, loc);
1118 : #else
1119 : size_t result;
1120 1144778 : locale_t save_locale = uselocale(loc);
1121 :
1122 1144778 : result = mbstowcs(dest, src, n);
1123 1144778 : uselocale(save_locale);
1124 1144778 : return result;
1125 : #endif
1126 : }
1127 : #endif
1128 : #ifndef HAVE_WCSTOMBS_L
1129 : static size_t
1130 1144778 : wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
1131 : {
1132 : #ifdef WIN32
1133 : return _wcstombs_l(dest, src, n, loc);
1134 : #else
1135 : size_t result;
1136 1144778 : locale_t save_locale = uselocale(loc);
1137 :
1138 1144778 : result = wcstombs(dest, src, n);
1139 1144778 : uselocale(save_locale);
1140 1144778 : return result;
1141 : #endif
1142 : }
1143 : #endif
1144 :
1145 : /*
1146 : * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1147 : * Therefore we keep them here rather than with the mbutils code.
1148 : */
1149 :
1150 : /*
1151 : * wchar2char --- convert wide characters to multibyte format
1152 : *
1153 : * This has the same API as the standard wcstombs_l() function; in particular,
1154 : * tolen is the maximum number of bytes to store at *to, and *from must be
1155 : * zero-terminated. The output will be zero-terminated iff there is room.
1156 : */
1157 : size_t
1158 1144778 : wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1159 : {
1160 : size_t result;
1161 :
1162 1144778 : if (tolen == 0)
1163 0 : return 0;
1164 :
1165 : #ifdef WIN32
1166 :
1167 : /*
1168 : * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1169 : * for some reason mbstowcs and wcstombs won't do this for us, so we use
1170 : * MultiByteToWideChar().
1171 : */
1172 : if (GetDatabaseEncoding() == PG_UTF8)
1173 : {
1174 : result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1175 : NULL, NULL);
1176 : /* A zero return is failure */
1177 : if (result <= 0)
1178 : result = -1;
1179 : else
1180 : {
1181 : Assert(result <= tolen);
1182 : /* Microsoft counts the zero terminator in the result */
1183 : result--;
1184 : }
1185 : }
1186 : else
1187 : #endif /* WIN32 */
1188 1144778 : if (loc == (locale_t) 0)
1189 : {
1190 : /* Use wcstombs directly for the default locale */
1191 0 : result = wcstombs(to, from, tolen);
1192 : }
1193 : else
1194 : {
1195 : /* Use wcstombs_l for nondefault locales */
1196 1144778 : result = wcstombs_l(to, from, tolen, loc);
1197 : }
1198 :
1199 1144778 : return result;
1200 : }
1201 :
1202 : /*
1203 : * char2wchar --- convert multibyte characters to wide characters
1204 : *
1205 : * This has almost the API of mbstowcs_l(), except that *from need not be
1206 : * null-terminated; instead, the number of input bytes is specified as
1207 : * fromlen. Also, we ereport() rather than returning -1 for invalid
1208 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1209 : * The output will be zero-terminated iff there is room.
1210 : */
1211 : static size_t
1212 1144778 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1213 : locale_t loc)
1214 : {
1215 : size_t result;
1216 :
1217 1144778 : if (tolen == 0)
1218 0 : return 0;
1219 :
1220 : #ifdef WIN32
1221 : /* See WIN32 "Unicode" comment above */
1222 : if (GetDatabaseEncoding() == PG_UTF8)
1223 : {
1224 : /* Win32 API does not work for zero-length input */
1225 : if (fromlen == 0)
1226 : result = 0;
1227 : else
1228 : {
1229 : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1230 : /* A zero return is failure */
1231 : if (result == 0)
1232 : result = -1;
1233 : }
1234 :
1235 : if (result != -1)
1236 : {
1237 : Assert(result < tolen);
1238 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
1239 : to[result] = 0;
1240 : }
1241 : }
1242 : else
1243 : #endif /* WIN32 */
1244 : {
1245 : /* mbstowcs requires ending '\0' */
1246 1144778 : char *str = pnstrdup(from, fromlen);
1247 :
1248 1144778 : if (loc == (locale_t) 0)
1249 : {
1250 : /* Use mbstowcs directly for the default locale */
1251 0 : result = mbstowcs(to, str, tolen);
1252 : }
1253 : else
1254 : {
1255 : /* Use mbstowcs_l for nondefault locales */
1256 1144778 : result = mbstowcs_l(to, str, tolen, loc);
1257 : }
1258 :
1259 1144778 : pfree(str);
1260 : }
1261 :
1262 1144778 : if (result == -1)
1263 : {
1264 : /*
1265 : * Invalid multibyte character encountered. We try to give a useful
1266 : * error message by letting pg_verifymbstr check the string. But it's
1267 : * possible that the string is OK to us, and not OK to mbstowcs ---
1268 : * this suggests that the LC_CTYPE locale is different from the
1269 : * database encoding. Give a generic error message if pg_verifymbstr
1270 : * can't find anything wrong.
1271 : */
1272 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
1273 : /* but if it does ... */
1274 0 : ereport(ERROR,
1275 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1276 : errmsg("invalid multibyte character for locale"),
1277 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1278 : }
1279 :
1280 1144778 : return result;
1281 : }
|