Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for libc
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_libc.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include <limits.h>
15 : #include <wctype.h>
16 :
17 : #include "access/htup_details.h"
18 : #include "catalog/pg_database.h"
19 : #include "catalog/pg_collation.h"
20 : #include "mb/pg_wchar.h"
21 : #include "miscadmin.h"
22 : #include "utils/builtins.h"
23 : #include "utils/formatting.h"
24 : #include "utils/memutils.h"
25 : #include "utils/pg_locale.h"
26 : #include "utils/syscache.h"
27 :
28 : #ifdef __GLIBC__
29 : #include <gnu/libc-version.h>
30 : #endif
31 :
32 : #ifdef WIN32
33 : #include <shlwapi.h>
34 : #endif
35 :
36 : /*
37 : * For the libc provider, to provide as much functionality as possible on a
38 : * variety of platforms without going so far as to implement everything from
39 : * scratch, we use several implementation strategies depending on the
40 : * situation:
41 : *
42 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 : * collations don't give a fig about multibyte characters.
45 : *
46 : * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 : * This assumes that every platform uses Unicode codepoints directly
48 : * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption
49 : * even for non-UTF8 encodings, which may be a problem.) On some platforms
50 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
51 : *
52 : * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
53 : * values up to 255, and punt for values above that. This is 100% correct
54 : * only in single-byte encodings such as LATINn. However, non-Unicode
55 : * multibyte encodings are mostly Far Eastern character sets for which the
56 : * properties being tested here aren't very relevant for higher code values
57 : * anyway. The difficulty with using the <wctype.h> functions with
58 : * non-Unicode multibyte encodings is that we can have no certainty that
59 : * the platform's wchar_t representation matches what we do in pg_wchar
60 : * conversions.
61 : *
62 : * As a special case, in the "default" collation, (2) and (3) force ASCII
63 : * letters to follow ASCII upcase/downcase rules, while in a non-default
64 : * collation we just let the library functions do what they will. The case
65 : * where this matters is treatment of I/i in Turkish, and the behavior is
66 : * meant to match the upper()/lower() SQL functions.
67 : *
68 : * We store the active collation setting in static variables. In principle
69 : * it could be passed down to here via the regex library's "struct vars" data
70 : * structure; but that would require somewhat invasive changes in the regex
71 : * library, and right now there's no real benefit to be gained from that.
72 : *
73 : * NB: the coding here assumes pg_wchar is an unsigned type.
74 : */
75 :
76 : /*
77 : * Size of stack buffer to use for string transformations, used to avoid heap
78 : * allocations in typical cases. This should be large enough that most strings
79 : * will fit, but small enough that we feel comfortable putting it on the
80 : * stack.
81 : */
82 : #define TEXTBUFLEN 1024
83 :
84 : extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
85 :
86 : static int strncoll_libc(const char *arg1, ssize_t len1,
87 : const char *arg2, ssize_t len2,
88 : pg_locale_t locale);
89 : static size_t strnxfrm_libc(char *dest, size_t destsize,
90 : const char *src, ssize_t srclen,
91 : pg_locale_t locale);
92 : extern char *get_collation_actual_version_libc(const char *collcollate);
93 : static locale_t make_libc_collator(const char *collate,
94 : const char *ctype);
95 :
96 : #ifdef WIN32
97 : static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
98 : const char *arg2, ssize_t len2,
99 : pg_locale_t locale);
100 : #endif
101 :
102 : static size_t strlower_libc_sb(char *dest, size_t destsize,
103 : const char *src, ssize_t srclen,
104 : pg_locale_t locale);
105 : static size_t strlower_libc_mb(char *dest, size_t destsize,
106 : const char *src, ssize_t srclen,
107 : pg_locale_t locale);
108 : static size_t strtitle_libc_sb(char *dest, size_t destsize,
109 : const char *src, ssize_t srclen,
110 : pg_locale_t locale);
111 : static size_t strtitle_libc_mb(char *dest, size_t destsize,
112 : const char *src, ssize_t srclen,
113 : pg_locale_t locale);
114 : static size_t strupper_libc_sb(char *dest, size_t destsize,
115 : const char *src, ssize_t srclen,
116 : pg_locale_t locale);
117 : static size_t strupper_libc_mb(char *dest, size_t destsize,
118 : const char *src, ssize_t srclen,
119 : pg_locale_t locale);
120 :
121 : static bool
122 0 : wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
123 : {
124 0 : return isdigit_l((unsigned char) wc, locale->info.lt);
125 : }
126 :
127 : static bool
128 0 : wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
129 : {
130 0 : return isalpha_l((unsigned char) wc, locale->info.lt);
131 : }
132 :
133 : static bool
134 0 : wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
135 : {
136 0 : return isalnum_l((unsigned char) wc, locale->info.lt);
137 : }
138 :
139 : static bool
140 0 : wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
141 : {
142 0 : return isupper_l((unsigned char) wc, locale->info.lt);
143 : }
144 :
145 : static bool
146 0 : wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
147 : {
148 0 : return islower_l((unsigned char) wc, locale->info.lt);
149 : }
150 :
151 : static bool
152 0 : wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
153 : {
154 0 : return isgraph_l((unsigned char) wc, locale->info.lt);
155 : }
156 :
157 : static bool
158 0 : wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
159 : {
160 0 : return isprint_l((unsigned char) wc, locale->info.lt);
161 : }
162 :
163 : static bool
164 0 : wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
165 : {
166 0 : return ispunct_l((unsigned char) wc, locale->info.lt);
167 : }
168 :
169 : static bool
170 0 : wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
171 : {
172 0 : return isspace_l((unsigned char) wc, locale->info.lt);
173 : }
174 :
175 : static bool
176 106840 : wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
177 : {
178 106840 : return iswdigit_l((wint_t) wc, locale->info.lt);
179 : }
180 :
181 : static bool
182 12544 : wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
183 : {
184 12544 : return iswalpha_l((wint_t) wc, locale->info.lt);
185 : }
186 :
187 : static bool
188 45076 : wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
189 : {
190 45076 : return iswalnum_l((wint_t) wc, locale->info.lt);
191 : }
192 :
193 : static bool
194 4112 : wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
195 : {
196 4112 : return iswupper_l((wint_t) wc, locale->info.lt);
197 : }
198 :
199 : static bool
200 4102 : wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
201 : {
202 4102 : return iswlower_l((wint_t) wc, locale->info.lt);
203 : }
204 :
205 : static bool
206 4102 : wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
207 : {
208 4102 : return iswgraph_l((wint_t) wc, locale->info.lt);
209 : }
210 :
211 : static bool
212 4102 : wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
213 : {
214 4102 : return iswprint_l((wint_t) wc, locale->info.lt);
215 : }
216 :
217 : static bool
218 4102 : wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
219 : {
220 4102 : return iswpunct_l((wint_t) wc, locale->info.lt);
221 : }
222 :
223 : static bool
224 47700 : wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
225 : {
226 47700 : return iswspace_l((wint_t) wc, locale->info.lt);
227 : }
228 :
229 : static char
230 0 : char_tolower_libc(unsigned char ch, pg_locale_t locale)
231 : {
232 : Assert(pg_database_encoding_max_length() == 1);
233 0 : return tolower_l(ch, locale->info.lt);
234 : }
235 :
236 : static bool
237 0 : char_is_cased_libc(char ch, pg_locale_t locale)
238 : {
239 0 : bool is_multibyte = pg_database_encoding_max_length() > 1;
240 :
241 0 : if (is_multibyte && IS_HIGHBIT_SET(ch))
242 0 : return true;
243 : else
244 0 : return isalpha_l((unsigned char) ch, locale->info.lt);
245 : }
246 :
247 : static pg_wchar
248 0 : toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
249 : {
250 : Assert(GetDatabaseEncoding() != PG_UTF8);
251 :
252 : /* force C behavior for ASCII characters, per comments above */
253 0 : if (locale->is_default && wc <= (pg_wchar) 127)
254 0 : return pg_ascii_toupper((unsigned char) wc);
255 0 : if (wc <= (pg_wchar) UCHAR_MAX)
256 0 : return toupper_l((unsigned char) wc, locale->info.lt);
257 : else
258 0 : return wc;
259 : }
260 :
261 : static pg_wchar
262 9088 : toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
263 : {
264 : Assert(GetDatabaseEncoding() == PG_UTF8);
265 :
266 : /* force C behavior for ASCII characters, per comments above */
267 9088 : if (locale->is_default && wc <= (pg_wchar) 127)
268 892 : return pg_ascii_toupper((unsigned char) wc);
269 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
270 8196 : return towupper_l((wint_t) wc, locale->info.lt);
271 : else
272 : return wc;
273 : }
274 :
275 : static pg_wchar
276 0 : tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
277 : {
278 : Assert(GetDatabaseEncoding() != PG_UTF8);
279 :
280 : /* force C behavior for ASCII characters, per comments above */
281 0 : if (locale->is_default && wc <= (pg_wchar) 127)
282 0 : return pg_ascii_tolower((unsigned char) wc);
283 0 : if (wc <= (pg_wchar) UCHAR_MAX)
284 0 : return tolower_l((unsigned char) wc, locale->info.lt);
285 : else
286 0 : return wc;
287 : }
288 :
289 : static pg_wchar
290 9092 : tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
291 : {
292 : Assert(GetDatabaseEncoding() == PG_UTF8);
293 :
294 : /* force C behavior for ASCII characters, per comments above */
295 9092 : if (locale->is_default && wc <= (pg_wchar) 127)
296 896 : return pg_ascii_tolower((unsigned char) wc);
297 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
298 8196 : return towlower_l((wint_t) wc, locale->info.lt);
299 : else
300 : return wc;
301 : }
302 :
303 : static const struct ctype_methods ctype_methods_libc_sb = {
304 : .strlower = strlower_libc_sb,
305 : .strtitle = strtitle_libc_sb,
306 : .strupper = strupper_libc_sb,
307 : .wc_isdigit = wc_isdigit_libc_sb,
308 : .wc_isalpha = wc_isalpha_libc_sb,
309 : .wc_isalnum = wc_isalnum_libc_sb,
310 : .wc_isupper = wc_isupper_libc_sb,
311 : .wc_islower = wc_islower_libc_sb,
312 : .wc_isgraph = wc_isgraph_libc_sb,
313 : .wc_isprint = wc_isprint_libc_sb,
314 : .wc_ispunct = wc_ispunct_libc_sb,
315 : .wc_isspace = wc_isspace_libc_sb,
316 : .char_is_cased = char_is_cased_libc,
317 : .char_tolower = char_tolower_libc,
318 : .wc_toupper = toupper_libc_sb,
319 : .wc_tolower = tolower_libc_sb,
320 : .max_chr = UCHAR_MAX,
321 : };
322 :
323 : /*
324 : * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
325 : * single-byte semantics for pattern matching.
326 : */
327 : static const struct ctype_methods ctype_methods_libc_other_mb = {
328 : .strlower = strlower_libc_mb,
329 : .strtitle = strtitle_libc_mb,
330 : .strupper = strupper_libc_mb,
331 : .wc_isdigit = wc_isdigit_libc_sb,
332 : .wc_isalpha = wc_isalpha_libc_sb,
333 : .wc_isalnum = wc_isalnum_libc_sb,
334 : .wc_isupper = wc_isupper_libc_sb,
335 : .wc_islower = wc_islower_libc_sb,
336 : .wc_isgraph = wc_isgraph_libc_sb,
337 : .wc_isprint = wc_isprint_libc_sb,
338 : .wc_ispunct = wc_ispunct_libc_sb,
339 : .wc_isspace = wc_isspace_libc_sb,
340 : .char_is_cased = char_is_cased_libc,
341 : .char_tolower = char_tolower_libc,
342 : .wc_toupper = toupper_libc_sb,
343 : .wc_tolower = tolower_libc_sb,
344 : .max_chr = UCHAR_MAX,
345 : };
346 :
347 : static const struct ctype_methods ctype_methods_libc_utf8 = {
348 : .strlower = strlower_libc_mb,
349 : .strtitle = strtitle_libc_mb,
350 : .strupper = strupper_libc_mb,
351 : .wc_isdigit = wc_isdigit_libc_mb,
352 : .wc_isalpha = wc_isalpha_libc_mb,
353 : .wc_isalnum = wc_isalnum_libc_mb,
354 : .wc_isupper = wc_isupper_libc_mb,
355 : .wc_islower = wc_islower_libc_mb,
356 : .wc_isgraph = wc_isgraph_libc_mb,
357 : .wc_isprint = wc_isprint_libc_mb,
358 : .wc_ispunct = wc_ispunct_libc_mb,
359 : .wc_isspace = wc_isspace_libc_mb,
360 : .char_is_cased = char_is_cased_libc,
361 : .char_tolower = char_tolower_libc,
362 : .wc_toupper = toupper_libc_mb,
363 : .wc_tolower = tolower_libc_mb,
364 : };
365 :
366 : static const struct collate_methods collate_methods_libc = {
367 : .strncoll = strncoll_libc,
368 : .strnxfrm = strnxfrm_libc,
369 : .strnxfrm_prefix = NULL,
370 :
371 : /*
372 : * Unfortunately, it seems that strxfrm() for non-C collations is broken
373 : * on many common platforms; testing of multiple versions of glibc reveals
374 : * that, for many locales, strcoll() and strxfrm() do not return
375 : * consistent results. While no other libc other than Cygwin has so far
376 : * been shown to have a problem, we take the conservative course of action
377 : * for right now and disable this categorically. (Users who are certain
378 : * this isn't a problem on their system can define TRUST_STRXFRM.)
379 : */
380 : #ifdef TRUST_STRXFRM
381 : .strxfrm_is_safe = true,
382 : #else
383 : .strxfrm_is_safe = false,
384 : #endif
385 : };
386 :
#ifdef WIN32
/*
 * Windows-only collation methods: comparison goes through
 * strncoll_libc_win32_utf8(), transformation still uses strnxfrm_libc().
 * strxfrm_is_safe is true only when TRUST_STRXFRM is defined.
 */
static const struct collate_methods collate_methods_libc_win32_utf8 = {
	.strncoll = strncoll_libc_win32_utf8,
	.strnxfrm = strnxfrm_libc,
	.strnxfrm_prefix = NULL,
#ifdef TRUST_STRXFRM
	.strxfrm_is_safe = true,
#else
	.strxfrm_is_safe = false,
#endif
};
#endif							/* WIN32 */
399 :
400 : static size_t
401 0 : strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
402 : pg_locale_t locale)
403 : {
404 0 : if (srclen < 0)
405 0 : srclen = strlen(src);
406 :
407 0 : if (srclen + 1 <= destsize)
408 : {
409 0 : locale_t loc = locale->info.lt;
410 : char *p;
411 :
412 0 : if (srclen + 1 > destsize)
413 0 : return srclen;
414 :
415 0 : memcpy(dest, src, srclen);
416 0 : dest[srclen] = '\0';
417 :
418 : /*
419 : * Note: we assume that tolower_l() will not be so broken as to need
420 : * an isupper_l() guard test. When using the default collation, we
421 : * apply the traditional Postgres behavior that forces ASCII-style
422 : * treatment of I/i, but in non-default collations you get exactly
423 : * what the collation says.
424 : */
425 0 : for (p = dest; *p; p++)
426 : {
427 0 : if (locale->is_default)
428 0 : *p = pg_tolower((unsigned char) *p);
429 : else
430 0 : *p = tolower_l((unsigned char) *p, loc);
431 : }
432 : }
433 :
434 0 : return srclen;
435 : }
436 :
437 : static size_t
438 422896 : strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
439 : pg_locale_t locale)
440 : {
441 422896 : locale_t loc = locale->info.lt;
442 : size_t result_size;
443 : wchar_t *workspace;
444 : char *result;
445 : size_t curr_char;
446 : size_t max_size;
447 :
448 422896 : if (srclen < 0)
449 0 : srclen = strlen(src);
450 :
451 : /* Overflow paranoia */
452 422896 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
453 0 : ereport(ERROR,
454 : (errcode(ERRCODE_OUT_OF_MEMORY),
455 : errmsg("out of memory")));
456 :
457 : /* Output workspace cannot have more codes than input bytes */
458 422896 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
459 :
460 422896 : char2wchar(workspace, srclen + 1, src, srclen, locale);
461 :
462 3643230 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
463 3220334 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
464 :
465 : /*
466 : * Make result large enough; case change might change number of bytes
467 : */
468 422896 : max_size = curr_char * pg_database_encoding_max_length();
469 422896 : result = palloc(max_size + 1);
470 :
471 422896 : result_size = wchar2char(result, workspace, max_size + 1, locale);
472 :
473 422896 : if (result_size + 1 > destsize)
474 0 : return result_size;
475 :
476 422896 : memcpy(dest, result, result_size);
477 422896 : dest[result_size] = '\0';
478 :
479 422896 : pfree(workspace);
480 422896 : pfree(result);
481 :
482 422896 : return result_size;
483 : }
484 :
485 : static size_t
486 0 : strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
487 : pg_locale_t locale)
488 : {
489 0 : if (srclen < 0)
490 0 : srclen = strlen(src);
491 :
492 0 : if (srclen + 1 <= destsize)
493 : {
494 0 : locale_t loc = locale->info.lt;
495 0 : int wasalnum = false;
496 : char *p;
497 :
498 0 : memcpy(dest, src, srclen);
499 0 : dest[srclen] = '\0';
500 :
501 : /*
502 : * Note: we assume that toupper_l()/tolower_l() will not be so broken
503 : * as to need guard tests. When using the default collation, we apply
504 : * the traditional Postgres behavior that forces ASCII-style treatment
505 : * of I/i, but in non-default collations you get exactly what the
506 : * collation says.
507 : */
508 0 : for (p = dest; *p; p++)
509 : {
510 0 : if (locale->is_default)
511 : {
512 0 : if (wasalnum)
513 0 : *p = pg_tolower((unsigned char) *p);
514 : else
515 0 : *p = pg_toupper((unsigned char) *p);
516 : }
517 : else
518 : {
519 0 : if (wasalnum)
520 0 : *p = tolower_l((unsigned char) *p, loc);
521 : else
522 0 : *p = toupper_l((unsigned char) *p, loc);
523 : }
524 0 : wasalnum = isalnum_l((unsigned char) *p, loc);
525 : }
526 : }
527 :
528 0 : return srclen;
529 : }
530 :
531 : static size_t
532 8 : strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
533 : pg_locale_t locale)
534 : {
535 8 : locale_t loc = locale->info.lt;
536 8 : int wasalnum = false;
537 : size_t result_size;
538 : wchar_t *workspace;
539 : char *result;
540 : size_t curr_char;
541 : size_t max_size;
542 :
543 8 : if (srclen < 0)
544 0 : srclen = strlen(src);
545 :
546 : /* Overflow paranoia */
547 8 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
548 0 : ereport(ERROR,
549 : (errcode(ERRCODE_OUT_OF_MEMORY),
550 : errmsg("out of memory")));
551 :
552 : /* Output workspace cannot have more codes than input bytes */
553 8 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
554 :
555 8 : char2wchar(workspace, srclen + 1, src, srclen, locale);
556 :
557 80 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
558 : {
559 72 : if (wasalnum)
560 56 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
561 : else
562 16 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
563 72 : wasalnum = iswalnum_l(workspace[curr_char], loc);
564 : }
565 :
566 : /*
567 : * Make result large enough; case change might change number of bytes
568 : */
569 8 : max_size = curr_char * pg_database_encoding_max_length();
570 8 : result = palloc(max_size + 1);
571 :
572 8 : result_size = wchar2char(result, workspace, max_size + 1, locale);
573 :
574 8 : if (result_size + 1 > destsize)
575 0 : return result_size;
576 :
577 8 : memcpy(dest, result, result_size);
578 8 : dest[result_size] = '\0';
579 :
580 8 : pfree(workspace);
581 8 : pfree(result);
582 :
583 8 : return result_size;
584 : }
585 :
586 : static size_t
587 0 : strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
588 : pg_locale_t locale)
589 : {
590 0 : if (srclen < 0)
591 0 : srclen = strlen(src);
592 :
593 0 : if (srclen + 1 <= destsize)
594 : {
595 0 : locale_t loc = locale->info.lt;
596 : char *p;
597 :
598 0 : memcpy(dest, src, srclen);
599 0 : dest[srclen] = '\0';
600 :
601 : /*
602 : * Note: we assume that toupper_l() will not be so broken as to need
603 : * an islower_l() guard test. When using the default collation, we
604 : * apply the traditional Postgres behavior that forces ASCII-style
605 : * treatment of I/i, but in non-default collations you get exactly
606 : * what the collation says.
607 : */
608 0 : for (p = dest; *p; p++)
609 : {
610 0 : if (locale->is_default)
611 0 : *p = pg_toupper((unsigned char) *p);
612 : else
613 0 : *p = toupper_l((unsigned char) *p, loc);
614 : }
615 : }
616 :
617 0 : return srclen;
618 : }
619 :
620 : static size_t
621 717928 : strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
622 : pg_locale_t locale)
623 : {
624 717928 : locale_t loc = locale->info.lt;
625 : size_t result_size;
626 : wchar_t *workspace;
627 : char *result;
628 : size_t curr_char;
629 : size_t max_size;
630 :
631 717928 : if (srclen < 0)
632 0 : srclen = strlen(src);
633 :
634 : /* Overflow paranoia */
635 717928 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
636 0 : ereport(ERROR,
637 : (errcode(ERRCODE_OUT_OF_MEMORY),
638 : errmsg("out of memory")));
639 :
640 : /* Output workspace cannot have more codes than input bytes */
641 717928 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
642 :
643 717928 : char2wchar(workspace, srclen + 1, src, srclen, locale);
644 :
645 2358142 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
646 1640214 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
647 :
648 : /*
649 : * Make result large enough; case change might change number of bytes
650 : */
651 717928 : max_size = curr_char * pg_database_encoding_max_length();
652 717928 : result = palloc(max_size + 1);
653 :
654 717928 : result_size = wchar2char(result, workspace, max_size + 1, locale);
655 :
656 717928 : if (result_size + 1 > destsize)
657 0 : return result_size;
658 :
659 717928 : memcpy(dest, result, result_size);
660 717928 : dest[result_size] = '\0';
661 :
662 717928 : pfree(workspace);
663 717928 : pfree(result);
664 :
665 717928 : return result_size;
666 : }
667 :
668 : pg_locale_t
669 33370 : create_pg_locale_libc(Oid collid, MemoryContext context)
670 : {
671 : const char *collate;
672 : const char *ctype;
673 : locale_t loc;
674 : pg_locale_t result;
675 :
676 33370 : if (collid == DEFAULT_COLLATION_OID)
677 : {
678 : HeapTuple tp;
679 : Datum datum;
680 :
681 29340 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
682 29340 : if (!HeapTupleIsValid(tp))
683 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
684 29340 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
685 : Anum_pg_database_datcollate);
686 29340 : collate = TextDatumGetCString(datum);
687 29340 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
688 : Anum_pg_database_datctype);
689 29340 : ctype = TextDatumGetCString(datum);
690 :
691 29340 : ReleaseSysCache(tp);
692 : }
693 : else
694 : {
695 : HeapTuple tp;
696 : Datum datum;
697 :
698 4030 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
699 4030 : if (!HeapTupleIsValid(tp))
700 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
701 :
702 4030 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
703 : Anum_pg_collation_collcollate);
704 4030 : collate = TextDatumGetCString(datum);
705 4030 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
706 : Anum_pg_collation_collctype);
707 4030 : ctype = TextDatumGetCString(datum);
708 :
709 4030 : ReleaseSysCache(tp);
710 : }
711 :
712 :
713 33370 : loc = make_libc_collator(collate, ctype);
714 :
715 33370 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
716 33370 : result->deterministic = true;
717 61670 : result->collate_is_c = (strcmp(collate, "C") == 0) ||
718 28300 : (strcmp(collate, "POSIX") == 0);
719 61670 : result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
720 28300 : (strcmp(ctype, "POSIX") == 0);
721 33370 : result->info.lt = loc;
722 33370 : if (!result->collate_is_c)
723 : {
724 : #ifdef WIN32
725 : if (GetDatabaseEncoding() == PG_UTF8)
726 : result->collate = &collate_methods_libc_win32_utf8;
727 : else
728 : #endif
729 28236 : result->collate = &collate_methods_libc;
730 : }
731 33370 : if (!result->ctype_is_c)
732 : {
733 28236 : if (GetDatabaseEncoding() == PG_UTF8)
734 28172 : result->ctype = &ctype_methods_libc_utf8;
735 64 : else if (pg_database_encoding_max_length() > 1)
736 0 : result->ctype = &ctype_methods_libc_other_mb;
737 : else
738 64 : result->ctype = &ctype_methods_libc_sb;
739 : }
740 :
741 33370 : return result;
742 : }
743 :
744 : /*
745 : * Create a locale_t with the given collation and ctype.
746 : *
747 : * The "C" and "POSIX" locales are not actually handled by libc, so return
748 : * NULL.
749 : *
750 : * Ensure that no path leaks a locale_t.
751 : */
752 : static locale_t
753 33370 : make_libc_collator(const char *collate, const char *ctype)
754 : {
755 33370 : locale_t loc = 0;
756 :
757 33370 : if (strcmp(collate, ctype) == 0)
758 : {
759 33370 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
760 : {
761 : /* Normal case where they're the same */
762 28236 : errno = 0;
763 : #ifndef WIN32
764 28236 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
765 : NULL);
766 : #else
767 : loc = _create_locale(LC_ALL, collate);
768 : #endif
769 28236 : if (!loc)
770 0 : report_newlocale_failure(collate);
771 : }
772 : }
773 : else
774 : {
775 : #ifndef WIN32
776 : /* We need two newlocale() steps */
777 0 : locale_t loc1 = 0;
778 :
779 0 : if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
780 : {
781 0 : errno = 0;
782 0 : loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
783 0 : if (!loc1)
784 0 : report_newlocale_failure(collate);
785 : }
786 :
787 0 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
788 : {
789 0 : errno = 0;
790 0 : loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
791 0 : if (!loc)
792 : {
793 0 : if (loc1)
794 0 : freelocale(loc1);
795 0 : report_newlocale_failure(ctype);
796 : }
797 : }
798 : else
799 0 : loc = loc1;
800 : #else
801 :
802 : /*
803 : * XXX The _create_locale() API doesn't appear to support this. Could
804 : * perhaps be worked around by changing pg_locale_t to contain two
805 : * separate fields.
806 : */
807 : ereport(ERROR,
808 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
809 : errmsg("collations with different collate and ctype values are not supported on this platform")));
810 : #endif
811 : }
812 :
813 33370 : return loc;
814 : }
815 :
816 : /*
817 : * strncoll_libc
818 : *
819 : * NUL-terminate arguments, if necessary, and pass to strcoll_l().
820 : *
821 : * An input string length of -1 means that it's already NUL-terminated.
822 : */
823 : int
824 29417928 : strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
825 : pg_locale_t locale)
826 : {
827 : char sbuf[TEXTBUFLEN];
828 29417928 : char *buf = sbuf;
829 29417928 : size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
830 29417928 : size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
831 : const char *arg1n;
832 : const char *arg2n;
833 : int result;
834 :
835 29417928 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
836 568 : buf = palloc(bufsize1 + bufsize2);
837 :
838 : /* nul-terminate arguments if necessary */
839 29417928 : if (len1 == -1)
840 : {
841 24977486 : arg1n = arg1;
842 : }
843 : else
844 : {
845 4440442 : char *buf1 = buf;
846 :
847 4440442 : memcpy(buf1, arg1, len1);
848 4440442 : buf1[len1] = '\0';
849 4440442 : arg1n = buf1;
850 : }
851 :
852 29417928 : if (len2 == -1)
853 : {
854 24977486 : arg2n = arg2;
855 : }
856 : else
857 : {
858 4440442 : char *buf2 = buf + bufsize1;
859 :
860 4440442 : memcpy(buf2, arg2, len2);
861 4440442 : buf2[len2] = '\0';
862 4440442 : arg2n = buf2;
863 : }
864 :
865 29417928 : result = strcoll_l(arg1n, arg2n, locale->info.lt);
866 :
867 29417928 : if (buf != sbuf)
868 568 : pfree(buf);
869 :
870 29417928 : return result;
871 : }
872 :
873 : /*
874 : * strnxfrm_libc
875 : *
876 : * NUL-terminate src, if necessary, and pass to strxfrm_l().
877 : *
878 : * A source length of -1 means that it's already NUL-terminated.
879 : */
880 : size_t
881 144 : strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
882 : pg_locale_t locale)
883 : {
884 : char sbuf[TEXTBUFLEN];
885 144 : char *buf = sbuf;
886 144 : size_t bufsize = srclen + 1;
887 : size_t result;
888 :
889 144 : if (srclen == -1)
890 144 : return strxfrm_l(dest, src, destsize, locale->info.lt);
891 :
892 0 : if (bufsize > TEXTBUFLEN)
893 0 : buf = palloc(bufsize);
894 :
895 : /* nul-terminate argument */
896 0 : memcpy(buf, src, srclen);
897 0 : buf[srclen] = '\0';
898 :
899 0 : result = strxfrm_l(dest, buf, destsize, locale->info.lt);
900 :
901 0 : if (buf != sbuf)
902 0 : pfree(buf);
903 :
904 : /* if dest is defined, it should be nul-terminated */
905 : Assert(result >= destsize || dest[result] == '\0');
906 :
907 0 : return result;
908 : }
909 :
/*
 * get_collation_actual_version_libc
 *
 * Return a palloc'd version string for the named libc collation, or NULL
 * when none is available.
 *
 * "C", "POSIX" and names starting with "C." (all compared
 * case-insensitively) never get a version.  Otherwise the source depends on
 * the platform: the glibc version string, the querylocale() version where
 * LC_VERSION_MASK is defined, or GetNLSVersionEx() on Windows.  Platforms
 * matching none of these return NULL.
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *collversion = NULL;

	/* C-like locales are unversioned */
	if (pg_strcasecmp("C", collcollate) == 0 ||
		pg_strncasecmp("C.", collcollate, 2) == 0 ||
		pg_strcasecmp("POSIX", collcollate) == 0)
		return collversion;

#if defined(__GLIBC__)
	/* Use the glibc version because we don't have anything better. */
	collversion = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
	{
		/* Look up FreeBSD collation version. */
		locale_t	loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);

		if (!loc)
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));

		collversion =
			pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
		freelocale(loc);
	}
#elif defined(WIN32)
	{
		/*
		 * If we are targeting Windows Vista and above, we can ask for a name
		 * given a collation name (earlier versions required a location code
		 * that we don't have).
		 */
		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];

		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
							LOCALE_NAME_MAX_LENGTH);
		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
		{
			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until those
			 * values can be prevented from entering the system, or 100%
			 * reliably converted to the more useful tag format, tolerate
			 * the resulting error and report that we have no version data.
			 */
			if (GetLastError() == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							GetLastError())));
		}
		collversion = psprintf("%lu.%lu,%lu.%lu",
							   (version.dwNLSVersion >> 8) & 0xFFFF,
							   version.dwNLSVersion & 0xFF,
							   (version.dwDefinedVersion >> 8) & 0xFFFF,
							   version.dwDefinedVersion & 0xFF);
	}
#endif

	return collversion;
}
974 :
#ifdef WIN32
/*
 * Convert srclen bytes of UTF-8 at src into UTF-16 at dst (room for
 * dstchars wide characters) and terminate the output with a zero element.
 * Empty input skips the conversion call; a failed conversion raises an
 * error.
 */
static void
win32_utf8_to_utf16(const char *src, ssize_t srclen, LPWSTR dst, int dstchars)
{
	int			nchars = 0;

	/* API does not work for zero-length input */
	if (srclen != 0)
	{
		nchars = MultiByteToWideChar(CP_UTF8, 0, src, srclen,
									 dst, dstchars);
		if (!nchars)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	dst[nchars] = 0;
}

/*
 * strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8.  Convert the UTF8 arguments to wide characters
 * and compare them with wcscoll_l().
 *
 * An input string length of -1 means that it's NUL-terminated.
 */
static int
strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
						 ssize_t len2, pg_locale_t locale)
{
	char		stackbuf[TEXTBUFLEN];
	char	   *scratch = stackbuf;
	LPWSTR		wide1;
	LPWSTR		wide2;
	int			wide1_bytes;
	int			wide2_bytes;
	int			result;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	if (len1 == -1)
		len1 = strlen(arg1);
	if (len2 == -1)
		len2 = strlen(arg2);

	/* two bytes per input byte, plus two for the terminator */
	wide1_bytes = len1 * 2 + 2;
	wide2_bytes = len2 * 2 + 2;

	if (wide1_bytes + wide2_bytes > TEXTBUFLEN)
		scratch = palloc(wide1_bytes + wide2_bytes);

	wide1 = (LPWSTR) scratch;
	wide2 = (LPWSTR) (scratch + wide1_bytes);

	win32_utf8_to_utf16(arg1, len1, wide1, wide1_bytes / 2);
	win32_utf8_to_utf16(arg2, len2, wide2, wide2_bytes / 2);

	errno = 0;
	result = wcscoll_l(wide1, wide2, locale->info.lt);
	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (scratch != stackbuf)
		pfree(scratch);

	return result;
}
#endif							/* WIN32 */
1052 :
1053 : /* simple subroutine for reporting errors from newlocale() */
1054 : void
1055 0 : report_newlocale_failure(const char *localename)
1056 : {
1057 : int save_errno;
1058 :
1059 : /*
1060 : * Windows doesn't provide any useful error indication from
1061 : * _create_locale(), and BSD-derived platforms don't seem to feel they
1062 : * need to set errno either (even though POSIX is pretty clear that
1063 : * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1064 : * is what to report.
1065 : */
1066 0 : if (errno == 0)
1067 0 : errno = ENOENT;
1068 :
1069 : /*
1070 : * ENOENT means "no such locale", not "no such file", so clarify that
1071 : * errno with an errdetail message.
1072 : */
1073 0 : save_errno = errno; /* auxiliary funcs might change errno */
1074 0 : ereport(ERROR,
1075 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1076 : errmsg("could not create locale \"%s\": %m",
1077 : localename),
1078 : (save_errno == ENOENT ?
1079 : errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1080 : localename) : 0)));
1081 : }
1082 :
1083 : /*
1084 : * POSIX doesn't define _l-variants of these functions, but several systems
1085 : * have them. We provide our own replacements here.
1086 : */
#ifndef HAVE_MBSTOWCS_L
/*
 * mbstowcs_l --- replacement for platforms without a native version
 *
 * Converts the multibyte string "src" into at most "n" wide characters at
 * "dest", under locale "loc".  Returns whatever the underlying conversion
 * function returns.
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _mbstowcs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nconverted;

	/* Switch to the requested locale just for the conversion */
	prev_locale = uselocale(loc);
	nconverted = mbstowcs(dest, src, n);

	/* Put back whatever locale was in effect before */
	uselocale(prev_locale);

	return nconverted;
#endif
}
#endif
#ifndef HAVE_WCSTOMBS_L
/*
 * wcstombs_l --- replacement for platforms without a native version
 *
 * Converts the wide-character string "src" into at most "n" bytes at
 * "dest", under locale "loc".  Returns whatever the underlying conversion
 * function returns.
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _wcstombs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nconverted;

	/* Switch to the requested locale just for the conversion */
	prev_locale = uselocale(loc);
	nconverted = wcstombs(dest, src, n);

	/* Put back whatever locale was in effect before */
	uselocale(prev_locale);

	return nconverted;
#endif
}
#endif
1119 :
1120 : /*
1121 : * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1122 : * Therefore we keep them here rather than with the mbutils code.
1123 : */
1124 :
1125 : /*
1126 : * wchar2char --- convert wide characters to multibyte format
1127 : *
1128 : * This has the same API as the standard wcstombs_l() function; in particular,
1129 : * tolen is the maximum number of bytes to store at *to, and *from must be
1130 : * zero-terminated. The output will be zero-terminated iff there is room.
1131 : */
1132 : size_t
1133 1140832 : wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
1134 : {
1135 : size_t result;
1136 :
1137 1140832 : if (tolen == 0)
1138 0 : return 0;
1139 :
1140 : #ifdef WIN32
1141 :
1142 : /*
1143 : * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1144 : * for some reason mbstowcs and wcstombs won't do this for us, so we use
1145 : * MultiByteToWideChar().
1146 : */
1147 : if (GetDatabaseEncoding() == PG_UTF8)
1148 : {
1149 : result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1150 : NULL, NULL);
1151 : /* A zero return is failure */
1152 : if (result <= 0)
1153 : result = -1;
1154 : else
1155 : {
1156 : Assert(result <= tolen);
1157 : /* Microsoft counts the zero terminator in the result */
1158 : result--;
1159 : }
1160 : }
1161 : else
1162 : #endif /* WIN32 */
1163 1140832 : if (locale == (pg_locale_t) 0)
1164 : {
1165 : /* Use wcstombs directly for the default locale */
1166 0 : result = wcstombs(to, from, tolen);
1167 : }
1168 : else
1169 : {
1170 : /* Use wcstombs_l for nondefault locales */
1171 1140832 : result = wcstombs_l(to, from, tolen, locale->info.lt);
1172 : }
1173 :
1174 1140832 : return result;
1175 : }
1176 :
1177 : /*
1178 : * char2wchar --- convert multibyte characters to wide characters
1179 : *
1180 : * This has almost the API of mbstowcs_l(), except that *from need not be
1181 : * null-terminated; instead, the number of input bytes is specified as
1182 : * fromlen. Also, we ereport() rather than returning -1 for invalid
1183 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1184 : * The output will be zero-terminated iff there is room.
1185 : */
1186 : size_t
1187 1144004 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1188 : pg_locale_t locale)
1189 : {
1190 : size_t result;
1191 :
1192 1144004 : if (tolen == 0)
1193 0 : return 0;
1194 :
1195 : #ifdef WIN32
1196 : /* See WIN32 "Unicode" comment above */
1197 : if (GetDatabaseEncoding() == PG_UTF8)
1198 : {
1199 : /* Win32 API does not work for zero-length input */
1200 : if (fromlen == 0)
1201 : result = 0;
1202 : else
1203 : {
1204 : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1205 : /* A zero return is failure */
1206 : if (result == 0)
1207 : result = -1;
1208 : }
1209 :
1210 : if (result != -1)
1211 : {
1212 : Assert(result < tolen);
1213 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
1214 : to[result] = 0;
1215 : }
1216 : }
1217 : else
1218 : #endif /* WIN32 */
1219 : {
1220 : /* mbstowcs requires ending '\0' */
1221 1144004 : char *str = pnstrdup(from, fromlen);
1222 :
1223 1144004 : if (locale == (pg_locale_t) 0)
1224 : {
1225 : /* Use mbstowcs directly for the default locale */
1226 3172 : result = mbstowcs(to, str, tolen);
1227 : }
1228 : else
1229 : {
1230 : /* Use mbstowcs_l for nondefault locales */
1231 1140832 : result = mbstowcs_l(to, str, tolen, locale->info.lt);
1232 : }
1233 :
1234 1144004 : pfree(str);
1235 : }
1236 :
1237 1144004 : if (result == -1)
1238 : {
1239 : /*
1240 : * Invalid multibyte character encountered. We try to give a useful
1241 : * error message by letting pg_verifymbstr check the string. But it's
1242 : * possible that the string is OK to us, and not OK to mbstowcs ---
1243 : * this suggests that the LC_CTYPE locale is different from the
1244 : * database encoding. Give a generic error message if pg_verifymbstr
1245 : * can't find anything wrong.
1246 : */
1247 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
1248 : /* but if it does ... */
1249 0 : ereport(ERROR,
1250 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1251 : errmsg("invalid multibyte character for locale"),
1252 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1253 : }
1254 :
1255 1144004 : return result;
1256 : }
|