Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for libc
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_libc.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include <limits.h>
15 : #include <wctype.h>
16 :
17 : #include "access/htup_details.h"
18 : #include "catalog/pg_database.h"
19 : #include "catalog/pg_collation.h"
20 : #include "mb/pg_wchar.h"
21 : #include "miscadmin.h"
22 : #include "utils/builtins.h"
23 : #include "utils/formatting.h"
24 : #include "utils/memutils.h"
25 : #include "utils/pg_locale.h"
26 : #include "utils/syscache.h"
27 :
28 : #ifdef __GLIBC__
29 : #include <gnu/libc-version.h>
30 : #endif
31 :
32 : #ifdef WIN32
33 : #include <shlwapi.h>
34 : #endif
35 :
36 : /*
37 : * Size of stack buffer to use for string transformations, used to avoid heap
38 : * allocations in typical cases. This should be large enough that most strings
39 : * will fit, but small enough that we feel comfortable putting it on the
40 : * stack.
41 : */
42 : #define TEXTBUFLEN 1024
43 :
44 : extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
45 :
46 : extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
47 : ssize_t srclen, pg_locale_t locale);
48 : extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src,
49 : ssize_t srclen, pg_locale_t locale);
50 : extern size_t strupper_libc(char *dst, size_t dstsize, const char *src,
51 : ssize_t srclen, pg_locale_t locale);
52 :
53 : static int strncoll_libc(const char *arg1, ssize_t len1,
54 : const char *arg2, ssize_t len2,
55 : pg_locale_t locale);
56 : static size_t strnxfrm_libc(char *dest, size_t destsize,
57 : const char *src, ssize_t srclen,
58 : pg_locale_t locale);
59 : extern char *get_collation_actual_version_libc(const char *collcollate);
60 : static locale_t make_libc_collator(const char *collate,
61 : const char *ctype);
62 :
63 : #ifdef WIN32
64 : static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
65 : const char *arg2, ssize_t len2,
66 : pg_locale_t locale);
67 : #endif
68 :
69 : static size_t strlower_libc_sb(char *dest, size_t destsize,
70 : const char *src, ssize_t srclen,
71 : pg_locale_t locale);
72 : static size_t strlower_libc_mb(char *dest, size_t destsize,
73 : const char *src, ssize_t srclen,
74 : pg_locale_t locale);
75 : static size_t strtitle_libc_sb(char *dest, size_t destsize,
76 : const char *src, ssize_t srclen,
77 : pg_locale_t locale);
78 : static size_t strtitle_libc_mb(char *dest, size_t destsize,
79 : const char *src, ssize_t srclen,
80 : pg_locale_t locale);
81 : static size_t strupper_libc_sb(char *dest, size_t destsize,
82 : const char *src, ssize_t srclen,
83 : pg_locale_t locale);
84 : static size_t strupper_libc_mb(char *dest, size_t destsize,
85 : const char *src, ssize_t srclen,
86 : pg_locale_t locale);
87 :
88 : static const struct collate_methods collate_methods_libc = {
89 : .strncoll = strncoll_libc,
90 : .strnxfrm = strnxfrm_libc,
91 : .strnxfrm_prefix = NULL,
92 :
93 : /*
94 : * Unfortunately, it seems that strxfrm() for non-C collations is broken
95 : * on many common platforms; testing of multiple versions of glibc reveals
96 : * that, for many locales, strcoll() and strxfrm() do not return
97 : * consistent results. While no other libc other than Cygwin has so far
98 : * been shown to have a problem, we take the conservative course of action
99 : * for right now and disable this categorically. (Users who are certain
100 : * this isn't a problem on their system can define TRUST_STRXFRM.)
101 : */
102 : #ifdef TRUST_STRXFRM
103 : .strxfrm_is_safe = true,
104 : #else
105 : .strxfrm_is_safe = false,
106 : #endif
107 : };
108 :
#ifdef WIN32
/*
 * Collation callbacks for libc locales on Windows when the database encoding
 * is UTF-8.  These match collate_methods_libc except that strncoll goes
 * through strncoll_libc_win32_utf8.  strxfrm_is_safe is governed by
 * TRUST_STRXFRM in the same way.
 */
static const struct collate_methods collate_methods_libc_win32_utf8 = {
#ifdef TRUST_STRXFRM
	.strxfrm_is_safe = true,
#else
	.strxfrm_is_safe = false,
#endif

	.strncoll = strncoll_libc_win32_utf8,
	.strnxfrm = strnxfrm_libc,
	.strnxfrm_prefix = NULL,
};
#endif
121 :
122 : size_t
123 422860 : strlower_libc(char *dst, size_t dstsize, const char *src,
124 : ssize_t srclen, pg_locale_t locale)
125 : {
126 422860 : if (pg_database_encoding_max_length() > 1)
127 422860 : return strlower_libc_mb(dst, dstsize, src, srclen, locale);
128 : else
129 0 : return strlower_libc_sb(dst, dstsize, src, srclen, locale);
130 : }
131 :
132 : size_t
133 8 : strtitle_libc(char *dst, size_t dstsize, const char *src,
134 : ssize_t srclen, pg_locale_t locale)
135 : {
136 8 : if (pg_database_encoding_max_length() > 1)
137 8 : return strtitle_libc_mb(dst, dstsize, src, srclen, locale);
138 : else
139 0 : return strtitle_libc_sb(dst, dstsize, src, srclen, locale);
140 : }
141 :
142 : size_t
143 717164 : strupper_libc(char *dst, size_t dstsize, const char *src,
144 : ssize_t srclen, pg_locale_t locale)
145 : {
146 717164 : if (pg_database_encoding_max_length() > 1)
147 717164 : return strupper_libc_mb(dst, dstsize, src, srclen, locale);
148 : else
149 0 : return strupper_libc_sb(dst, dstsize, src, srclen, locale);
150 : }
151 :
152 : static size_t
153 0 : strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
154 : pg_locale_t locale)
155 : {
156 0 : if (srclen < 0)
157 0 : srclen = strlen(src);
158 :
159 0 : if (srclen + 1 <= destsize)
160 : {
161 0 : locale_t loc = locale->info.lt;
162 : char *p;
163 :
164 0 : if (srclen + 1 > destsize)
165 0 : return srclen;
166 :
167 0 : memcpy(dest, src, srclen);
168 0 : dest[srclen] = '\0';
169 :
170 : /*
171 : * Note: we assume that tolower_l() will not be so broken as to need
172 : * an isupper_l() guard test. When using the default collation, we
173 : * apply the traditional Postgres behavior that forces ASCII-style
174 : * treatment of I/i, but in non-default collations you get exactly
175 : * what the collation says.
176 : */
177 0 : for (p = dest; *p; p++)
178 : {
179 0 : if (locale->is_default)
180 0 : *p = pg_tolower((unsigned char) *p);
181 : else
182 0 : *p = tolower_l((unsigned char) *p, loc);
183 : }
184 : }
185 :
186 0 : return srclen;
187 : }
188 :
189 : static size_t
190 422860 : strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
191 : pg_locale_t locale)
192 : {
193 422860 : locale_t loc = locale->info.lt;
194 : size_t result_size;
195 : wchar_t *workspace;
196 : char *result;
197 : size_t curr_char;
198 : size_t max_size;
199 :
200 422860 : if (srclen < 0)
201 0 : srclen = strlen(src);
202 :
203 : /* Overflow paranoia */
204 422860 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
205 0 : ereport(ERROR,
206 : (errcode(ERRCODE_OUT_OF_MEMORY),
207 : errmsg("out of memory")));
208 :
209 : /* Output workspace cannot have more codes than input bytes */
210 422860 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
211 :
212 422860 : char2wchar(workspace, srclen + 1, src, srclen, locale);
213 :
214 3642632 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
215 3219772 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
216 :
217 : /*
218 : * Make result large enough; case change might change number of bytes
219 : */
220 422860 : max_size = curr_char * pg_database_encoding_max_length();
221 422860 : result = palloc(max_size + 1);
222 :
223 422860 : result_size = wchar2char(result, workspace, max_size + 1, locale);
224 :
225 422860 : if (result_size + 1 > destsize)
226 0 : return result_size;
227 :
228 422860 : memcpy(dest, result, result_size);
229 422860 : dest[result_size] = '\0';
230 :
231 422860 : pfree(workspace);
232 422860 : pfree(result);
233 :
234 422860 : return result_size;
235 : }
236 :
237 : static size_t
238 0 : strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
239 : pg_locale_t locale)
240 : {
241 0 : if (srclen < 0)
242 0 : srclen = strlen(src);
243 :
244 0 : if (srclen + 1 <= destsize)
245 : {
246 0 : locale_t loc = locale->info.lt;
247 0 : int wasalnum = false;
248 : char *p;
249 :
250 0 : memcpy(dest, src, srclen);
251 0 : dest[srclen] = '\0';
252 :
253 : /*
254 : * Note: we assume that toupper_l()/tolower_l() will not be so broken
255 : * as to need guard tests. When using the default collation, we apply
256 : * the traditional Postgres behavior that forces ASCII-style treatment
257 : * of I/i, but in non-default collations you get exactly what the
258 : * collation says.
259 : */
260 0 : for (p = dest; *p; p++)
261 : {
262 0 : if (locale->is_default)
263 : {
264 0 : if (wasalnum)
265 0 : *p = pg_tolower((unsigned char) *p);
266 : else
267 0 : *p = pg_toupper((unsigned char) *p);
268 : }
269 : else
270 : {
271 0 : if (wasalnum)
272 0 : *p = tolower_l((unsigned char) *p, loc);
273 : else
274 0 : *p = toupper_l((unsigned char) *p, loc);
275 : }
276 0 : wasalnum = isalnum_l((unsigned char) *p, loc);
277 : }
278 : }
279 :
280 0 : return srclen;
281 : }
282 :
283 : static size_t
284 8 : strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
285 : pg_locale_t locale)
286 : {
287 8 : locale_t loc = locale->info.lt;
288 8 : int wasalnum = false;
289 : size_t result_size;
290 : wchar_t *workspace;
291 : char *result;
292 : size_t curr_char;
293 : size_t max_size;
294 :
295 8 : if (srclen < 0)
296 0 : srclen = strlen(src);
297 :
298 : /* Overflow paranoia */
299 8 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
300 0 : ereport(ERROR,
301 : (errcode(ERRCODE_OUT_OF_MEMORY),
302 : errmsg("out of memory")));
303 :
304 : /* Output workspace cannot have more codes than input bytes */
305 8 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
306 :
307 8 : char2wchar(workspace, srclen + 1, src, srclen, locale);
308 :
309 80 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
310 : {
311 72 : if (wasalnum)
312 56 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
313 : else
314 16 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
315 72 : wasalnum = iswalnum_l(workspace[curr_char], loc);
316 : }
317 :
318 : /*
319 : * Make result large enough; case change might change number of bytes
320 : */
321 8 : max_size = curr_char * pg_database_encoding_max_length();
322 8 : result = palloc(max_size + 1);
323 :
324 8 : result_size = wchar2char(result, workspace, max_size + 1, locale);
325 :
326 8 : if (result_size + 1 > destsize)
327 0 : return result_size;
328 :
329 8 : memcpy(dest, result, result_size);
330 8 : dest[result_size] = '\0';
331 :
332 8 : pfree(workspace);
333 8 : pfree(result);
334 :
335 8 : return result_size;
336 : }
337 :
338 : static size_t
339 0 : strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
340 : pg_locale_t locale)
341 : {
342 0 : if (srclen < 0)
343 0 : srclen = strlen(src);
344 :
345 0 : if (srclen + 1 <= destsize)
346 : {
347 0 : locale_t loc = locale->info.lt;
348 : char *p;
349 :
350 0 : memcpy(dest, src, srclen);
351 0 : dest[srclen] = '\0';
352 :
353 : /*
354 : * Note: we assume that toupper_l() will not be so broken as to need
355 : * an islower_l() guard test. When using the default collation, we
356 : * apply the traditional Postgres behavior that forces ASCII-style
357 : * treatment of I/i, but in non-default collations you get exactly
358 : * what the collation says.
359 : */
360 0 : for (p = dest; *p; p++)
361 : {
362 0 : if (locale->is_default)
363 0 : *p = pg_toupper((unsigned char) *p);
364 : else
365 0 : *p = toupper_l((unsigned char) *p, loc);
366 : }
367 : }
368 :
369 0 : return srclen;
370 : }
371 :
372 : static size_t
373 717164 : strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
374 : pg_locale_t locale)
375 : {
376 717164 : locale_t loc = locale->info.lt;
377 : size_t result_size;
378 : wchar_t *workspace;
379 : char *result;
380 : size_t curr_char;
381 : size_t max_size;
382 :
383 717164 : if (srclen < 0)
384 0 : srclen = strlen(src);
385 :
386 : /* Overflow paranoia */
387 717164 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
388 0 : ereport(ERROR,
389 : (errcode(ERRCODE_OUT_OF_MEMORY),
390 : errmsg("out of memory")));
391 :
392 : /* Output workspace cannot have more codes than input bytes */
393 717164 : workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
394 :
395 717164 : char2wchar(workspace, srclen + 1, src, srclen, locale);
396 :
397 2352796 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
398 1635632 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
399 :
400 : /*
401 : * Make result large enough; case change might change number of bytes
402 : */
403 717164 : max_size = curr_char * pg_database_encoding_max_length();
404 717164 : result = palloc(max_size + 1);
405 :
406 717164 : result_size = wchar2char(result, workspace, max_size + 1, locale);
407 :
408 717164 : if (result_size + 1 > destsize)
409 0 : return result_size;
410 :
411 717164 : memcpy(dest, result, result_size);
412 717164 : dest[result_size] = '\0';
413 :
414 717164 : pfree(workspace);
415 717164 : pfree(result);
416 :
417 717164 : return result_size;
418 : }
419 :
420 : pg_locale_t
421 33890 : create_pg_locale_libc(Oid collid, MemoryContext context)
422 : {
423 : const char *collate;
424 : const char *ctype;
425 : locale_t loc;
426 : pg_locale_t result;
427 :
428 33890 : if (collid == DEFAULT_COLLATION_OID)
429 : {
430 : HeapTuple tp;
431 : Datum datum;
432 :
433 30140 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
434 30140 : if (!HeapTupleIsValid(tp))
435 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
436 30140 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
437 : Anum_pg_database_datcollate);
438 30140 : collate = TextDatumGetCString(datum);
439 30140 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
440 : Anum_pg_database_datctype);
441 30140 : ctype = TextDatumGetCString(datum);
442 :
443 30140 : ReleaseSysCache(tp);
444 : }
445 : else
446 : {
447 : HeapTuple tp;
448 : Datum datum;
449 :
450 3750 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
451 3750 : if (!HeapTupleIsValid(tp))
452 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
453 :
454 3750 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
455 : Anum_pg_collation_collcollate);
456 3750 : collate = TextDatumGetCString(datum);
457 3750 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
458 : Anum_pg_collation_collctype);
459 3750 : ctype = TextDatumGetCString(datum);
460 :
461 3750 : ReleaseSysCache(tp);
462 : }
463 :
464 :
465 33890 : loc = make_libc_collator(collate, ctype);
466 :
467 33890 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
468 33890 : result->provider = COLLPROVIDER_LIBC;
469 33890 : result->deterministic = true;
470 62998 : result->collate_is_c = (strcmp(collate, "C") == 0) ||
471 29108 : (strcmp(collate, "POSIX") == 0);
472 62998 : result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
473 29108 : (strcmp(ctype, "POSIX") == 0);
474 33890 : result->info.lt = loc;
475 33890 : if (!result->collate_is_c)
476 : {
477 : #ifdef WIN32
478 : if (GetDatabaseEncoding() == PG_UTF8)
479 : result->collate = &collate_methods_libc_win32_utf8;
480 : else
481 : #endif
482 29044 : result->collate = &collate_methods_libc;
483 : }
484 :
485 33890 : return result;
486 : }
487 :
488 : /*
489 : * Create a locale_t with the given collation and ctype.
490 : *
491 : * The "C" and "POSIX" locales are not actually handled by libc, so return
492 : * NULL.
493 : *
494 : * Ensure that no path leaks a locale_t.
495 : */
496 : static locale_t
497 33890 : make_libc_collator(const char *collate, const char *ctype)
498 : {
499 33890 : locale_t loc = 0;
500 :
501 33890 : if (strcmp(collate, ctype) == 0)
502 : {
503 33890 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
504 : {
505 : /* Normal case where they're the same */
506 29044 : errno = 0;
507 : #ifndef WIN32
508 29044 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
509 : NULL);
510 : #else
511 : loc = _create_locale(LC_ALL, collate);
512 : #endif
513 29044 : if (!loc)
514 0 : report_newlocale_failure(collate);
515 : }
516 : }
517 : else
518 : {
519 : #ifndef WIN32
520 : /* We need two newlocale() steps */
521 0 : locale_t loc1 = 0;
522 :
523 0 : if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
524 : {
525 0 : errno = 0;
526 0 : loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
527 0 : if (!loc1)
528 0 : report_newlocale_failure(collate);
529 : }
530 :
531 0 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
532 : {
533 0 : errno = 0;
534 0 : loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
535 0 : if (!loc)
536 : {
537 0 : if (loc1)
538 0 : freelocale(loc1);
539 0 : report_newlocale_failure(ctype);
540 : }
541 : }
542 : else
543 0 : loc = loc1;
544 : #else
545 :
546 : /*
547 : * XXX The _create_locale() API doesn't appear to support this. Could
548 : * perhaps be worked around by changing pg_locale_t to contain two
549 : * separate fields.
550 : */
551 : ereport(ERROR,
552 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
553 : errmsg("collations with different collate and ctype values are not supported on this platform")));
554 : #endif
555 : }
556 :
557 33890 : return loc;
558 : }
559 :
560 : /*
561 : * strncoll_libc
562 : *
563 : * NUL-terminate arguments, if necessary, and pass to strcoll_l().
564 : *
565 : * An input string length of -1 means that it's already NUL-terminated.
566 : */
567 : int
568 29624396 : strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
569 : pg_locale_t locale)
570 : {
571 : char sbuf[TEXTBUFLEN];
572 29624396 : char *buf = sbuf;
573 29624396 : size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
574 29624396 : size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
575 : const char *arg1n;
576 : const char *arg2n;
577 : int result;
578 :
579 : Assert(locale->provider == COLLPROVIDER_LIBC);
580 :
581 29624396 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
582 360 : buf = palloc(bufsize1 + bufsize2);
583 :
584 : /* nul-terminate arguments if necessary */
585 29624396 : if (len1 == -1)
586 : {
587 25177110 : arg1n = arg1;
588 : }
589 : else
590 : {
591 4447286 : char *buf1 = buf;
592 :
593 4447286 : memcpy(buf1, arg1, len1);
594 4447286 : buf1[len1] = '\0';
595 4447286 : arg1n = buf1;
596 : }
597 :
598 29624396 : if (len2 == -1)
599 : {
600 25177110 : arg2n = arg2;
601 : }
602 : else
603 : {
604 4447286 : char *buf2 = buf + bufsize1;
605 :
606 4447286 : memcpy(buf2, arg2, len2);
607 4447286 : buf2[len2] = '\0';
608 4447286 : arg2n = buf2;
609 : }
610 :
611 29624396 : result = strcoll_l(arg1n, arg2n, locale->info.lt);
612 :
613 29624396 : if (buf != sbuf)
614 360 : pfree(buf);
615 :
616 29624396 : return result;
617 : }
618 :
619 : /*
620 : * strnxfrm_libc
621 : *
622 : * NUL-terminate src, if necessary, and pass to strxfrm_l().
623 : *
624 : * A source length of -1 means that it's already NUL-terminated.
625 : */
626 : size_t
627 144 : strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
628 : pg_locale_t locale)
629 : {
630 : char sbuf[TEXTBUFLEN];
631 144 : char *buf = sbuf;
632 144 : size_t bufsize = srclen + 1;
633 : size_t result;
634 :
635 : Assert(locale->provider == COLLPROVIDER_LIBC);
636 :
637 144 : if (srclen == -1)
638 144 : return strxfrm_l(dest, src, destsize, locale->info.lt);
639 :
640 0 : if (bufsize > TEXTBUFLEN)
641 0 : buf = palloc(bufsize);
642 :
643 : /* nul-terminate argument */
644 0 : memcpy(buf, src, srclen);
645 0 : buf[srclen] = '\0';
646 :
647 0 : result = strxfrm_l(dest, buf, destsize, locale->info.lt);
648 :
649 0 : if (buf != sbuf)
650 0 : pfree(buf);
651 :
652 : /* if dest is defined, it should be nul-terminated */
653 : Assert(result >= destsize || dest[result] == '\0');
654 :
655 0 : return result;
656 : }
657 :
/*
 * get_collation_actual_version_libc
 *
 * Return a palloc'd version string for the libc collation "collcollate", or
 * NULL when no version is available.  NULL is returned for "C", names
 * starting with "C.", and "POSIX" (all compared case-insensitively), and on
 * platforms for which none of the branches below is compiled.
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *version_str = NULL;
	bool		is_c_like;

	is_c_like = (pg_strcasecmp("C", collcollate) == 0 ||
				 pg_strncasecmp("C.", collcollate, 2) == 0 ||
				 pg_strcasecmp("POSIX", collcollate) == 0);

	if (!is_c_like)
	{
#if defined(__GLIBC__)
		/* glibc's own version is the best information available */
		version_str = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
		/* FreeBSD: ask the loaded locale for its collation version */
		locale_t	loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);

		if (!loc)
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));

		version_str =
			pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
		freelocale(loc);
#elif defined(WIN32)

		/*
		 * On Windows Vista and above a version can be requested by
		 * collation name (earlier versions required a location code that
		 * we don't have).
		 */
		NLSVERSIONINFOEX nls = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wname[LOCALE_NAME_MAX_LENGTH];

		/* NOTE(review): the conversion's return value is not checked */
		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wname,
							LOCALE_NAME_MAX_LENGTH);
		if (!GetNLSVersionEx(COMPARE_STRING, wname, &nls))
		{
			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until such
			 * names can be kept out of the system, or reliably converted to
			 * the tag format, tolerate the resulting error and report that
			 * no version data is available.
			 */
			if (GetLastError() == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							GetLastError())));
		}
		version_str = psprintf("%lu.%lu,%lu.%lu",
							   (nls.dwNLSVersion >> 8) & 0xFFFF,
							   nls.dwNLSVersion & 0xFF,
							   (nls.dwDefinedVersion >> 8) & 0xFFFF,
							   nls.dwDefinedVersion & 0xFF);
#endif
	}

	return version_str;
}
722 :
/*
 * strncoll_libc_win32_utf8
 *
 * Collation for UTF-8 databases on Windows: both arguments are converted
 * from UTF-8 to wide characters and compared with wcscoll_l().
 *
 * An input string length of -1 means that it's NUL-terminated.
 */
#ifdef WIN32

/*
 * Convert srclen bytes of UTF-8 at src into wide characters at dst, which
 * has room for dstchars wide characters, and zero-terminate the output.
 * Raises an error if the conversion fails.
 */
static void
win32_utf8_to_wide(const char *src, ssize_t srclen, LPWSTR dst, int dstchars)
{
	int			nwide = 0;

	/* API does not work for zero-length input */
	if (srclen != 0)
	{
		nwide = MultiByteToWideChar(CP_UTF8, 0, src, srclen,
									dst, dstchars);
		if (!nwide)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}

	dst[nwide] = 0;
}

static int
strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
						 ssize_t len2, pg_locale_t locale)
{
	char		sbuf[TEXTBUFLEN];
	char	   *buf = sbuf;
	LPWSTR		wide1;
	LPWSTR		wide2;
	int			size1;
	int			size2;
	int			cmp;

	Assert(locale->provider == COLLPROVIDER_LIBC);
	Assert(GetDatabaseEncoding() == PG_UTF8);

	if (len1 == -1)
		len1 = strlen(arg1);
	if (len2 == -1)
		len2 = strlen(arg2);

	/* byte sizes: two bytes per input byte, plus a wide terminator */
	size1 = len1 * 2 + 2;
	size2 = len2 * 2 + 2;

	if (size1 + size2 > TEXTBUFLEN)
		buf = palloc(size1 + size2);

	wide1 = (LPWSTR) buf;
	wide2 = (LPWSTR) (buf + size1);

	win32_utf8_to_wide(arg1, len1, wide1, size1 / 2);
	win32_utf8_to_wide(arg2, len2, wide2, size2 / 2);

	errno = 0;
	cmp = wcscoll_l(wide1, wide2, locale->info.lt);
	if (cmp == 2147483647)		/* _NLSCMPERROR; missing from mingw headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (buf != sbuf)
		pfree(buf);

	return cmp;
}
#endif							/* WIN32 */
801 :
802 : /* simple subroutine for reporting errors from newlocale() */
803 : void
804 0 : report_newlocale_failure(const char *localename)
805 : {
806 : int save_errno;
807 :
808 : /*
809 : * Windows doesn't provide any useful error indication from
810 : * _create_locale(), and BSD-derived platforms don't seem to feel they
811 : * need to set errno either (even though POSIX is pretty clear that
812 : * newlocale should do so). So, if errno hasn't been set, assume ENOENT
813 : * is what to report.
814 : */
815 0 : if (errno == 0)
816 0 : errno = ENOENT;
817 :
818 : /*
819 : * ENOENT means "no such locale", not "no such file", so clarify that
820 : * errno with an errdetail message.
821 : */
822 0 : save_errno = errno; /* auxiliary funcs might change errno */
823 0 : ereport(ERROR,
824 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
825 : errmsg("could not create locale \"%s\": %m",
826 : localename),
827 : (save_errno == ENOENT ?
828 : errdetail("The operating system could not find any locale data for the locale name \"%s\".",
829 : localename) : 0)));
830 : }
831 :
832 : /*
833 : * POSIX doesn't define _l-variants of these functions, but several systems
834 : * have them. We provide our own replacements here.
835 : */
836 : #ifndef HAVE_MBSTOWCS_L
837 : static size_t
838 1140032 : mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
839 : {
840 : #ifdef WIN32
841 : return _mbstowcs_l(dest, src, n, loc);
842 : #else
843 : size_t result;
844 1140032 : locale_t save_locale = uselocale(loc);
845 :
846 1140032 : result = mbstowcs(dest, src, n);
847 1140032 : uselocale(save_locale);
848 1140032 : return result;
849 : #endif
850 : }
851 : #endif
852 : #ifndef HAVE_WCSTOMBS_L
853 : static size_t
854 1140032 : wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
855 : {
856 : #ifdef WIN32
857 : return _wcstombs_l(dest, src, n, loc);
858 : #else
859 : size_t result;
860 1140032 : locale_t save_locale = uselocale(loc);
861 :
862 1140032 : result = wcstombs(dest, src, n);
863 1140032 : uselocale(save_locale);
864 1140032 : return result;
865 : #endif
866 : }
867 : #endif
868 :
869 : /*
870 : * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
871 : * Therefore we keep them here rather than with the mbutils code.
872 : */
873 :
874 : /*
875 : * wchar2char --- convert wide characters to multibyte format
876 : *
877 : * This has the same API as the standard wcstombs_l() function; in particular,
878 : * tolen is the maximum number of bytes to store at *to, and *from must be
879 : * zero-terminated. The output will be zero-terminated iff there is room.
880 : */
881 : size_t
882 1140032 : wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
883 : {
884 : size_t result;
885 :
886 1140032 : if (tolen == 0)
887 0 : return 0;
888 :
889 : #ifdef WIN32
890 :
891 : /*
892 : * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
893 : * for some reason mbstowcs and wcstombs won't do this for us, so we use
894 : * MultiByteToWideChar().
895 : */
896 : if (GetDatabaseEncoding() == PG_UTF8)
897 : {
898 : result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
899 : NULL, NULL);
900 : /* A zero return is failure */
901 : if (result <= 0)
902 : result = -1;
903 : else
904 : {
905 : Assert(result <= tolen);
906 : /* Microsoft counts the zero terminator in the result */
907 : result--;
908 : }
909 : }
910 : else
911 : #endif /* WIN32 */
912 1140032 : if (locale == (pg_locale_t) 0)
913 : {
914 : /* Use wcstombs directly for the default locale */
915 0 : result = wcstombs(to, from, tolen);
916 : }
917 : else
918 : {
919 : /* Use wcstombs_l for nondefault locales */
920 1140032 : result = wcstombs_l(to, from, tolen, locale->info.lt);
921 : }
922 :
923 1140032 : return result;
924 : }
925 :
926 : /*
927 : * char2wchar --- convert multibyte characters to wide characters
928 : *
929 : * This has almost the API of mbstowcs_l(), except that *from need not be
930 : * null-terminated; instead, the number of input bytes is specified as
931 : * fromlen. Also, we ereport() rather than returning -1 for invalid
932 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
933 : * The output will be zero-terminated iff there is room.
934 : */
935 : size_t
936 1143204 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
937 : pg_locale_t locale)
938 : {
939 : size_t result;
940 :
941 1143204 : if (tolen == 0)
942 0 : return 0;
943 :
944 : #ifdef WIN32
945 : /* See WIN32 "Unicode" comment above */
946 : if (GetDatabaseEncoding() == PG_UTF8)
947 : {
948 : /* Win32 API does not work for zero-length input */
949 : if (fromlen == 0)
950 : result = 0;
951 : else
952 : {
953 : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
954 : /* A zero return is failure */
955 : if (result == 0)
956 : result = -1;
957 : }
958 :
959 : if (result != -1)
960 : {
961 : Assert(result < tolen);
962 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
963 : to[result] = 0;
964 : }
965 : }
966 : else
967 : #endif /* WIN32 */
968 : {
969 : /* mbstowcs requires ending '\0' */
970 1143204 : char *str = pnstrdup(from, fromlen);
971 :
972 1143204 : if (locale == (pg_locale_t) 0)
973 : {
974 : /* Use mbstowcs directly for the default locale */
975 3172 : result = mbstowcs(to, str, tolen);
976 : }
977 : else
978 : {
979 : /* Use mbstowcs_l for nondefault locales */
980 1140032 : result = mbstowcs_l(to, str, tolen, locale->info.lt);
981 : }
982 :
983 1143204 : pfree(str);
984 : }
985 :
986 1143204 : if (result == -1)
987 : {
988 : /*
989 : * Invalid multibyte character encountered. We try to give a useful
990 : * error message by letting pg_verifymbstr check the string. But it's
991 : * possible that the string is OK to us, and not OK to mbstowcs ---
992 : * this suggests that the LC_CTYPE locale is different from the
993 : * database encoding. Give a generic error message if pg_verifymbstr
994 : * can't find anything wrong.
995 : */
996 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
997 : /* but if it does ... */
998 0 : ereport(ERROR,
999 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1000 : errmsg("invalid multibyte character for locale"),
1001 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1002 : }
1003 :
1004 1143204 : return result;
1005 : }
|