Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for libc
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_libc.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include <limits.h>
15 : #include <wctype.h>
16 :
17 : #include "access/htup_details.h"
18 : #include "catalog/pg_database.h"
19 : #include "catalog/pg_collation.h"
20 : #include "mb/pg_wchar.h"
21 : #include "miscadmin.h"
22 : #include "utils/builtins.h"
23 : #include "utils/formatting.h"
24 : #include "utils/memutils.h"
25 : #include "utils/pg_locale.h"
26 : #include "utils/syscache.h"
27 :
28 : #ifdef __GLIBC__
29 : #include <gnu/libc-version.h>
30 : #endif
31 :
32 : #ifdef WIN32
33 : #include <shlwapi.h>
34 : #endif
35 :
36 : /*
37 : * For the libc provider, to provide as much functionality as possible on a
38 : * variety of platforms without going so far as to implement everything from
39 : * scratch, we use several implementation strategies depending on the
40 : * situation:
41 : *
42 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 : * collations don't give a fig about multibyte characters.
45 : *
46 : * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 : * This assumes that every platform uses Unicode codepoints directly
48 : * as the wchar_t representation of Unicode. On some platforms
49 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
50 : *
51 : * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
52 : * values up to 255, and punt for values above that. This is 100% correct
53 : * only in single-byte encodings such as LATINn. However, non-Unicode
54 : * multibyte encodings are mostly Far Eastern character sets for which the
55 : * properties being tested here aren't very relevant for higher code values
56 : * anyway. The difficulty with using the <wctype.h> functions with
57 : * non-Unicode multibyte encodings is that we can have no certainty that
58 : * the platform's wchar_t representation matches what we do in pg_wchar
59 : * conversions.
60 : *
61 : * As a special case, in the "default" collation, (2) and (3) force ASCII
62 : * letters to follow ASCII upcase/downcase rules, while in a non-default
63 : * collation we just let the library functions do what they will. The case
64 : * where this matters is treatment of I/i in Turkish, and the behavior is
65 : * meant to match the upper()/lower() SQL functions.
66 : *
67 : * We store the active collation setting in static variables. In principle
68 : * it could be passed down to here via the regex library's "struct vars" data
69 : * structure; but that would require somewhat invasive changes in the regex
70 : * library, and right now there's no real benefit to be gained from that.
71 : *
72 : * NB: the coding here assumes pg_wchar is an unsigned type.
73 : */
74 :
75 : /*
76 : * Size of stack buffer to use for string transformations, used to avoid heap
77 : * allocations in typical cases. This should be large enough that most strings
78 : * will fit, but small enough that we feel comfortable putting it on the
79 : * stack.
80 : */
81 : #define TEXTBUFLEN 1024
82 :
83 : extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
84 :
85 : static int strncoll_libc(const char *arg1, ssize_t len1,
86 : const char *arg2, ssize_t len2,
87 : pg_locale_t locale);
88 : static size_t strnxfrm_libc(char *dest, size_t destsize,
89 : const char *src, ssize_t srclen,
90 : pg_locale_t locale);
91 : extern char *get_collation_actual_version_libc(const char *collcollate);
92 : static locale_t make_libc_collator(const char *collate,
93 : const char *ctype);
94 :
95 : #ifdef WIN32
96 : static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
97 : const char *arg2, ssize_t len2,
98 : pg_locale_t locale);
99 : #endif
100 :
101 : static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
102 : size_t fromlen, locale_t loc);
103 :
104 : static size_t strlower_libc_sb(char *dest, size_t destsize,
105 : const char *src, ssize_t srclen,
106 : pg_locale_t locale);
107 : static size_t strlower_libc_mb(char *dest, size_t destsize,
108 : const char *src, ssize_t srclen,
109 : pg_locale_t locale);
110 : static size_t strtitle_libc_sb(char *dest, size_t destsize,
111 : const char *src, ssize_t srclen,
112 : pg_locale_t locale);
113 : static size_t strtitle_libc_mb(char *dest, size_t destsize,
114 : const char *src, ssize_t srclen,
115 : pg_locale_t locale);
116 : static size_t strupper_libc_sb(char *dest, size_t destsize,
117 : const char *src, ssize_t srclen,
118 : pg_locale_t locale);
119 : static size_t strupper_libc_mb(char *dest, size_t destsize,
120 : const char *src, ssize_t srclen,
121 : pg_locale_t locale);
122 :
123 : static bool
124 0 : wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
125 : {
126 0 : return isdigit_l((unsigned char) wc, locale->lt);
127 : }
128 :
129 : static bool
130 0 : wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
131 : {
132 0 : return isalpha_l((unsigned char) wc, locale->lt);
133 : }
134 :
135 : static bool
136 0 : wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
137 : {
138 0 : return isalnum_l((unsigned char) wc, locale->lt);
139 : }
140 :
141 : static bool
142 0 : wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
143 : {
144 0 : return isupper_l((unsigned char) wc, locale->lt);
145 : }
146 :
147 : static bool
148 0 : wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
149 : {
150 0 : return islower_l((unsigned char) wc, locale->lt);
151 : }
152 :
153 : static bool
154 0 : wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
155 : {
156 0 : return isgraph_l((unsigned char) wc, locale->lt);
157 : }
158 :
159 : static bool
160 0 : wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
161 : {
162 0 : return isprint_l((unsigned char) wc, locale->lt);
163 : }
164 :
165 : static bool
166 0 : wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
167 : {
168 0 : return ispunct_l((unsigned char) wc, locale->lt);
169 : }
170 :
171 : static bool
172 0 : wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
173 : {
174 0 : return isspace_l((unsigned char) wc, locale->lt);
175 : }
176 :
177 : static bool
178 0 : wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
179 : {
180 : #ifndef WIN32
181 0 : return isxdigit_l((unsigned char) wc, locale->lt);
182 : #else
183 : return _isxdigit_l((unsigned char) wc, locale->lt);
184 : #endif
185 : }
186 :
187 : static bool
188 0 : wc_iscased_libc_sb(pg_wchar wc, pg_locale_t locale)
189 : {
190 0 : return isupper_l((unsigned char) wc, locale->lt) ||
191 0 : islower_l((unsigned char) wc, locale->lt);
192 : }
193 :
194 : static bool
195 131608 : wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
196 : {
197 131608 : return iswdigit_l((wint_t) wc, locale->lt);
198 : }
199 :
200 : static bool
201 81148 : wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
202 : {
203 81148 : return iswalpha_l((wint_t) wc, locale->lt);
204 : }
205 :
206 : static bool
207 2845684 : wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
208 : {
209 2845684 : return iswalnum_l((wint_t) wc, locale->lt);
210 : }
211 :
212 : static bool
213 4112 : wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
214 : {
215 4112 : return iswupper_l((wint_t) wc, locale->lt);
216 : }
217 :
218 : static bool
219 4102 : wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
220 : {
221 4102 : return iswlower_l((wint_t) wc, locale->lt);
222 : }
223 :
224 : static bool
225 4102 : wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
226 : {
227 4102 : return iswgraph_l((wint_t) wc, locale->lt);
228 : }
229 :
230 : static bool
231 4102 : wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
232 : {
233 4102 : return iswprint_l((wint_t) wc, locale->lt);
234 : }
235 :
236 : static bool
237 4102 : wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
238 : {
239 4102 : return iswpunct_l((wint_t) wc, locale->lt);
240 : }
241 :
242 : static bool
243 48152 : wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
244 : {
245 48152 : return iswspace_l((wint_t) wc, locale->lt);
246 : }
247 :
248 : static bool
249 12 : wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
250 : {
251 : #ifndef WIN32
252 12 : return iswxdigit_l((wint_t) wc, locale->lt);
253 : #else
254 : return _iswxdigit_l((wint_t) wc, locale->lt);
255 : #endif
256 : }
257 :
258 : static bool
259 0 : wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale)
260 : {
261 0 : return iswupper_l((wint_t) wc, locale->lt) ||
262 0 : iswlower_l((wint_t) wc, locale->lt);
263 : }
264 :
265 : static pg_wchar
266 0 : toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
267 : {
268 : Assert(GetDatabaseEncoding() != PG_UTF8);
269 :
270 : /* force C behavior for ASCII characters, per comments above */
271 0 : if (locale->is_default && wc <= (pg_wchar) 127)
272 0 : return pg_ascii_toupper((unsigned char) wc);
273 0 : if (wc <= (pg_wchar) UCHAR_MAX)
274 0 : return toupper_l((unsigned char) wc, locale->lt);
275 : else
276 0 : return wc;
277 : }
278 :
279 : static pg_wchar
280 9088 : toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
281 : {
282 : Assert(GetDatabaseEncoding() == PG_UTF8);
283 :
284 : /* force C behavior for ASCII characters, per comments above */
285 9088 : if (locale->is_default && wc <= (pg_wchar) 127)
286 892 : return pg_ascii_toupper((unsigned char) wc);
287 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
288 8196 : return towupper_l((wint_t) wc, locale->lt);
289 : else
290 : return wc;
291 : }
292 :
293 : static pg_wchar
294 0 : tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
295 : {
296 : Assert(GetDatabaseEncoding() != PG_UTF8);
297 :
298 : /* force C behavior for ASCII characters, per comments above */
299 0 : if (locale->is_default && wc <= (pg_wchar) 127)
300 0 : return pg_ascii_tolower((unsigned char) wc);
301 0 : if (wc <= (pg_wchar) UCHAR_MAX)
302 0 : return tolower_l((unsigned char) wc, locale->lt);
303 : else
304 0 : return wc;
305 : }
306 :
307 : static pg_wchar
308 9092 : tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
309 : {
310 : Assert(GetDatabaseEncoding() == PG_UTF8);
311 :
312 : /* force C behavior for ASCII characters, per comments above */
313 9092 : if (locale->is_default && wc <= (pg_wchar) 127)
314 896 : return pg_ascii_tolower((unsigned char) wc);
315 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
316 8196 : return towlower_l((wint_t) wc, locale->lt);
317 : else
318 : return wc;
319 : }
320 :
321 : /*
322 : * Characters A..Z always downcase to a..z, even in the Turkish
323 : * locale. Characters beyond 127 use tolower().
324 : */
325 : static size_t
326 25040 : downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src,
327 : ssize_t srclen, pg_locale_t locale)
328 : {
329 25040 : locale_t loc = locale->lt;
330 : int i;
331 :
332 244536 : for (i = 0; i < srclen && i < dstsize; i++)
333 : {
334 219496 : unsigned char ch = (unsigned char) src[i];
335 :
336 219496 : if (ch >= 'A' && ch <= 'Z')
337 13592 : ch = pg_ascii_tolower(ch);
338 205904 : else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc))
339 0 : ch = tolower_l(ch, loc);
340 219496 : dst[i] = (char) ch;
341 : }
342 :
343 25040 : if (i < dstsize)
344 25040 : dst[i] = '\0';
345 :
346 25040 : return srclen;
347 : }
348 :
349 : static const struct ctype_methods ctype_methods_libc_sb = {
350 : .strlower = strlower_libc_sb,
351 : .strtitle = strtitle_libc_sb,
352 : .strupper = strupper_libc_sb,
353 : /* in libc, casefolding is the same as lowercasing */
354 : .strfold = strlower_libc_sb,
355 : .downcase_ident = downcase_ident_libc_sb,
356 : .wc_isdigit = wc_isdigit_libc_sb,
357 : .wc_isalpha = wc_isalpha_libc_sb,
358 : .wc_isalnum = wc_isalnum_libc_sb,
359 : .wc_isupper = wc_isupper_libc_sb,
360 : .wc_islower = wc_islower_libc_sb,
361 : .wc_isgraph = wc_isgraph_libc_sb,
362 : .wc_isprint = wc_isprint_libc_sb,
363 : .wc_ispunct = wc_ispunct_libc_sb,
364 : .wc_isspace = wc_isspace_libc_sb,
365 : .wc_isxdigit = wc_isxdigit_libc_sb,
366 : .wc_iscased = wc_iscased_libc_sb,
367 : .wc_toupper = toupper_libc_sb,
368 : .wc_tolower = tolower_libc_sb,
369 : };
370 :
371 : /*
372 : * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
373 : * single-byte semantics for pattern matching.
374 : */
375 : static const struct ctype_methods ctype_methods_libc_other_mb = {
376 : .strlower = strlower_libc_mb,
377 : .strtitle = strtitle_libc_mb,
378 : .strupper = strupper_libc_mb,
379 : /* in libc, casefolding is the same as lowercasing */
380 : .strfold = strlower_libc_mb,
381 : /* uses plain ASCII semantics for historical reasons */
382 : .downcase_ident = NULL,
383 : .wc_isdigit = wc_isdigit_libc_sb,
384 : .wc_isalpha = wc_isalpha_libc_sb,
385 : .wc_isalnum = wc_isalnum_libc_sb,
386 : .wc_isupper = wc_isupper_libc_sb,
387 : .wc_islower = wc_islower_libc_sb,
388 : .wc_isgraph = wc_isgraph_libc_sb,
389 : .wc_isprint = wc_isprint_libc_sb,
390 : .wc_ispunct = wc_ispunct_libc_sb,
391 : .wc_isspace = wc_isspace_libc_sb,
392 : .wc_isxdigit = wc_isxdigit_libc_sb,
393 : .wc_iscased = wc_iscased_libc_sb,
394 : .wc_toupper = toupper_libc_sb,
395 : .wc_tolower = tolower_libc_sb,
396 : };
397 :
398 : static const struct ctype_methods ctype_methods_libc_utf8 = {
399 : .strlower = strlower_libc_mb,
400 : .strtitle = strtitle_libc_mb,
401 : .strupper = strupper_libc_mb,
402 : /* in libc, casefolding is the same as lowercasing */
403 : .strfold = strlower_libc_mb,
404 : /* uses plain ASCII semantics for historical reasons */
405 : .downcase_ident = NULL,
406 : .wc_isdigit = wc_isdigit_libc_mb,
407 : .wc_isalpha = wc_isalpha_libc_mb,
408 : .wc_isalnum = wc_isalnum_libc_mb,
409 : .wc_isupper = wc_isupper_libc_mb,
410 : .wc_islower = wc_islower_libc_mb,
411 : .wc_isgraph = wc_isgraph_libc_mb,
412 : .wc_isprint = wc_isprint_libc_mb,
413 : .wc_ispunct = wc_ispunct_libc_mb,
414 : .wc_isspace = wc_isspace_libc_mb,
415 : .wc_isxdigit = wc_isxdigit_libc_mb,
416 : .wc_iscased = wc_iscased_libc_mb,
417 : .wc_toupper = toupper_libc_mb,
418 : .wc_tolower = tolower_libc_mb,
419 : };
420 :
421 : static const struct collate_methods collate_methods_libc = {
422 : .strncoll = strncoll_libc,
423 : .strnxfrm = strnxfrm_libc,
424 : .strnxfrm_prefix = NULL,
425 :
426 : /*
427 : * Unfortunately, it seems that strxfrm() for non-C collations is broken
428 : * on many common platforms; testing of multiple versions of glibc reveals
429 : * that, for many locales, strcoll() and strxfrm() do not return
430 : * consistent results. While no other libc other than Cygwin has so far
431 : * been shown to have a problem, we take the conservative course of action
432 : * for right now and disable this categorically. (Users who are certain
433 : * this isn't a problem on their system can define TRUST_STRXFRM.)
434 : */
435 : #ifdef TRUST_STRXFRM
436 : .strxfrm_is_safe = true,
437 : #else
438 : .strxfrm_is_safe = false,
439 : #endif
440 : };
441 :
#ifdef WIN32
/*
 * Collation method table for UTF8 databases on Windows.  It differs from
 * collate_methods_libc only in the comparison function; strxfrm() is
 * trusted only when TRUST_STRXFRM is defined.
 */
static const struct collate_methods collate_methods_libc_win32_utf8 = {
#ifdef TRUST_STRXFRM
	.strxfrm_is_safe = true,
#else
	.strxfrm_is_safe = false,
#endif

	.strncoll = strncoll_libc_win32_utf8,
	.strnxfrm = strnxfrm_libc,
	.strnxfrm_prefix = NULL,
};
#endif
454 :
455 : static size_t
456 0 : strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
457 : pg_locale_t locale)
458 : {
459 0 : if (srclen < 0)
460 0 : srclen = strlen(src);
461 :
462 0 : if (srclen + 1 <= destsize)
463 : {
464 0 : locale_t loc = locale->lt;
465 : char *p;
466 :
467 0 : memcpy(dest, src, srclen);
468 0 : dest[srclen] = '\0';
469 :
470 : /*
471 : * Note: we assume that tolower_l() will not be so broken as to need
472 : * an isupper_l() guard test. When using the default collation, we
473 : * apply the traditional Postgres behavior that forces ASCII-style
474 : * treatment of I/i, but in non-default collations you get exactly
475 : * what the collation says.
476 : */
477 0 : for (p = dest; *p; p++)
478 : {
479 0 : if (locale->is_default)
480 : {
481 0 : if (*p >= 'A' && *p <= 'Z')
482 0 : *p += 'a' - 'A';
483 0 : else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
484 0 : *p = tolower_l((unsigned char) *p, loc);
485 : }
486 : else
487 0 : *p = tolower_l((unsigned char) *p, loc);
488 : }
489 : }
490 :
491 0 : return srclen;
492 : }
493 :
494 : static size_t
495 866204 : strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
496 : pg_locale_t locale)
497 : {
498 866204 : locale_t loc = locale->lt;
499 : size_t result_size;
500 : wchar_t *workspace;
501 : char *result;
502 : size_t curr_char;
503 : size_t max_size;
504 :
505 866204 : if (srclen < 0)
506 0 : srclen = strlen(src);
507 :
508 : /* Overflow paranoia */
509 866204 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
510 0 : ereport(ERROR,
511 : (errcode(ERRCODE_OUT_OF_MEMORY),
512 : errmsg("out of memory")));
513 :
514 : /* Output workspace cannot have more codes than input bytes */
515 866204 : workspace = palloc_array(wchar_t, srclen + 1);
516 :
517 866204 : char2wchar(workspace, srclen + 1, src, srclen, loc);
518 :
519 4552202 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
520 3685998 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
521 :
522 : /*
523 : * Make result large enough; case change might change number of bytes
524 : */
525 866204 : max_size = curr_char * pg_database_encoding_max_length();
526 866204 : result = palloc(max_size + 1);
527 :
528 866204 : result_size = wchar2char(result, workspace, max_size + 1, loc);
529 :
530 866204 : if (result_size + 1 > destsize)
531 0 : return result_size;
532 :
533 866204 : memcpy(dest, result, result_size);
534 866204 : dest[result_size] = '\0';
535 :
536 866204 : pfree(workspace);
537 866204 : pfree(result);
538 :
539 866204 : return result_size;
540 : }
541 :
542 : static size_t
543 0 : strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
544 : pg_locale_t locale)
545 : {
546 0 : if (srclen < 0)
547 0 : srclen = strlen(src);
548 :
549 0 : if (srclen + 1 <= destsize)
550 : {
551 0 : locale_t loc = locale->lt;
552 0 : int wasalnum = false;
553 : char *p;
554 :
555 0 : memcpy(dest, src, srclen);
556 0 : dest[srclen] = '\0';
557 :
558 : /*
559 : * Note: we assume that toupper_l()/tolower_l() will not be so broken
560 : * as to need guard tests. When using the default collation, we apply
561 : * the traditional Postgres behavior that forces ASCII-style treatment
562 : * of I/i, but in non-default collations you get exactly what the
563 : * collation says.
564 : */
565 0 : for (p = dest; *p; p++)
566 : {
567 0 : if (locale->is_default)
568 : {
569 0 : if (wasalnum)
570 : {
571 0 : if (*p >= 'A' && *p <= 'Z')
572 0 : *p += 'a' - 'A';
573 0 : else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
574 0 : *p = tolower_l((unsigned char) *p, loc);
575 : }
576 : else
577 : {
578 0 : if (*p >= 'a' && *p <= 'z')
579 0 : *p -= 'a' - 'A';
580 0 : else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
581 0 : *p = toupper_l((unsigned char) *p, loc);
582 : }
583 : }
584 : else
585 : {
586 0 : if (wasalnum)
587 0 : *p = tolower_l((unsigned char) *p, loc);
588 : else
589 0 : *p = toupper_l((unsigned char) *p, loc);
590 : }
591 0 : wasalnum = isalnum_l((unsigned char) *p, loc);
592 : }
593 : }
594 :
595 0 : return srclen;
596 : }
597 :
598 : static size_t
599 8 : strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
600 : pg_locale_t locale)
601 : {
602 8 : locale_t loc = locale->lt;
603 8 : int wasalnum = false;
604 : size_t result_size;
605 : wchar_t *workspace;
606 : char *result;
607 : size_t curr_char;
608 : size_t max_size;
609 :
610 8 : if (srclen < 0)
611 0 : srclen = strlen(src);
612 :
613 : /* Overflow paranoia */
614 8 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
615 0 : ereport(ERROR,
616 : (errcode(ERRCODE_OUT_OF_MEMORY),
617 : errmsg("out of memory")));
618 :
619 : /* Output workspace cannot have more codes than input bytes */
620 8 : workspace = palloc_array(wchar_t, srclen + 1);
621 :
622 8 : char2wchar(workspace, srclen + 1, src, srclen, loc);
623 :
624 80 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
625 : {
626 72 : if (wasalnum)
627 56 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
628 : else
629 16 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
630 72 : wasalnum = iswalnum_l(workspace[curr_char], loc);
631 : }
632 :
633 : /*
634 : * Make result large enough; case change might change number of bytes
635 : */
636 8 : max_size = curr_char * pg_database_encoding_max_length();
637 8 : result = palloc(max_size + 1);
638 :
639 8 : result_size = wchar2char(result, workspace, max_size + 1, loc);
640 :
641 8 : if (result_size + 1 > destsize)
642 0 : return result_size;
643 :
644 8 : memcpy(dest, result, result_size);
645 8 : dest[result_size] = '\0';
646 :
647 8 : pfree(workspace);
648 8 : pfree(result);
649 :
650 8 : return result_size;
651 : }
652 :
653 : static size_t
654 0 : strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
655 : pg_locale_t locale)
656 : {
657 0 : if (srclen < 0)
658 0 : srclen = strlen(src);
659 :
660 0 : if (srclen + 1 <= destsize)
661 : {
662 0 : locale_t loc = locale->lt;
663 : char *p;
664 :
665 0 : memcpy(dest, src, srclen);
666 0 : dest[srclen] = '\0';
667 :
668 : /*
669 : * Note: we assume that toupper_l() will not be so broken as to need
670 : * an islower_l() guard test. When using the default collation, we
671 : * apply the traditional Postgres behavior that forces ASCII-style
672 : * treatment of I/i, but in non-default collations you get exactly
673 : * what the collation says.
674 : */
675 0 : for (p = dest; *p; p++)
676 : {
677 0 : if (locale->is_default)
678 : {
679 0 : if (*p >= 'a' && *p <= 'z')
680 0 : *p -= 'a' - 'A';
681 0 : else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
682 0 : *p = toupper_l((unsigned char) *p, loc);
683 : }
684 : else
685 0 : *p = toupper_l((unsigned char) *p, loc);
686 : }
687 : }
688 :
689 0 : return srclen;
690 : }
691 :
692 : static size_t
693 721134 : strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
694 : pg_locale_t locale)
695 : {
696 721134 : locale_t loc = locale->lt;
697 : size_t result_size;
698 : wchar_t *workspace;
699 : char *result;
700 : size_t curr_char;
701 : size_t max_size;
702 :
703 721134 : if (srclen < 0)
704 0 : srclen = strlen(src);
705 :
706 : /* Overflow paranoia */
707 721134 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
708 0 : ereport(ERROR,
709 : (errcode(ERRCODE_OUT_OF_MEMORY),
710 : errmsg("out of memory")));
711 :
712 : /* Output workspace cannot have more codes than input bytes */
713 721134 : workspace = palloc_array(wchar_t, srclen + 1);
714 :
715 721134 : char2wchar(workspace, srclen + 1, src, srclen, loc);
716 :
717 2380512 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
718 1659378 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
719 :
720 : /*
721 : * Make result large enough; case change might change number of bytes
722 : */
723 721134 : max_size = curr_char * pg_database_encoding_max_length();
724 721134 : result = palloc(max_size + 1);
725 :
726 721134 : result_size = wchar2char(result, workspace, max_size + 1, loc);
727 :
728 721134 : if (result_size + 1 > destsize)
729 0 : return result_size;
730 :
731 721134 : memcpy(dest, result, result_size);
732 721134 : dest[result_size] = '\0';
733 :
734 721134 : pfree(workspace);
735 721134 : pfree(result);
736 :
737 721134 : return result_size;
738 : }
739 :
740 : pg_locale_t
741 31360 : create_pg_locale_libc(Oid collid, MemoryContext context)
742 : {
743 : const char *collate;
744 : const char *ctype;
745 : locale_t loc;
746 : pg_locale_t result;
747 :
748 31360 : if (collid == DEFAULT_COLLATION_OID)
749 : {
750 : HeapTuple tp;
751 : Datum datum;
752 :
753 31270 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
754 31270 : if (!HeapTupleIsValid(tp))
755 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
756 31270 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
757 : Anum_pg_database_datcollate);
758 31270 : collate = TextDatumGetCString(datum);
759 31270 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
760 : Anum_pg_database_datctype);
761 31270 : ctype = TextDatumGetCString(datum);
762 :
763 31270 : ReleaseSysCache(tp);
764 : }
765 : else
766 : {
767 : HeapTuple tp;
768 : Datum datum;
769 :
770 90 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
771 90 : if (!HeapTupleIsValid(tp))
772 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
773 :
774 90 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
775 : Anum_pg_collation_collcollate);
776 90 : collate = TextDatumGetCString(datum);
777 90 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
778 : Anum_pg_collation_collctype);
779 90 : ctype = TextDatumGetCString(datum);
780 :
781 90 : ReleaseSysCache(tp);
782 : }
783 :
784 :
785 31360 : loc = make_libc_collator(collate, ctype);
786 :
787 31360 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
788 31360 : result->deterministic = true;
789 61588 : result->collate_is_c = (strcmp(collate, "C") == 0) ||
790 30228 : (strcmp(collate, "POSIX") == 0);
791 61588 : result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
792 30228 : (strcmp(ctype, "POSIX") == 0);
793 31360 : result->lt = loc;
794 31360 : if (!result->collate_is_c)
795 : {
796 : #ifdef WIN32
797 : if (GetDatabaseEncoding() == PG_UTF8)
798 : result->collate = &collate_methods_libc_win32_utf8;
799 : else
800 : #endif
801 30164 : result->collate = &collate_methods_libc;
802 : }
803 31360 : if (!result->ctype_is_c)
804 : {
805 30164 : if (GetDatabaseEncoding() == PG_UTF8)
806 30100 : result->ctype = &ctype_methods_libc_utf8;
807 64 : else if (pg_database_encoding_max_length() > 1)
808 0 : result->ctype = &ctype_methods_libc_other_mb;
809 : else
810 64 : result->ctype = &ctype_methods_libc_sb;
811 : }
812 :
813 31360 : return result;
814 : }
815 :
816 : /*
817 : * Create a locale_t with the given collation and ctype.
818 : *
819 : * The "C" and "POSIX" locales are not actually handled by libc, so return
820 : * NULL.
821 : *
822 : * Ensure that no path leaks a locale_t.
823 : */
824 : static locale_t
825 31360 : make_libc_collator(const char *collate, const char *ctype)
826 : {
827 31360 : locale_t loc = 0;
828 :
829 31360 : if (strcmp(collate, ctype) == 0)
830 : {
831 31360 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
832 : {
833 : /* Normal case where they're the same */
834 30164 : errno = 0;
835 : #ifndef WIN32
836 30164 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
837 : NULL);
838 : #else
839 : loc = _create_locale(LC_ALL, collate);
840 : #endif
841 30164 : if (!loc)
842 0 : report_newlocale_failure(collate);
843 : }
844 : }
845 : else
846 : {
847 : #ifndef WIN32
848 : /* We need two newlocale() steps */
849 0 : locale_t loc1 = 0;
850 :
851 0 : if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
852 : {
853 0 : errno = 0;
854 0 : loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
855 0 : if (!loc1)
856 0 : report_newlocale_failure(collate);
857 : }
858 :
859 0 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
860 : {
861 0 : errno = 0;
862 0 : loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
863 0 : if (!loc)
864 : {
865 0 : if (loc1)
866 0 : freelocale(loc1);
867 0 : report_newlocale_failure(ctype);
868 : }
869 : }
870 : else
871 0 : loc = loc1;
872 : #else
873 :
874 : /*
875 : * XXX The _create_locale() API doesn't appear to support this. Could
876 : * perhaps be worked around by changing pg_locale_t to contain two
877 : * separate fields.
878 : */
879 : ereport(ERROR,
880 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
881 : errmsg("collations with different collate and ctype values are not supported on this platform")));
882 : #endif
883 : }
884 :
885 31360 : return loc;
886 : }
887 :
888 : /*
889 : * strncoll_libc
890 : *
891 : * NUL-terminate arguments, if necessary, and pass to strcoll_l().
892 : *
893 : * An input string length of -1 means that it's already NUL-terminated.
894 : */
895 : int
896 29627258 : strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
897 : pg_locale_t locale)
898 : {
899 : char sbuf[TEXTBUFLEN];
900 29627258 : char *buf = sbuf;
901 29627258 : size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
902 29627258 : size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
903 : const char *arg1n;
904 : const char *arg2n;
905 : int result;
906 :
907 29627258 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
908 568 : buf = palloc(bufsize1 + bufsize2);
909 :
910 : /* nul-terminate arguments if necessary */
911 29627258 : if (len1 == -1)
912 : {
913 25198370 : arg1n = arg1;
914 : }
915 : else
916 : {
917 4428888 : char *buf1 = buf;
918 :
919 4428888 : memcpy(buf1, arg1, len1);
920 4428888 : buf1[len1] = '\0';
921 4428888 : arg1n = buf1;
922 : }
923 :
924 29627258 : if (len2 == -1)
925 : {
926 25198370 : arg2n = arg2;
927 : }
928 : else
929 : {
930 4428888 : char *buf2 = buf + bufsize1;
931 :
932 4428888 : memcpy(buf2, arg2, len2);
933 4428888 : buf2[len2] = '\0';
934 4428888 : arg2n = buf2;
935 : }
936 :
937 29627258 : result = strcoll_l(arg1n, arg2n, locale->lt);
938 :
939 29627258 : if (buf != sbuf)
940 568 : pfree(buf);
941 :
942 29627258 : return result;
943 : }
944 :
945 : /*
946 : * strnxfrm_libc
947 : *
948 : * NUL-terminate src, if necessary, and pass to strxfrm_l().
949 : *
950 : * A source length of -1 means that it's already NUL-terminated.
951 : */
952 : size_t
953 144 : strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
954 : pg_locale_t locale)
955 : {
956 : char sbuf[TEXTBUFLEN];
957 144 : char *buf = sbuf;
958 144 : size_t bufsize = srclen + 1;
959 : size_t result;
960 :
961 144 : if (srclen == -1)
962 144 : return strxfrm_l(dest, src, destsize, locale->lt);
963 :
964 0 : if (bufsize > TEXTBUFLEN)
965 0 : buf = palloc(bufsize);
966 :
967 : /* nul-terminate argument */
968 0 : memcpy(buf, src, srclen);
969 0 : buf[srclen] = '\0';
970 :
971 0 : result = strxfrm_l(dest, buf, destsize, locale->lt);
972 :
973 0 : if (buf != sbuf)
974 0 : pfree(buf);
975 :
976 : /* if dest is defined, it should be nul-terminated */
977 : Assert(result >= destsize || dest[result] == '\0');
978 :
979 0 : return result;
980 : }
981 :
/*
 * get_collation_actual_version_libc
 *
 * Return a version string for the libc collation named collcollate, or
 * NULL if none is available.
 *
 * No version is reported for "C", "POSIX", or any name starting with "C."
 * (all compared case-insensitively).  For other names the source depends on
 * the platform:
 *	- glibc: the glibc library version;
 *	- platforms defining LC_VERSION_MASK: querylocale() on a freshly loaded
 *	  locale, with an error if the locale cannot be loaded;
 *	- Windows: GetNLSVersionEx(), formatted as "N.N,N.N";
 *	- anything else: NULL.
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *collversion = NULL;
	bool		is_c_like;

	is_c_like = (pg_strcasecmp("C", collcollate) == 0 ||
				 pg_strncasecmp("C.", collcollate, 2) == 0 ||
				 pg_strcasecmp("POSIX", collcollate) == 0);

	if (!is_c_like)
	{
#if defined(__GLIBC__)
		/* Use the glibc version because we don't have anything better. */
		collversion = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
		locale_t	loc;

		/* Look up FreeBSD collation version. */
		loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
		if (!loc)
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));
		else
		{
			collversion =
				pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
			freelocale(loc);
		}
#elif defined(WIN32)
		/*
		 * If we are targeting Windows Vista and above, we can ask for a name
		 * given a collation name (earlier versions required a location code
		 * that we don't have).
		 */
		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];

		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
							LOCALE_NAME_MAX_LENGTH);
		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
		{
			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until those
			 * values can be prevented from entering the system, or 100%
			 * reliably converted to the more useful tag format, tolerate the
			 * resulting error and report that we have no version data.
			 */
			if (GetLastError() == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							GetLastError())));
		}
		collversion = psprintf("%lu.%lu,%lu.%lu",
							   (version.dwNLSVersion >> 8) & 0xFFFF,
							   version.dwNLSVersion & 0xFF,
							   (version.dwDefinedVersion >> 8) & 0xFFFF,
							   version.dwDefinedVersion & 0xFF);
#endif
	}

	return collversion;
}
1046 :
/*
 * strncoll_libc_win32_utf8
 *
 * Collation comparison for UTF-8 databases on Windows.  Win32 does not have
 * UTF-8, so both arguments are converted to wide characters and then
 * compared with wcscoll_l().
 *
 * A length of -1 for either input means that string is NUL-terminated.
 * Returns the wcscoll_l() result; conversion or comparison failures are
 * reported with ereport(ERROR).
 */
#ifdef WIN32
static int
strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
						 ssize_t len2, pg_locale_t locale)
{
	char		stackbuf[TEXTBUFLEN];
	char	   *workbuf = stackbuf;
	LPWSTR		wide1;
	LPWSTR		wide2;
	int			bytes1;
	int			bytes2;
	int			nwide1 = 0;
	int			nwide2 = 0;
	int			cmp;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	if (len1 == -1)
		len1 = strlen(arg1);
	if (len2 == -1)
		len2 = strlen(arg2);

	/*
	 * Reserve two bytes per input byte, plus two more bytes for the
	 * terminating wide NUL, for each of the two strings.
	 */
	bytes1 = len1 * 2 + 2;
	bytes2 = len2 * 2 + 2;

	/* Fall back to a palloc'd buffer when the stack buffer is too small */
	if (bytes1 + bytes2 > TEXTBUFLEN)
		workbuf = palloc(bytes1 + bytes2);

	wide1 = (LPWSTR) workbuf;
	wide2 = (LPWSTR) (workbuf + bytes1);

	/* The conversion API does not work for zero-length input, so skip it */
	if (len1 != 0)
	{
		nwide1 = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
									 wide1, bytes1 / 2);
		if (!nwide1)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	wide1[nwide1] = 0;

	if (len2 != 0)
	{
		nwide2 = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
									 wide2, bytes2 / 2);
		if (!nwide2)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	wide2[nwide2] = 0;

	/* Clear errno first so that %m below reflects only wcscoll_l() */
	errno = 0;
	cmp = wcscoll_l(wide1, wide2, locale->lt);
	if (cmp == 2147483647)		/* _NLSCMPERROR; missing from mingw headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (workbuf != stackbuf)
		pfree(workbuf);

	return cmp;
}
#endif							/* WIN32 */
1124 :
1125 : /* simple subroutine for reporting errors from newlocale() */
1126 : void
1127 0 : report_newlocale_failure(const char *localename)
1128 : {
1129 : int save_errno;
1130 :
1131 : /*
1132 : * Windows doesn't provide any useful error indication from
1133 : * _create_locale(), and BSD-derived platforms don't seem to feel they
1134 : * need to set errno either (even though POSIX is pretty clear that
1135 : * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1136 : * is what to report.
1137 : */
1138 0 : if (errno == 0)
1139 0 : errno = ENOENT;
1140 :
1141 : /*
1142 : * ENOENT means "no such locale", not "no such file", so clarify that
1143 : * errno with an errdetail message.
1144 : */
1145 0 : save_errno = errno; /* auxiliary funcs might change errno */
1146 0 : ereport(ERROR,
1147 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1148 : errmsg("could not create locale \"%s\": %m",
1149 : localename),
1150 : (save_errno == ENOENT ?
1151 : errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1152 : localename) : 0)));
1153 : }
1154 :
1155 : /*
1156 : * POSIX doesn't define _l-variants of these functions, but several systems
1157 : * have them. We provide our own replacements here.
1158 : */
#ifndef HAVE_MBSTOWCS_L
/*
 * Replacement for mbstowcs_l(): convert the multibyte string src to wide
 * characters in dest (at most n of them) according to locale loc.
 *
 * On Windows this forwards to _mbstowcs_l().  Elsewhere, loc is installed
 * with uselocale() around a plain mbstowcs() call and the previously active
 * locale is put back afterwards.
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _mbstowcs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nconverted;

	prev_locale = uselocale(loc);
	nconverted = mbstowcs(dest, src, n);
	uselocale(prev_locale);

	return nconverted;
#endif
}
#endif
#ifndef HAVE_WCSTOMBS_L
/*
 * Replacement for wcstombs_l(): convert the wide string src to multibyte
 * form in dest (at most n bytes) according to locale loc.
 *
 * On Windows this forwards to _wcstombs_l().  Elsewhere, loc is installed
 * with uselocale() around a plain wcstombs() call and the previously active
 * locale is put back afterwards.
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _wcstombs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nconverted;

	prev_locale = uselocale(loc);
	nconverted = wcstombs(dest, src, n);
	uselocale(prev_locale);

	return nconverted;
#endif
}
#endif
1191 :
1192 : /*
1193 : * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1194 : * Therefore we keep them here rather than with the mbutils code.
1195 : */
1196 :
1197 : /*
1198 : * wchar2char --- convert wide characters to multibyte format
1199 : *
1200 : * This has the same API as the standard wcstombs_l() function; in particular,
1201 : * tolen is the maximum number of bytes to store at *to, and *from must be
1202 : * zero-terminated. The output will be zero-terminated iff there is room.
1203 : */
1204 : size_t
1205 1587346 : wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1206 : {
1207 : size_t result;
1208 :
1209 1587346 : if (tolen == 0)
1210 0 : return 0;
1211 :
1212 : #ifdef WIN32
1213 :
1214 : /*
1215 : * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1216 : * for some reason mbstowcs and wcstombs won't do this for us, so we use
1217 : * MultiByteToWideChar().
1218 : */
1219 : if (GetDatabaseEncoding() == PG_UTF8)
1220 : {
1221 : result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1222 : NULL, NULL);
1223 : /* A zero return is failure */
1224 : if (result <= 0)
1225 : result = -1;
1226 : else
1227 : {
1228 : Assert(result <= tolen);
1229 : /* Microsoft counts the zero terminator in the result */
1230 : result--;
1231 : }
1232 : }
1233 : else
1234 : #endif /* WIN32 */
1235 1587346 : if (loc == (locale_t) 0)
1236 : {
1237 : /* Use wcstombs directly for the default locale */
1238 0 : result = wcstombs(to, from, tolen);
1239 : }
1240 : else
1241 : {
1242 : /* Use wcstombs_l for nondefault locales */
1243 1587346 : result = wcstombs_l(to, from, tolen, loc);
1244 : }
1245 :
1246 1587346 : return result;
1247 : }
1248 :
1249 : /*
1250 : * char2wchar --- convert multibyte characters to wide characters
1251 : *
1252 : * This has almost the API of mbstowcs_l(), except that *from need not be
1253 : * null-terminated; instead, the number of input bytes is specified as
1254 : * fromlen. Also, we ereport() rather than returning -1 for invalid
1255 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1256 : * The output will be zero-terminated iff there is room.
1257 : */
1258 : static size_t
1259 1587346 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1260 : locale_t loc)
1261 : {
1262 : size_t result;
1263 :
1264 1587346 : if (tolen == 0)
1265 0 : return 0;
1266 :
1267 : #ifdef WIN32
1268 : /* See WIN32 "Unicode" comment above */
1269 : if (GetDatabaseEncoding() == PG_UTF8)
1270 : {
1271 : /* Win32 API does not work for zero-length input */
1272 : if (fromlen == 0)
1273 : result = 0;
1274 : else
1275 : {
1276 : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1277 : /* A zero return is failure */
1278 : if (result == 0)
1279 : result = -1;
1280 : }
1281 :
1282 : if (result != -1)
1283 : {
1284 : Assert(result < tolen);
1285 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
1286 : to[result] = 0;
1287 : }
1288 : }
1289 : else
1290 : #endif /* WIN32 */
1291 : {
1292 : /* mbstowcs requires ending '\0' */
1293 1587346 : char *str = pnstrdup(from, fromlen);
1294 :
1295 1587346 : if (loc == (locale_t) 0)
1296 : {
1297 : /* Use mbstowcs directly for the default locale */
1298 0 : result = mbstowcs(to, str, tolen);
1299 : }
1300 : else
1301 : {
1302 : /* Use mbstowcs_l for nondefault locales */
1303 1587346 : result = mbstowcs_l(to, str, tolen, loc);
1304 : }
1305 :
1306 1587346 : pfree(str);
1307 : }
1308 :
1309 1587346 : if (result == -1)
1310 : {
1311 : /*
1312 : * Invalid multibyte character encountered. We try to give a useful
1313 : * error message by letting pg_verifymbstr check the string. But it's
1314 : * possible that the string is OK to us, and not OK to mbstowcs ---
1315 : * this suggests that the LC_CTYPE locale is different from the
1316 : * database encoding. Give a generic error message if pg_verifymbstr
1317 : * can't find anything wrong.
1318 : */
1319 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
1320 : /* but if it does ... */
1321 0 : ereport(ERROR,
1322 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1323 : errmsg("invalid multibyte character for locale"),
1324 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1325 : }
1326 :
1327 1587346 : return result;
1328 : }
|