Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for libc
4 : *
5 : * Portions Copyright (c) 2002-2026, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_libc.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #include <limits.h>
15 : #include <wctype.h>
16 :
17 : #include "access/htup_details.h"
18 : #include "catalog/pg_database.h"
19 : #include "catalog/pg_collation.h"
20 : #include "mb/pg_wchar.h"
21 : #include "miscadmin.h"
22 : #include "utils/builtins.h"
23 : #include "utils/formatting.h"
24 : #include "utils/memutils.h"
25 : #include "utils/pg_locale.h"
26 : #include "utils/syscache.h"
27 :
28 : #ifdef __GLIBC__
29 : #include <gnu/libc-version.h>
30 : #endif
31 :
32 : #ifdef WIN32
33 : #include <shlwapi.h>
34 : #endif
35 :
36 : /*
37 : * For the libc provider, to provide as much functionality as possible on a
38 : * variety of platforms without going so far as to implement everything from
39 : * scratch, we use several implementation strategies depending on the
40 : * situation:
41 : *
42 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 : * collations don't give a fig about multibyte characters.
45 : *
46 : * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 : * This assumes that every platform uses Unicode codepoints directly
48 : * as the wchar_t representation of Unicode. On some platforms
49 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
50 : *
51 : * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
52 : * values up to 255, and punt for values above that. This is 100% correct
53 : * only in single-byte encodings such as LATINn. However, non-Unicode
54 : * multibyte encodings are mostly Far Eastern character sets for which the
55 : * properties being tested here aren't very relevant for higher code values
56 : * anyway. The difficulty with using the <wctype.h> functions with
57 : * non-Unicode multibyte encodings is that we can have no certainty that
58 : * the platform's wchar_t representation matches what we do in pg_wchar
59 : * conversions.
60 : *
61 : * As a special case, in the "default" collation, (2) and (3) force ASCII
62 : * letters to follow ASCII upcase/downcase rules, while in a non-default
63 : * collation we just let the library functions do what they will. The case
64 : * where this matters is treatment of I/i in Turkish, and the behavior is
65 : * meant to match the upper()/lower() SQL functions.
66 : *
67 : * We store the active collation setting in static variables. In principle
68 : * it could be passed down to here via the regex library's "struct vars" data
69 : * structure; but that would require somewhat invasive changes in the regex
70 : * library, and right now there's no real benefit to be gained from that.
71 : *
72 : * NB: the coding here assumes pg_wchar is an unsigned type.
73 : */
74 :
75 : /*
76 : * Size of stack buffer to use for string transformations, used to avoid heap
77 : * allocations in typical cases. This should be large enough that most strings
78 : * will fit, but small enough that we feel comfortable putting it on the
79 : * stack.
80 : */
81 : #define TEXTBUFLEN 1024
82 :
83 : extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
84 :
85 : static int strncoll_libc(const char *arg1, size_t len1,
86 : const char *arg2, size_t len2,
87 : pg_locale_t locale);
88 : static int strcoll_libc(const char *arg1, const char *arg2,
89 : pg_locale_t locale);
90 : static size_t strnxfrm_libc(char *dest, size_t destsize,
91 : const char *src, size_t srclen,
92 : pg_locale_t locale);
93 : static size_t strxfrm_libc(char *dest, size_t destsize,
94 : const char *src, pg_locale_t locale);
95 : extern char *get_collation_actual_version_libc(const char *collcollate);
96 : static locale_t make_libc_collator(const char *collate,
97 : const char *ctype);
98 :
99 : #ifdef WIN32
100 : static int strncoll_libc_win32_utf8(const char *arg1, size_t len1,
101 : const char *arg2, size_t len2,
102 : pg_locale_t locale);
103 : static int strcoll_libc_win32_utf8(const char *arg1, const char *arg2,
104 : pg_locale_t locale);
105 : #endif
106 :
107 : static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
108 : size_t fromlen, locale_t loc);
109 :
110 : static size_t strlower_libc_sb(char *dest, size_t destsize,
111 : const char *src, size_t srclen,
112 : pg_locale_t locale);
113 : static size_t strlower_libc_mb(char *dest, size_t destsize,
114 : const char *src, size_t srclen,
115 : pg_locale_t locale);
116 : static size_t strtitle_libc_sb(char *dest, size_t destsize,
117 : const char *src, size_t srclen,
118 : pg_locale_t locale);
119 : static size_t strtitle_libc_mb(char *dest, size_t destsize,
120 : const char *src, size_t srclen,
121 : pg_locale_t locale);
122 : static size_t strupper_libc_sb(char *dest, size_t destsize,
123 : const char *src, size_t srclen,
124 : pg_locale_t locale);
125 : static size_t strupper_libc_mb(char *dest, size_t destsize,
126 : const char *src, size_t srclen,
127 : pg_locale_t locale);
128 :
129 : static bool
130 0 : wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
131 : {
132 0 : return isdigit_l((unsigned char) wc, locale->lt);
133 : }
134 :
135 : static bool
136 0 : wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
137 : {
138 0 : return isalpha_l((unsigned char) wc, locale->lt);
139 : }
140 :
141 : static bool
142 0 : wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
143 : {
144 0 : return isalnum_l((unsigned char) wc, locale->lt);
145 : }
146 :
147 : static bool
148 0 : wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
149 : {
150 0 : return isupper_l((unsigned char) wc, locale->lt);
151 : }
152 :
153 : static bool
154 0 : wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
155 : {
156 0 : return islower_l((unsigned char) wc, locale->lt);
157 : }
158 :
159 : static bool
160 0 : wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
161 : {
162 0 : return isgraph_l((unsigned char) wc, locale->lt);
163 : }
164 :
165 : static bool
166 0 : wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
167 : {
168 0 : return isprint_l((unsigned char) wc, locale->lt);
169 : }
170 :
171 : static bool
172 0 : wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
173 : {
174 0 : return ispunct_l((unsigned char) wc, locale->lt);
175 : }
176 :
177 : static bool
178 0 : wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
179 : {
180 0 : return isspace_l((unsigned char) wc, locale->lt);
181 : }
182 :
183 : static bool
184 0 : wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
185 : {
186 : #ifndef WIN32
187 0 : return isxdigit_l((unsigned char) wc, locale->lt);
188 : #else
189 : return _isxdigit_l((unsigned char) wc, locale->lt);
190 : #endif
191 : }
192 :
193 : static bool
194 0 : wc_iscased_libc_sb(pg_wchar wc, pg_locale_t locale)
195 : {
196 0 : return isupper_l((unsigned char) wc, locale->lt) ||
197 0 : islower_l((unsigned char) wc, locale->lt);
198 : }
199 :
200 : static bool
201 100063 : wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
202 : {
203 100063 : return iswdigit_l((wint_t) wc, locale->lt);
204 : }
205 :
206 : static bool
207 73895 : wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
208 : {
209 73895 : return iswalpha_l((wint_t) wc, locale->lt);
210 : }
211 :
212 : static bool
213 1597379 : wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
214 : {
215 1597379 : return iswalnum_l((wint_t) wc, locale->lt);
216 : }
217 :
218 : static bool
219 2056 : wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
220 : {
221 2056 : return iswupper_l((wint_t) wc, locale->lt);
222 : }
223 :
224 : static bool
225 2051 : wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
226 : {
227 2051 : return iswlower_l((wint_t) wc, locale->lt);
228 : }
229 :
230 : static bool
231 2051 : wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
232 : {
233 2051 : return iswgraph_l((wint_t) wc, locale->lt);
234 : }
235 :
236 : static bool
237 2051 : wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
238 : {
239 2051 : return iswprint_l((wint_t) wc, locale->lt);
240 : }
241 :
242 : static bool
243 2051 : wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
244 : {
245 2051 : return iswpunct_l((wint_t) wc, locale->lt);
246 : }
247 :
248 : static bool
249 34486 : wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
250 : {
251 34486 : return iswspace_l((wint_t) wc, locale->lt);
252 : }
253 :
254 : static bool
255 9 : wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
256 : {
257 : #ifndef WIN32
258 9 : return iswxdigit_l((wint_t) wc, locale->lt);
259 : #else
260 : return _iswxdigit_l((wint_t) wc, locale->lt);
261 : #endif
262 : }
263 :
264 : static bool
265 0 : wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale)
266 : {
267 0 : return iswupper_l((wint_t) wc, locale->lt) ||
268 0 : iswlower_l((wint_t) wc, locale->lt);
269 : }
270 :
271 : static pg_wchar
272 0 : toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
273 : {
274 : Assert(GetDatabaseEncoding() != PG_UTF8);
275 :
276 : /* force C behavior for ASCII characters, per comments above */
277 0 : if (locale->is_default && wc <= (pg_wchar) 127)
278 0 : return pg_ascii_toupper((unsigned char) wc);
279 0 : if (wc <= (pg_wchar) UCHAR_MAX)
280 0 : return toupper_l((unsigned char) wc, locale->lt);
281 : else
282 0 : return wc;
283 : }
284 :
285 : static pg_wchar
286 4679 : toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
287 : {
288 : Assert(GetDatabaseEncoding() == PG_UTF8);
289 :
290 : /* force C behavior for ASCII characters, per comments above */
291 4679 : if (locale->is_default && wc <= (pg_wchar) 127)
292 581 : return pg_ascii_toupper((unsigned char) wc);
293 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
294 4098 : return towupper_l((wint_t) wc, locale->lt);
295 : else
296 : return wc;
297 : }
298 :
299 : static pg_wchar
300 0 : tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
301 : {
302 : Assert(GetDatabaseEncoding() != PG_UTF8);
303 :
304 : /* force C behavior for ASCII characters, per comments above */
305 0 : if (locale->is_default && wc <= (pg_wchar) 127)
306 0 : return pg_ascii_tolower((unsigned char) wc);
307 0 : if (wc <= (pg_wchar) UCHAR_MAX)
308 0 : return tolower_l((unsigned char) wc, locale->lt);
309 : else
310 0 : return wc;
311 : }
312 :
313 : static pg_wchar
314 4681 : tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
315 : {
316 : Assert(GetDatabaseEncoding() == PG_UTF8);
317 :
318 : /* force C behavior for ASCII characters, per comments above */
319 4681 : if (locale->is_default && wc <= (pg_wchar) 127)
320 583 : return pg_ascii_tolower((unsigned char) wc);
321 : if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
322 4098 : return towlower_l((wint_t) wc, locale->lt);
323 : else
324 : return wc;
325 : }
326 :
327 : /*
328 : * Characters A..Z always downcase to a..z, even in the Turkish
329 : * locale. Characters beyond 127 use tolower().
330 : */
331 : static size_t
332 13247 : downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src,
333 : size_t srclen, pg_locale_t locale)
334 : {
335 13247 : locale_t loc = locale->lt;
336 : int i;
337 :
338 128853 : for (i = 0; i < srclen && i < dstsize; i++)
339 : {
340 115606 : unsigned char ch = (unsigned char) src[i];
341 :
342 115606 : if (ch >= 'A' && ch <= 'Z')
343 6728 : ch = pg_ascii_tolower(ch);
344 108878 : else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc))
345 0 : ch = tolower_l(ch, loc);
346 115606 : dst[i] = (char) ch;
347 : }
348 :
349 13247 : if (i < dstsize)
350 13247 : dst[i] = '\0';
351 :
352 13247 : return srclen;
353 : }
354 :
355 : static const struct ctype_methods ctype_methods_libc_sb = {
356 : .strlower = strlower_libc_sb,
357 : .strtitle = strtitle_libc_sb,
358 : .strupper = strupper_libc_sb,
359 : /* in libc, casefolding is the same as lowercasing */
360 : .strfold = strlower_libc_sb,
361 : .downcase_ident = downcase_ident_libc_sb,
362 : .wc_isdigit = wc_isdigit_libc_sb,
363 : .wc_isalpha = wc_isalpha_libc_sb,
364 : .wc_isalnum = wc_isalnum_libc_sb,
365 : .wc_isupper = wc_isupper_libc_sb,
366 : .wc_islower = wc_islower_libc_sb,
367 : .wc_isgraph = wc_isgraph_libc_sb,
368 : .wc_isprint = wc_isprint_libc_sb,
369 : .wc_ispunct = wc_ispunct_libc_sb,
370 : .wc_isspace = wc_isspace_libc_sb,
371 : .wc_isxdigit = wc_isxdigit_libc_sb,
372 : .wc_iscased = wc_iscased_libc_sb,
373 : .wc_toupper = toupper_libc_sb,
374 : .wc_tolower = tolower_libc_sb,
375 : };
376 :
377 : /*
378 : * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
379 : * single-byte semantics for pattern matching.
380 : */
381 : static const struct ctype_methods ctype_methods_libc_other_mb = {
382 : .strlower = strlower_libc_mb,
383 : .strtitle = strtitle_libc_mb,
384 : .strupper = strupper_libc_mb,
385 : /* in libc, casefolding is the same as lowercasing */
386 : .strfold = strlower_libc_mb,
387 : /* uses plain ASCII semantics for historical reasons */
388 : .downcase_ident = NULL,
389 : .wc_isdigit = wc_isdigit_libc_sb,
390 : .wc_isalpha = wc_isalpha_libc_sb,
391 : .wc_isalnum = wc_isalnum_libc_sb,
392 : .wc_isupper = wc_isupper_libc_sb,
393 : .wc_islower = wc_islower_libc_sb,
394 : .wc_isgraph = wc_isgraph_libc_sb,
395 : .wc_isprint = wc_isprint_libc_sb,
396 : .wc_ispunct = wc_ispunct_libc_sb,
397 : .wc_isspace = wc_isspace_libc_sb,
398 : .wc_isxdigit = wc_isxdigit_libc_sb,
399 : .wc_iscased = wc_iscased_libc_sb,
400 : .wc_toupper = toupper_libc_sb,
401 : .wc_tolower = tolower_libc_sb,
402 : };
403 :
404 : static const struct ctype_methods ctype_methods_libc_utf8 = {
405 : .strlower = strlower_libc_mb,
406 : .strtitle = strtitle_libc_mb,
407 : .strupper = strupper_libc_mb,
408 : /* in libc, casefolding is the same as lowercasing */
409 : .strfold = strlower_libc_mb,
410 : /* uses plain ASCII semantics for historical reasons */
411 : .downcase_ident = NULL,
412 : .wc_isdigit = wc_isdigit_libc_mb,
413 : .wc_isalpha = wc_isalpha_libc_mb,
414 : .wc_isalnum = wc_isalnum_libc_mb,
415 : .wc_isupper = wc_isupper_libc_mb,
416 : .wc_islower = wc_islower_libc_mb,
417 : .wc_isgraph = wc_isgraph_libc_mb,
418 : .wc_isprint = wc_isprint_libc_mb,
419 : .wc_ispunct = wc_ispunct_libc_mb,
420 : .wc_isspace = wc_isspace_libc_mb,
421 : .wc_isxdigit = wc_isxdigit_libc_mb,
422 : .wc_iscased = wc_iscased_libc_mb,
423 : .wc_toupper = toupper_libc_mb,
424 : .wc_tolower = tolower_libc_mb,
425 : };
426 :
427 : static const struct collate_methods collate_methods_libc = {
428 : .strncoll = strncoll_libc,
429 : .strcoll = strcoll_libc,
430 : .strnxfrm = strnxfrm_libc,
431 : .strxfrm = strxfrm_libc,
432 : .strnxfrm_prefix = NULL,
433 : .strxfrm_prefix = NULL,
434 :
435 : /*
436 : * Unfortunately, it seems that strxfrm() for non-C collations is broken
437 : * on many common platforms; testing of multiple versions of glibc reveals
438 : * that, for many locales, strcoll() and strxfrm() do not return
439 : * consistent results. While no other libc other than Cygwin has so far
440 : * been shown to have a problem, we take the conservative course of action
441 : * for right now and disable this categorically. (Users who are certain
442 : * this isn't a problem on their system can define TRUST_STRXFRM.)
443 : */
444 : #ifdef TRUST_STRXFRM
445 : .strxfrm_is_safe = true,
446 : #else
447 : .strxfrm_is_safe = false,
448 : #endif
449 : };
450 :
#ifdef WIN32
/*
 * Collation callbacks for WIN32 with a UTF8 database encoding: comparison
 * goes through the *_win32_utf8 routines, while transformation reuses the
 * generic libc routines.  No prefix transformation routines are provided;
 * both prefix members are spelled out as NULL to mirror
 * collate_methods_libc.
 */
static const struct collate_methods collate_methods_libc_win32_utf8 = {
	.strcoll = strcoll_libc_win32_utf8,
	.strncoll = strncoll_libc_win32_utf8,
	.strxfrm = strxfrm_libc,
	.strnxfrm = strnxfrm_libc,
	.strxfrm_prefix = NULL,
	.strnxfrm_prefix = NULL,
#if defined(TRUST_STRXFRM)
	.strxfrm_is_safe = true,
#else
	.strxfrm_is_safe = false,
#endif
};
#endif
465 :
466 : static size_t
467 0 : strlower_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen,
468 : pg_locale_t locale)
469 : {
470 0 : if (srclen + 1 <= destsize)
471 : {
472 0 : locale_t loc = locale->lt;
473 : char *p;
474 :
475 0 : memcpy(dest, src, srclen);
476 0 : dest[srclen] = '\0';
477 :
478 : /*
479 : * Note: we assume that tolower_l() will not be so broken as to need
480 : * an isupper_l() guard test. When using the default collation, we
481 : * apply the traditional Postgres behavior that forces ASCII-style
482 : * treatment of I/i, but in non-default collations you get exactly
483 : * what the collation says.
484 : */
485 0 : for (p = dest; *p; p++)
486 : {
487 0 : if (locale->is_default)
488 : {
489 0 : if (*p >= 'A' && *p <= 'Z')
490 0 : *p += 'a' - 'A';
491 0 : else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
492 0 : *p = tolower_l((unsigned char) *p, loc);
493 : }
494 : else
495 0 : *p = tolower_l((unsigned char) *p, loc);
496 : }
497 : }
498 :
499 0 : return srclen;
500 : }
501 :
502 : static size_t
503 521267 : strlower_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen,
504 : pg_locale_t locale)
505 : {
506 521267 : locale_t loc = locale->lt;
507 : size_t result_size;
508 : wchar_t *workspace;
509 : char *result;
510 : size_t curr_char;
511 : size_t max_size;
512 :
513 : /* Overflow paranoia */
514 521267 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
515 0 : ereport(ERROR,
516 : (errcode(ERRCODE_OUT_OF_MEMORY),
517 : errmsg("out of memory")));
518 :
519 : /* Output workspace cannot have more codes than input bytes */
520 521267 : workspace = palloc_array(wchar_t, srclen + 1);
521 :
522 521267 : char2wchar(workspace, srclen + 1, src, srclen, loc);
523 :
524 2497120 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
525 1975853 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
526 :
527 : /*
528 : * Make result large enough; case change might change number of bytes
529 : */
530 521267 : max_size = curr_char * pg_database_encoding_max_length();
531 521267 : result = palloc(max_size + 1);
532 :
533 521267 : result_size = wchar2char(result, workspace, max_size + 1, loc);
534 :
535 521267 : if (destsize >= result_size + 1)
536 : {
537 521267 : memcpy(dest, result, result_size);
538 521267 : dest[result_size] = '\0';
539 : }
540 :
541 521267 : pfree(workspace);
542 521267 : pfree(result);
543 :
544 521267 : return result_size;
545 : }
546 :
547 : static size_t
548 0 : strtitle_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen,
549 : pg_locale_t locale)
550 : {
551 0 : if (srclen + 1 <= destsize)
552 : {
553 0 : locale_t loc = locale->lt;
554 0 : int wasalnum = false;
555 : char *p;
556 :
557 0 : memcpy(dest, src, srclen);
558 0 : dest[srclen] = '\0';
559 :
560 : /*
561 : * Note: we assume that toupper_l()/tolower_l() will not be so broken
562 : * as to need guard tests. When using the default collation, we apply
563 : * the traditional Postgres behavior that forces ASCII-style treatment
564 : * of I/i, but in non-default collations you get exactly what the
565 : * collation says.
566 : */
567 0 : for (p = dest; *p; p++)
568 : {
569 0 : if (locale->is_default)
570 : {
571 0 : if (wasalnum)
572 : {
573 0 : if (*p >= 'A' && *p <= 'Z')
574 0 : *p += 'a' - 'A';
575 0 : else if (IS_HIGHBIT_SET(*p) && isupper_l(*p, loc))
576 0 : *p = tolower_l((unsigned char) *p, loc);
577 : }
578 : else
579 : {
580 0 : if (*p >= 'a' && *p <= 'z')
581 0 : *p -= 'a' - 'A';
582 0 : else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
583 0 : *p = toupper_l((unsigned char) *p, loc);
584 : }
585 : }
586 : else
587 : {
588 0 : if (wasalnum)
589 0 : *p = tolower_l((unsigned char) *p, loc);
590 : else
591 0 : *p = toupper_l((unsigned char) *p, loc);
592 : }
593 0 : wasalnum = isalnum_l((unsigned char) *p, loc);
594 : }
595 : }
596 :
597 0 : return srclen;
598 : }
599 :
600 : static size_t
601 18 : strtitle_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen,
602 : pg_locale_t locale)
603 : {
604 18 : locale_t loc = locale->lt;
605 18 : int wasalnum = false;
606 : size_t result_size;
607 : wchar_t *workspace;
608 : char *result;
609 : size_t curr_char;
610 : size_t max_size;
611 :
612 : /* Overflow paranoia */
613 18 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
614 0 : ereport(ERROR,
615 : (errcode(ERRCODE_OUT_OF_MEMORY),
616 : errmsg("out of memory")));
617 :
618 : /* Output workspace cannot have more codes than input bytes */
619 18 : workspace = palloc_array(wchar_t, srclen + 1);
620 :
621 18 : char2wchar(workspace, srclen + 1, src, srclen, loc);
622 :
623 165 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
624 : {
625 147 : if (wasalnum)
626 111 : workspace[curr_char] = towlower_l(workspace[curr_char], loc);
627 : else
628 36 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
629 147 : wasalnum = iswalnum_l(workspace[curr_char], loc);
630 : }
631 :
632 : /*
633 : * Make result large enough; case change might change number of bytes
634 : */
635 18 : max_size = curr_char * pg_database_encoding_max_length();
636 18 : result = palloc(max_size + 1);
637 :
638 18 : result_size = wchar2char(result, workspace, max_size + 1, loc);
639 :
640 18 : if (destsize >= result_size + 1)
641 : {
642 18 : memcpy(dest, result, result_size);
643 18 : dest[result_size] = '\0';
644 : }
645 :
646 18 : pfree(workspace);
647 18 : pfree(result);
648 :
649 18 : return result_size;
650 : }
651 :
652 : static size_t
653 0 : strupper_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen,
654 : pg_locale_t locale)
655 : {
656 0 : if (srclen + 1 <= destsize)
657 : {
658 0 : locale_t loc = locale->lt;
659 : char *p;
660 :
661 0 : memcpy(dest, src, srclen);
662 0 : dest[srclen] = '\0';
663 :
664 : /*
665 : * Note: we assume that toupper_l() will not be so broken as to need
666 : * an islower_l() guard test. When using the default collation, we
667 : * apply the traditional Postgres behavior that forces ASCII-style
668 : * treatment of I/i, but in non-default collations you get exactly
669 : * what the collation says.
670 : */
671 0 : for (p = dest; *p; p++)
672 : {
673 0 : if (locale->is_default)
674 : {
675 0 : if (*p >= 'a' && *p <= 'z')
676 0 : *p -= 'a' - 'A';
677 0 : else if (IS_HIGHBIT_SET(*p) && islower_l(*p, loc))
678 0 : *p = toupper_l((unsigned char) *p, loc);
679 : }
680 : else
681 0 : *p = toupper_l((unsigned char) *p, loc);
682 : }
683 : }
684 :
685 0 : return srclen;
686 : }
687 :
688 : static size_t
689 521850 : strupper_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen,
690 : pg_locale_t locale)
691 : {
692 521850 : locale_t loc = locale->lt;
693 : size_t result_size;
694 : wchar_t *workspace;
695 : char *result;
696 : size_t curr_char;
697 : size_t max_size;
698 :
699 : /* Overflow paranoia */
700 521850 : if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
701 0 : ereport(ERROR,
702 : (errcode(ERRCODE_OUT_OF_MEMORY),
703 : errmsg("out of memory")));
704 :
705 : /* Output workspace cannot have more codes than input bytes */
706 521850 : workspace = palloc_array(wchar_t, srclen + 1);
707 :
708 521850 : char2wchar(workspace, srclen + 1, src, srclen, loc);
709 :
710 1652339 : for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
711 1130489 : workspace[curr_char] = towupper_l(workspace[curr_char], loc);
712 :
713 : /*
714 : * Make result large enough; case change might change number of bytes
715 : */
716 521850 : max_size = curr_char * pg_database_encoding_max_length();
717 521850 : result = palloc(max_size + 1);
718 :
719 521850 : result_size = wchar2char(result, workspace, max_size + 1, loc);
720 :
721 521850 : if (destsize >= result_size + 1)
722 : {
723 521850 : memcpy(dest, result, result_size);
724 521850 : dest[result_size] = '\0';
725 : }
726 :
727 521850 : pfree(workspace);
728 521850 : pfree(result);
729 :
730 521850 : return result_size;
731 : }
732 :
733 : pg_locale_t
734 17640 : create_pg_locale_libc(Oid collid, MemoryContext context)
735 : {
736 : const char *collate;
737 : const char *ctype;
738 : locale_t loc;
739 : pg_locale_t result;
740 :
741 17640 : if (collid == DEFAULT_COLLATION_OID)
742 : {
743 : HeapTuple tp;
744 : Datum datum;
745 :
746 17583 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
747 17583 : if (!HeapTupleIsValid(tp))
748 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
749 17583 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
750 : Anum_pg_database_datcollate);
751 17583 : collate = TextDatumGetCString(datum);
752 17583 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
753 : Anum_pg_database_datctype);
754 17583 : ctype = TextDatumGetCString(datum);
755 :
756 17583 : ReleaseSysCache(tp);
757 : }
758 : else
759 : {
760 : HeapTuple tp;
761 : Datum datum;
762 :
763 57 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
764 57 : if (!HeapTupleIsValid(tp))
765 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
766 :
767 57 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
768 : Anum_pg_collation_collcollate);
769 57 : collate = TextDatumGetCString(datum);
770 57 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
771 : Anum_pg_collation_collctype);
772 57 : ctype = TextDatumGetCString(datum);
773 :
774 57 : ReleaseSysCache(tp);
775 : }
776 :
777 :
778 17640 : loc = make_libc_collator(collate, ctype);
779 :
780 17640 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
781 17640 : result->deterministic = true;
782 34714 : result->collate_is_c = (strcmp(collate, "C") == 0) ||
783 17074 : (strcmp(collate, "POSIX") == 0);
784 34714 : result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
785 17074 : (strcmp(ctype, "POSIX") == 0);
786 17640 : result->lt = loc;
787 17640 : if (!result->collate_is_c)
788 : {
789 : #ifdef WIN32
790 : if (GetDatabaseEncoding() == PG_UTF8)
791 : result->collate = &collate_methods_libc_win32_utf8;
792 : else
793 : #endif
794 17034 : result->collate = &collate_methods_libc;
795 : }
796 17640 : if (!result->ctype_is_c)
797 : {
798 17034 : if (GetDatabaseEncoding() == PG_UTF8)
799 17002 : result->ctype = &ctype_methods_libc_utf8;
800 32 : else if (pg_database_encoding_max_length() > 1)
801 0 : result->ctype = &ctype_methods_libc_other_mb;
802 : else
803 32 : result->ctype = &ctype_methods_libc_sb;
804 : }
805 :
806 17640 : return result;
807 : }
808 :
809 : /*
810 : * Create a locale_t with the given collation and ctype.
811 : *
812 : * The "C" and "POSIX" locales are not actually handled by libc, so return
813 : * NULL.
814 : *
815 : * Ensure that no path leaks a locale_t.
816 : */
817 : static locale_t
818 17640 : make_libc_collator(const char *collate, const char *ctype)
819 : {
820 17640 : locale_t loc = 0;
821 :
822 17640 : if (strcmp(collate, ctype) == 0)
823 : {
824 17640 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
825 : {
826 : /* Normal case where they're the same */
827 17034 : errno = 0;
828 : #ifndef WIN32
829 17034 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
830 : NULL);
831 : #else
832 : loc = _create_locale(LC_ALL, collate);
833 : #endif
834 17034 : if (!loc)
835 0 : report_newlocale_failure(collate);
836 : }
837 : }
838 : else
839 : {
840 : #ifndef WIN32
841 : /* We need two newlocale() steps */
842 0 : locale_t loc1 = 0;
843 :
844 0 : if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
845 : {
846 0 : errno = 0;
847 0 : loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
848 0 : if (!loc1)
849 0 : report_newlocale_failure(collate);
850 : }
851 :
852 0 : if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
853 : {
854 0 : errno = 0;
855 0 : loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
856 0 : if (!loc)
857 : {
858 0 : if (loc1)
859 0 : freelocale(loc1);
860 0 : report_newlocale_failure(ctype);
861 : }
862 : }
863 : else
864 0 : loc = loc1;
865 : #else
866 :
867 : /*
868 : * XXX The _create_locale() API doesn't appear to support this. Could
869 : * perhaps be worked around by changing pg_locale_t to contain two
870 : * separate fields.
871 : */
872 : ereport(ERROR,
873 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
874 : errmsg("collations with different collate and ctype values are not supported on this platform")));
875 : #endif
876 : }
877 :
878 17640 : return loc;
879 : }
880 :
881 : /*
882 : * strncoll_libc
883 : *
884 : * NUL-terminate arguments and pass to strcoll_l().
885 : */
886 : static int
887 2773313 : strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
888 : pg_locale_t locale)
889 : {
890 : char sbuf[TEXTBUFLEN];
891 2773313 : char *buf = sbuf;
892 2773313 : size_t bufsize1 = len1 + 1;
893 2773313 : size_t bufsize2 = len2 + 1;
894 : char *buf1;
895 : char *buf2;
896 : const char *arg1n;
897 : const char *arg2n;
898 : int result;
899 :
900 2773313 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
901 318 : buf = palloc(bufsize1 + bufsize2);
902 :
903 2773313 : buf1 = buf;
904 2773313 : buf2 = buf + bufsize1;
905 :
906 2773313 : memcpy(buf1, arg1, len1);
907 2773313 : buf1[len1] = '\0';
908 2773313 : arg1n = buf1;
909 :
910 2773313 : memcpy(buf2, arg2, len2);
911 2773313 : buf2[len2] = '\0';
912 2773313 : arg2n = buf2;
913 :
914 2773313 : result = strcoll_l(arg1n, arg2n, locale->lt);
915 :
916 2773313 : if (buf != sbuf)
917 318 : pfree(buf);
918 :
919 2773313 : return result;
920 : }
921 :
922 : /*
923 : * strcoll_libc
924 : */
925 : static int
926 14471845 : strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
927 : {
928 14471845 : return strcoll_l(arg1, arg2, locale->lt);
929 : }
930 :
931 : /*
932 : * strnxfrm_libc
933 : *
934 : * NUL-terminate src and pass to strxfrm_l().
935 : */
936 : static size_t
937 0 : strnxfrm_libc(char *dest, size_t destsize, const char *src, size_t srclen,
938 : pg_locale_t locale)
939 : {
940 : char sbuf[TEXTBUFLEN];
941 0 : char *buf = sbuf;
942 0 : size_t bufsize = srclen + 1;
943 : size_t result;
944 :
945 0 : if (bufsize > TEXTBUFLEN)
946 0 : buf = palloc(bufsize);
947 :
948 : /* nul-terminate argument */
949 0 : memcpy(buf, src, srclen);
950 0 : buf[srclen] = '\0';
951 :
952 0 : result = strxfrm_l(dest, buf, destsize, locale->lt);
953 :
954 0 : if (buf != sbuf)
955 0 : pfree(buf);
956 :
957 : /* if dest is defined, it should be nul-terminated */
958 : Assert(result >= destsize || dest[result] == '\0');
959 :
960 0 : return result;
961 : }
962 :
963 : /*
964 : * strxfrm_libc
965 : */
966 : static size_t
967 132 : strxfrm_libc(char *dest, size_t destsize, const char *src, pg_locale_t locale)
968 : {
969 132 : return strxfrm_l(dest, src, destsize, locale->lt);
970 : }
971 :
/*
 * get_collation_actual_version_libc
 *
 * Return a version string for the libc collation named collcollate, or
 * NULL when none is available.
 *
 * No version is reported for "C", "POSIX", or any name starting with "C."
 * (all compared case-insensitively).  Otherwise the source depends on the
 * platform: the glibc version string, the FreeBSD locale version obtained
 * via querylocale(), or the Windows NLS version numbers.  On any other
 * platform the result is NULL.
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *collversion = NULL;
	bool		unversioned;

	/* these locale names never carry a version */
	unversioned = pg_strcasecmp("C", collcollate) == 0 ||
		pg_strncasecmp("C.", collcollate, 2) == 0 ||
		pg_strcasecmp("POSIX", collcollate) == 0;

	if (!unversioned)
	{
#if defined(__GLIBC__)
		/* Use the glibc version because we don't have anything better. */
		collversion = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
		locale_t	loc;

		/* Look up FreeBSD collation version. */
		loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
		if (!loc)
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));
		else
		{
			collversion =
				pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
			freelocale(loc);
		}
#elif defined(WIN32)
		/*
		 * If we are targeting Windows Vista and above, we can ask for a name
		 * given a collation name (earlier versions required a location code
		 * that we don't have).
		 */
		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];

		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
							LOCALE_NAME_MAX_LENGTH);
		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
		{
			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until those
			 * values can be prevented from entering the system, or 100%
			 * reliably converted to the more useful tag format, tolerate the
			 * resulting error and report that we have no version data.
			 */
			if (GetLastError() == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							GetLastError())));
		}
		collversion = psprintf("%lu.%lu,%lu.%lu",
							   (version.dwNLSVersion >> 8) & 0xFFFF,
							   version.dwNLSVersion & 0xFF,
							   (version.dwDefinedVersion >> 8) & 0xFFFF,
							   version.dwDefinedVersion & 0xFF);
#endif
	}

	return collversion;
}
1036 :
1037 : /*
1038 : * strncoll_libc_win32_utf8
1039 : *
1040 : * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
1041 : * invoke wcscoll_l().
1042 : */
1043 : #ifdef WIN32
1044 : static int
1045 : strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
1046 : size_t len2, pg_locale_t locale)
1047 : {
1048 : char sbuf[TEXTBUFLEN];
1049 : char *buf = sbuf;
1050 : char *a1p,
1051 : *a2p;
1052 : size_t a1len,
1053 : a2len,
1054 : buflen;
1055 : int r;
1056 : int result;
1057 :
1058 : Assert(GetDatabaseEncoding() == PG_UTF8);
1059 :
1060 : /*
1061 : * In a 32-bit build, twice the input length can overflow size_t, so we
1062 : * must be careful.
1063 : */
1064 : a1len = add_size(add_size(len1, len1), 2);
1065 : a2len = add_size(add_size(len2, len2), 2);
1066 : buflen = add_size(a1len, a2len);
1067 :
1068 : if (buflen > TEXTBUFLEN)
1069 : buf = palloc(buflen);
1070 :
1071 : a1p = buf;
1072 : a2p = buf + a1len;
1073 :
1074 : /* API does not work for zero-length input */
1075 : if (len1 == 0)
1076 : r = 0;
1077 : else
1078 : {
1079 : r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1080 : (LPWSTR) a1p, a1len / 2);
1081 : if (!r)
1082 : ereport(ERROR,
1083 : (errmsg("could not convert string to UTF-16: error code %lu",
1084 : GetLastError())));
1085 : }
1086 : ((LPWSTR) a1p)[r] = 0;
1087 :
1088 : if (len2 == 0)
1089 : r = 0;
1090 : else
1091 : {
1092 : r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1093 : (LPWSTR) a2p, a2len / 2);
1094 : if (!r)
1095 : ereport(ERROR,
1096 : (errmsg("could not convert string to UTF-16: error code %lu",
1097 : GetLastError())));
1098 : }
1099 : ((LPWSTR) a2p)[r] = 0;
1100 :
1101 : errno = 0;
1102 : result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->lt);
1103 : if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1104 : ereport(ERROR,
1105 : (errmsg("could not compare Unicode strings: %m")));
1106 :
1107 : if (buf != sbuf)
1108 : pfree(buf);
1109 :
1110 : return result;
1111 : }
1112 :
1113 : static int
1114 : strcoll_libc_win32_utf8(const char *arg1, const char *arg2,
1115 : pg_locale_t locale)
1116 : {
1117 : size_t len1 = strlen(arg1);
1118 : size_t len2 = strlen(arg2);
1119 :
1120 : return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1121 : }
1122 : #endif /* WIN32 */
1123 :
1124 : /* simple subroutine for reporting errors from newlocale() */
1125 : void
1126 0 : report_newlocale_failure(const char *localename)
1127 : {
1128 : int save_errno;
1129 :
1130 : /*
1131 : * Windows doesn't provide any useful error indication from
1132 : * _create_locale(), and BSD-derived platforms don't seem to feel they
1133 : * need to set errno either (even though POSIX is pretty clear that
1134 : * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1135 : * is what to report.
1136 : */
1137 0 : if (errno == 0)
1138 0 : errno = ENOENT;
1139 :
1140 : /*
1141 : * ENOENT means "no such locale", not "no such file", so clarify that
1142 : * errno with an errdetail message.
1143 : */
1144 0 : save_errno = errno; /* auxiliary funcs might change errno */
1145 0 : ereport(ERROR,
1146 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1147 : errmsg("could not create locale \"%s\": %m",
1148 : localename),
1149 : (save_errno == ENOENT ?
1150 : errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1151 : localename) : 0)));
1152 : }
1153 :
1154 : /*
1155 : * POSIX doesn't define _l-variants of these functions, but several systems
1156 : * have them. We provide our own replacements here.
1157 : */
1158 : #ifndef HAVE_MBSTOWCS_L
1159 : static size_t
1160 1043135 : mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
1161 : {
1162 : #ifdef WIN32
1163 : return _mbstowcs_l(dest, src, n, loc);
1164 : #else
1165 : size_t result;
1166 1043135 : locale_t save_locale = uselocale(loc);
1167 :
1168 1043135 : result = mbstowcs(dest, src, n);
1169 1043135 : uselocale(save_locale);
1170 1043135 : return result;
1171 : #endif
1172 : }
1173 : #endif
1174 : #ifndef HAVE_WCSTOMBS_L
1175 : static size_t
1176 1043135 : wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
1177 : {
1178 : #ifdef WIN32
1179 : return _wcstombs_l(dest, src, n, loc);
1180 : #else
1181 : size_t result;
1182 1043135 : locale_t save_locale = uselocale(loc);
1183 :
1184 1043135 : result = wcstombs(dest, src, n);
1185 1043135 : uselocale(save_locale);
1186 1043135 : return result;
1187 : #endif
1188 : }
1189 : #endif
1190 :
1191 : /*
1192 : * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1193 : * Therefore we keep them here rather than with the mbutils code.
1194 : */
1195 :
1196 : /*
1197 : * wchar2char --- convert wide characters to multibyte format
1198 : *
1199 : * This has the same API as the standard wcstombs_l() function; in particular,
1200 : * tolen is the maximum number of bytes to store at *to, and *from must be
1201 : * zero-terminated. The output will be zero-terminated iff there is room.
1202 : */
1203 : size_t
1204 1043135 : wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1205 : {
1206 : size_t result;
1207 :
1208 1043135 : if (tolen == 0)
1209 0 : return 0;
1210 :
1211 : #ifdef WIN32
1212 :
1213 : /*
1214 : * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1215 : * for some reason mbstowcs and wcstombs won't do this for us, so we use
1216 : * MultiByteToWideChar().
1217 : */
1218 : if (GetDatabaseEncoding() == PG_UTF8)
1219 : {
1220 : result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1221 : NULL, NULL);
1222 : /* A zero return is failure */
1223 : if (result <= 0)
1224 : result = -1;
1225 : else
1226 : {
1227 : Assert(result <= tolen);
1228 : /* Microsoft counts the zero terminator in the result */
1229 : result--;
1230 : }
1231 : }
1232 : else
1233 : #endif /* WIN32 */
1234 1043135 : if (loc == (locale_t) 0)
1235 : {
1236 : /* Use wcstombs directly for the default locale */
1237 0 : result = wcstombs(to, from, tolen);
1238 : }
1239 : else
1240 : {
1241 : /* Use wcstombs_l for nondefault locales */
1242 1043135 : result = wcstombs_l(to, from, tolen, loc);
1243 : }
1244 :
1245 1043135 : return result;
1246 : }
1247 :
1248 : /*
1249 : * char2wchar --- convert multibyte characters to wide characters
1250 : *
1251 : * This has almost the API of mbstowcs_l(), except that *from need not be
1252 : * null-terminated; instead, the number of input bytes is specified as
1253 : * fromlen. Also, we ereport() rather than returning -1 for invalid
1254 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1255 : * The output will be zero-terminated iff there is room.
1256 : */
1257 : static size_t
1258 1043135 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1259 : locale_t loc)
1260 : {
1261 : size_t result;
1262 :
1263 1043135 : if (tolen == 0)
1264 0 : return 0;
1265 :
1266 : #ifdef WIN32
1267 : /* See WIN32 "Unicode" comment above */
1268 : if (GetDatabaseEncoding() == PG_UTF8)
1269 : {
1270 : /* Win32 API does not work for zero-length input */
1271 : if (fromlen == 0)
1272 : result = 0;
1273 : else
1274 : {
1275 : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1276 : /* A zero return is failure */
1277 : if (result == 0)
1278 : result = -1;
1279 : }
1280 :
1281 : if (result != -1)
1282 : {
1283 : Assert(result < tolen);
1284 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
1285 : to[result] = 0;
1286 : }
1287 : }
1288 : else
1289 : #endif /* WIN32 */
1290 : {
1291 : /* mbstowcs requires ending '\0' */
1292 1043135 : char *str = pnstrdup(from, fromlen);
1293 :
1294 1043135 : if (loc == (locale_t) 0)
1295 : {
1296 : /* Use mbstowcs directly for the default locale */
1297 0 : result = mbstowcs(to, str, tolen);
1298 : }
1299 : else
1300 : {
1301 : /* Use mbstowcs_l for nondefault locales */
1302 1043135 : result = mbstowcs_l(to, str, tolen, loc);
1303 : }
1304 :
1305 1043135 : pfree(str);
1306 : }
1307 :
1308 1043135 : if (result == -1)
1309 : {
1310 : /*
1311 : * Invalid multibyte character encountered. We try to give a useful
1312 : * error message by letting pg_verifymbstr check the string. But it's
1313 : * possible that the string is OK to us, and not OK to mbstowcs ---
1314 : * this suggests that the LC_CTYPE locale is different from the
1315 : * database encoding. Give a generic error message if pg_verifymbstr
1316 : * can't find anything wrong.
1317 : */
1318 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
1319 : /* but if it does ... */
1320 0 : ereport(ERROR,
1321 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1322 : errmsg("invalid multibyte character for locale"),
1323 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1324 : }
1325 :
1326 1043135 : return result;
1327 : }
|