Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities
4 : *
5 : * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : /*----------
13 : * Here is how the locale stuff is handled: LC_COLLATE and LC_CTYPE
14 : * are fixed at CREATE DATABASE time, stored in pg_database, and cannot
15 : * be changed. Thus, the effects of strcoll(), strxfrm(), isupper(),
16 : * toupper(), etc. are always in the same fixed locale.
17 : *
18 : * LC_MESSAGES is settable at run time and will take effect
19 : * immediately.
20 : *
21 : * The other categories, LC_MONETARY, LC_NUMERIC, and LC_TIME are also
22 : * settable at run-time. However, we don't actually set those locale
23 : * categories permanently. This would have bizarre effects like no
24 : * longer accepting standard floating-point literals in some locales.
25 : * Instead, we only set these locale categories briefly when needed,
26 : * cache the required information obtained from localeconv() or
27 : * strftime(), and then set the locale categories back to "C".
28 : * The cached information is only used by the formatting functions
29 : * (to_char, etc.) and the money type. For the user, this should all be
30 : * transparent.
31 : *
32 : * !!! NOW HEAR THIS !!!
33 : *
34 : * We've been bitten repeatedly by this bug, so let's try to keep it in
35 : * mind in future: on some platforms, the locale functions return pointers
36 : * to static data that will be overwritten by any later locale function.
37 : * Thus, for example, the obvious-looking sequence
38 : * save = setlocale(category, NULL);
39 : * if (!setlocale(category, value))
40 : * fail = true;
41 : * setlocale(category, save);
42 : * DOES NOT WORK RELIABLY: on some platforms the second setlocale() call
43 : * will change the memory save is pointing at. To do this sort of thing
44 : * safely, you *must* pstrdup what setlocale returns the first time.
45 : *
46 : * The POSIX locale standard is available here:
47 : *
48 : * http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html
49 : *----------
50 : */
51 :
52 :
53 : #include "postgres.h"
54 :
55 : #include <time.h>
56 :
57 : #include "access/htup_details.h"
58 : #include "catalog/pg_collation.h"
59 : #include "mb/pg_wchar.h"
60 : #include "miscadmin.h"
61 : #include "utils/builtins.h"
62 : #include "utils/formatting.h"
63 : #include "utils/guc_hooks.h"
64 : #include "utils/hsearch.h"
65 : #include "utils/lsyscache.h"
66 : #include "utils/memutils.h"
67 : #include "utils/pg_locale.h"
68 : #include "utils/syscache.h"
69 :
70 : #ifdef USE_ICU
71 : #include <unicode/ucnv.h>
72 : #include <unicode/ustring.h>
73 : #endif
74 :
75 : #ifdef __GLIBC__
76 : #include <gnu/libc-version.h>
77 : #endif
78 :
79 : #ifdef WIN32
80 : #include <shlwapi.h>
81 : #endif
82 :
83 : /* Error triggered for locale-sensitive subroutines */
84 : #define PGLOCALE_SUPPORT_ERROR(provider) \
85 : elog(ERROR, "unsupported collprovider for %s: %c", __func__, provider)
86 :
87 : /*
88 : * This should be large enough that most strings will fit, but small enough
89 : * that we feel comfortable putting it on the stack
90 : */
91 : #define TEXTBUFLEN 1024
92 :
93 : #define MAX_L10N_DATA 80
94 :
95 :
96 : /* GUC settings */
97 : char *locale_messages;
98 : char *locale_monetary;
99 : char *locale_numeric;
100 : char *locale_time;
101 :
102 : int icu_validation_level = WARNING;
103 :
104 : /*
105 : * lc_time localization cache.
106 : *
107 : * We use only the first 7 or 12 entries of these arrays. The last array
108 : * element is left as NULL for the convenience of outside code that wants
109 : * to sequentially scan these arrays.
110 : */
111 : char *localized_abbrev_days[7 + 1];
112 : char *localized_full_days[7 + 1];
113 : char *localized_abbrev_months[12 + 1];
114 : char *localized_full_months[12 + 1];
115 :
116 : /* is the database's LC_CTYPE the C locale? */
117 : bool database_ctype_is_c = false;
118 :
119 : /* indicates whether locale information cache is valid */
120 : static bool CurrentLocaleConvValid = false;
121 : static bool CurrentLCTimeValid = false;
122 :
123 : /* Cache for collation-related knowledge */
124 :
/*
 * One entry of the collation-knowledge cache, keyed by collation OID.
 *
 * The two "is C" flags are only meaningful while flags_valid is true;
 * the locale field is only meaningful when it is nonzero.
 */
typedef struct
{
	Oid			collid;			/* hash key: pg_collation OID */
	bool		collate_is_c;	/* is collation's LC_COLLATE C? */
	bool		ctype_is_c;		/* is collation's LC_CTYPE C? */
	bool		flags_valid;	/* true if above flags are valid */
	pg_locale_t locale;			/* locale_t struct, or 0 if not valid */
} collation_cache_entry;
133 :
134 : static HTAB *collation_cache = NULL;
135 :
136 :
137 : #if defined(WIN32) && defined(LC_MESSAGES)
138 : static char *IsoLocaleName(const char *);
139 : #endif
140 :
141 : #ifdef USE_ICU
142 : /*
143 : * Converter object for converting between ICU's UChar strings and C strings
144 : * in database encoding. Since the database encoding doesn't change, we only
145 : * need one of these per session.
146 : */
147 : static UConverter *icu_converter = NULL;
148 :
149 : static UCollator *pg_ucol_open(const char *loc_str);
150 : static void init_icu_converter(void);
151 : static size_t uchar_length(UConverter *converter,
152 : const char *str, int32_t len);
153 : static int32_t uchar_convert(UConverter *converter,
154 : UChar *dest, int32_t destlen,
155 : const char *src, int32_t srclen);
156 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
157 : UErrorCode *status);
158 : #endif
159 :
160 : /*
161 : * POSIX doesn't define _l-variants of these functions, but several systems
162 : * have them. We provide our own replacements here.
163 : */
#ifndef HAVE_MBSTOWCS_L
/*
 * Replacement for the non-POSIX mbstowcs_l(): convert the multibyte string
 * "src" to wide characters in "dest" (at most "n" of them), interpreting
 * the input according to locale "loc" rather than the current locale.
 *
 * Returns whatever the underlying conversion routine returns.
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifdef WIN32
	/* Windows spells the locale-aware variant with a leading underscore */
	return _mbstowcs_l(dest, src, n, loc);
#else
	/* Switch this thread to "loc" just for the duration of the conversion */
	locale_t	prev_locale = uselocale(loc);
	size_t		nconverted = mbstowcs(dest, src, n);

	/* Put back whatever locale was in effect before */
	uselocale(prev_locale);

	return nconverted;
#endif
}
#endif
#ifndef HAVE_WCSTOMBS_L
/*
 * Replacement for the non-POSIX wcstombs_l(): convert the wide-character
 * string "src" to a multibyte string in "dest" (at most "n" bytes), using
 * the encoding rules of locale "loc" rather than the current locale.
 *
 * Returns whatever the underlying conversion routine returns.
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifdef WIN32
	/* Windows spells the locale-aware variant with a leading underscore */
	return _wcstombs_l(dest, src, n, loc);
#else
	/* Switch this thread to "loc" just for the duration of the conversion */
	locale_t	prev_locale = uselocale(loc);
	size_t		nbytes = wcstombs(dest, src, n);

	/* Put back whatever locale was in effect before */
	uselocale(prev_locale);

	return nbytes;
#endif
}
#endif
196 :
197 : /*
198 : * pg_perm_setlocale
199 : *
200 : * This wraps the libc function setlocale(), with two additions. First, when
201 : * changing LC_CTYPE, update gettext's encoding for the current message
202 : * domain. GNU gettext automatically tracks LC_CTYPE on most platforms, but
203 : * not on Windows. Second, if the operation is successful, the corresponding
204 : * LC_XXX environment variable is set to match. By setting the environment
205 : * variable, we ensure that any subsequent use of setlocale(..., "") will
206 : * preserve the settings made through this routine. Of course, LC_ALL must
207 : * also be unset to fully ensure that, but that has to be done elsewhere after
208 : * all the individual LC_XXX variables have been set correctly. (Thank you
209 : * Perl for making this kluge necessary.)
210 : */
211 : char *
212 81358 : pg_perm_setlocale(int category, const char *locale)
213 : {
214 : char *result;
215 : const char *envvar;
216 :
217 : #ifndef WIN32
218 81358 : result = setlocale(category, locale);
219 : #else
220 :
221 : /*
222 : * On Windows, setlocale(LC_MESSAGES) does not work, so just assume that
223 : * the given value is good and set it in the environment variables. We
224 : * must ignore attempts to set to "", which means "keep using the old
225 : * environment value".
226 : */
227 : #ifdef LC_MESSAGES
228 : if (category == LC_MESSAGES)
229 : {
230 : result = (char *) locale;
231 : if (locale == NULL || locale[0] == '\0')
232 : return result;
233 : }
234 : else
235 : #endif
236 : result = setlocale(category, locale);
237 : #endif /* WIN32 */
238 :
239 81358 : if (result == NULL)
240 0 : return result; /* fall out immediately on failure */
241 :
242 : /*
243 : * Use the right encoding in translated messages. Under ENABLE_NLS, let
244 : * pg_bind_textdomain_codeset() figure it out. Under !ENABLE_NLS, message
245 : * format strings are ASCII, but database-encoding strings may enter the
246 : * message via %s. This makes the overall message encoding equal to the
247 : * database encoding.
248 : */
249 81358 : if (category == LC_CTYPE)
250 : {
251 : static char save_lc_ctype[LOCALE_NAME_BUFLEN];
252 :
253 : /* copy setlocale() return value before callee invokes it again */
254 26518 : strlcpy(save_lc_ctype, result, sizeof(save_lc_ctype));
255 26518 : result = save_lc_ctype;
256 :
257 : #ifdef ENABLE_NLS
258 26518 : SetMessageEncoding(pg_bind_textdomain_codeset(textdomain(NULL)));
259 : #else
260 : SetMessageEncoding(GetDatabaseEncoding());
261 : #endif
262 : }
263 :
264 81358 : switch (category)
265 : {
266 26518 : case LC_COLLATE:
267 26518 : envvar = "LC_COLLATE";
268 26518 : break;
269 26518 : case LC_CTYPE:
270 26518 : envvar = "LC_CTYPE";
271 26518 : break;
272 : #ifdef LC_MESSAGES
273 19088 : case LC_MESSAGES:
274 19088 : envvar = "LC_MESSAGES";
275 : #ifdef WIN32
276 : result = IsoLocaleName(locale);
277 : if (result == NULL)
278 : result = (char *) locale;
279 : elog(DEBUG3, "IsoLocaleName() executed; locale: \"%s\"", result);
280 : #endif /* WIN32 */
281 19088 : break;
282 : #endif /* LC_MESSAGES */
283 3078 : case LC_MONETARY:
284 3078 : envvar = "LC_MONETARY";
285 3078 : break;
286 3078 : case LC_NUMERIC:
287 3078 : envvar = "LC_NUMERIC";
288 3078 : break;
289 3078 : case LC_TIME:
290 3078 : envvar = "LC_TIME";
291 3078 : break;
292 0 : default:
293 0 : elog(FATAL, "unrecognized LC category: %d", category);
294 : return NULL; /* keep compiler quiet */
295 : }
296 :
297 81358 : if (setenv(envvar, result, 1) != 0)
298 0 : return NULL;
299 :
300 81358 : return result;
301 : }
302 :
303 :
304 : /*
305 : * Is the locale name valid for the locale category?
306 : *
307 : * If successful, and canonname isn't NULL, a palloc'd copy of the locale's
308 : * canonical name is stored there. This is especially useful for figuring out
309 : * what locale name "" means (ie, the server environment value). (Actually,
310 : * it seems that on most implementations that's the only thing it's good for;
311 : * we could wish that setlocale gave back a canonically spelled version of
312 : * the locale name, but typically it doesn't.)
313 : */
314 : bool
315 61490 : check_locale(int category, const char *locale, char **canonname)
316 : {
317 : char *save;
318 : char *res;
319 :
320 61490 : if (canonname)
321 1258 : *canonname = NULL; /* in case of failure */
322 :
323 61490 : save = setlocale(category, NULL);
324 61490 : if (!save)
325 0 : return false; /* won't happen, we hope */
326 :
327 : /* save may be pointing at a modifiable scratch variable, see above. */
328 61490 : save = pstrdup(save);
329 :
330 : /* set the locale with setlocale, to see if it accepts it. */
331 61490 : res = setlocale(category, locale);
332 :
333 : /* save canonical name if requested. */
334 61490 : if (res && canonname)
335 1254 : *canonname = pstrdup(res);
336 :
337 : /* restore old value. */
338 61490 : if (!setlocale(category, save))
339 0 : elog(WARNING, "failed to restore old locale \"%s\"", save);
340 61490 : pfree(save);
341 :
342 61490 : return (res != NULL);
343 : }
344 :
345 :
346 : /*
347 : * GUC check/assign hooks
348 : *
349 : * For most locale categories, the assign hook doesn't actually set the locale
350 : * permanently, just reset flags so that the next use will cache the
351 : * appropriate values. (See explanation at the top of this file.)
352 : *
353 : * Note: we accept value = "" as selecting the postmaster's environment
354 : * value, whatever it was (so long as the environment setting is legal).
355 : * This will have been locked down by an earlier call to pg_perm_setlocale.
356 : */
357 : bool
358 16166 : check_locale_monetary(char **newval, void **extra, GucSource source)
359 : {
360 16166 : return check_locale(LC_MONETARY, *newval, NULL);
361 : }
362 :
363 : void
364 15982 : assign_locale_monetary(const char *newval, void *extra)
365 : {
366 15982 : CurrentLocaleConvValid = false;
367 15982 : }
368 :
369 : bool
370 16172 : check_locale_numeric(char **newval, void **extra, GucSource source)
371 : {
372 16172 : return check_locale(LC_NUMERIC, *newval, NULL);
373 : }
374 :
375 : void
376 15994 : assign_locale_numeric(const char *newval, void *extra)
377 : {
378 15994 : CurrentLocaleConvValid = false;
379 15994 : }
380 :
381 : bool
382 16166 : check_locale_time(char **newval, void **extra, GucSource source)
383 : {
384 16166 : return check_locale(LC_TIME, *newval, NULL);
385 : }
386 :
387 : void
388 15982 : assign_locale_time(const char *newval, void *extra)
389 : {
390 15982 : CurrentLCTimeValid = false;
391 15982 : }
392 :
393 : /*
394 : * We allow LC_MESSAGES to actually be set globally.
395 : *
396 : * Note: we normally disallow value = "" because it wouldn't have consistent
397 : * semantics (it'd effectively just use the previous value). However, this
398 : * is the value passed for PGC_S_DEFAULT, so don't complain in that case,
399 : * not even if the attempted setting fails due to invalid environment value.
400 : * The idea there is just to accept the environment setting *if possible*
401 : * during startup, until we can read the proper value from postgresql.conf.
402 : */
403 : bool
404 16196 : check_locale_messages(char **newval, void **extra, GucSource source)
405 : {
406 16196 : if (**newval == '\0')
407 : {
408 4468 : if (source == PGC_S_DEFAULT)
409 4468 : return true;
410 : else
411 0 : return false;
412 : }
413 :
414 : /*
415 : * LC_MESSAGES category does not exist everywhere, but accept it anyway
416 : *
417 : * On Windows, we can't even check the value, so accept blindly
418 : */
419 : #if defined(LC_MESSAGES) && !defined(WIN32)
420 11728 : return check_locale(LC_MESSAGES, *newval, NULL);
421 : #else
422 : return true;
423 : #endif
424 : }
425 :
/*
 * GUC assign hook for lc_messages: apply the new value to the process via
 * pg_perm_setlocale(), on platforms that have LC_MESSAGES at all.
 */
void
assign_locale_messages(const char *newval, void *extra)
{
#ifdef LC_MESSAGES

	/*
	 * The category is not available everywhere; where it is missing we
	 * silently accept the setting.  A failure to apply it is deliberately
	 * ignored, consistent with the check hook's leniency.
	 */
	(void) pg_perm_setlocale(LC_MESSAGES, newval);
#endif
}
437 :
438 :
/*
 * Release the malloc'd strings hanging off a struct lconv; the struct
 * itself is left alone and its pointer fields are not reset.
 *
 * This must never throw elog(ERROR), so it sticks to plain free().
 * The set of fields must stay in sync with struct_lconv_is_valid().
 */
static void
free_struct_lconv(struct lconv *s)
{
	/* every string member we own, in the order they are released */
	char	   *owned[] = {
		s->decimal_point,
		s->thousands_sep,
		s->grouping,
		s->int_curr_symbol,
		s->currency_symbol,
		s->mon_decimal_point,
		s->mon_thousands_sep,
		s->mon_grouping,
		s->positive_sign,
		s->negative_sign
	};
	size_t		nowned = sizeof(owned) / sizeof(owned[0]);
	size_t		idx;

	for (idx = 0; idx < nowned; idx++)
		free(owned[idx]);
}
457 :
/*
 * Report whether every string member of a struct lconv that we care about
 * is non-NULL.  The members tested here must be the same ones that
 * free_struct_lconv() releases.
 */
static bool
struct_lconv_is_valid(struct lconv *s)
{
	/* numeric-formatting members */
	bool		numeric_ok = (s->decimal_point != NULL &&
							  s->thousands_sep != NULL &&
							  s->grouping != NULL);

	/* monetary-formatting members */
	bool		monetary_ok = (s->int_curr_symbol != NULL &&
							   s->currency_symbol != NULL &&
							   s->mon_decimal_point != NULL &&
							   s->mon_thousands_sep != NULL &&
							   s->mon_grouping != NULL &&
							   s->positive_sign != NULL &&
							   s->negative_sign != NULL);

	return numeric_ok && monetary_ok;
}
487 :
488 :
489 : /*
490 : * Convert the strdup'd string at *str from the specified encoding to the
491 : * database encoding.
492 : */
493 : static void
494 448 : db_encoding_convert(int encoding, char **str)
495 : {
496 : char *pstr;
497 : char *mstr;
498 :
499 : /* convert the string to the database encoding */
500 448 : pstr = pg_any_to_server(*str, strlen(*str), encoding);
501 448 : if (pstr == *str)
502 448 : return; /* no conversion happened */
503 :
504 : /* need it malloc'd not palloc'd */
505 0 : mstr = strdup(pstr);
506 0 : if (mstr == NULL)
507 0 : ereport(ERROR,
508 : (errcode(ERRCODE_OUT_OF_MEMORY),
509 : errmsg("out of memory")));
510 :
511 : /* replace old string */
512 0 : free(*str);
513 0 : *str = mstr;
514 :
515 0 : pfree(pstr);
516 : }
517 :
518 :
/*
 * Return the POSIX lconv struct (contains number/money formatting
 * information) with locale information for all categories.
 *
 * The numeric members (decimal_point, thousands_sep, grouping) are taken
 * from the locale named by locale_numeric; the monetary string members and
 * all scalar members are taken from the locale named by locale_monetary.
 * String members are converted to the database encoding.
 *
 * The result points at static storage that is reused (and its strings
 * freed) the next time the cache is rebuilt, so callers must not free it or
 * hold on to it across a change of the relevant settings.  The cached
 * result is returned as long as CurrentLocaleConvValid is true.
 */
struct lconv *
PGLC_localeconv(void)
{
	static struct lconv CurrentLocaleConv;
	static bool CurrentLocaleConvAllocated = false;
	struct lconv *extlconv;
	struct lconv worklconv;
	char	   *save_lc_monetary;
	char	   *save_lc_numeric;
#ifdef WIN32
	char	   *save_lc_ctype;
#endif

	/* Did we do it already? */
	if (CurrentLocaleConvValid)
		return &CurrentLocaleConv;

	/* Free any already-allocated storage */
	if (CurrentLocaleConvAllocated)
	{
		free_struct_lconv(&CurrentLocaleConv);
		CurrentLocaleConvAllocated = false;
	}

	/*
	 * This is tricky because we really don't want to risk throwing error
	 * while the locale is set to other than our usual settings.  Therefore,
	 * the process is: collect the usual settings, set locale to special
	 * setting, copy relevant data into worklconv using strdup(), restore
	 * normal settings, convert data to desired encoding, and finally stash
	 * the collected data in CurrentLocaleConv.  This makes it safe if we
	 * throw an error during encoding conversion or run out of memory anywhere
	 * in the process.  All data pointed to by struct lconv members is
	 * allocated with strdup, to avoid premature elog(ERROR) and to allow
	 * using a single cleanup routine.
	 */
	memset(&worklconv, 0, sizeof(worklconv));

	/*
	 * Save prevailing values of monetary and numeric locales.  The results
	 * of setlocale(NULL) are copied immediately, since later setlocale()
	 * calls may overwrite the storage they point at.
	 */
	save_lc_monetary = setlocale(LC_MONETARY, NULL);
	if (!save_lc_monetary)
		elog(ERROR, "setlocale(NULL) failed");
	save_lc_monetary = pstrdup(save_lc_monetary);

	save_lc_numeric = setlocale(LC_NUMERIC, NULL);
	if (!save_lc_numeric)
		elog(ERROR, "setlocale(NULL) failed");
	save_lc_numeric = pstrdup(save_lc_numeric);

#ifdef WIN32

	/*
	 * The POSIX standard explicitly says that it is undefined what happens if
	 * LC_MONETARY or LC_NUMERIC imply an encoding (codeset) different from
	 * that implied by LC_CTYPE.  In practice, all Unix-ish platforms seem to
	 * believe that localeconv() should return strings that are encoded in the
	 * codeset implied by the LC_MONETARY or LC_NUMERIC locale name.  Hence,
	 * once we have successfully collected the localeconv() results, we will
	 * convert them from that codeset to the desired server encoding.
	 *
	 * Windows, of course, resolutely does things its own way; on that
	 * platform LC_CTYPE has to match LC_MONETARY/LC_NUMERIC to get sane
	 * results.  Hence, we must temporarily set that category as well.
	 */

	/* Save prevailing value of ctype locale */
	save_lc_ctype = setlocale(LC_CTYPE, NULL);
	if (!save_lc_ctype)
		elog(ERROR, "setlocale(NULL) failed");
	save_lc_ctype = pstrdup(save_lc_ctype);

	/* Here begins the critical section where we must not throw error */

	/* use numeric to set the ctype */
	setlocale(LC_CTYPE, locale_numeric);
#endif

	/*
	 * From here until the locale settings are restored below, nothing may
	 * throw an error (on every platform, not only Windows); that is why only
	 * setlocale(), localeconv() and strdup() are used, and why strdup()
	 * failures are checked later rather than here.
	 */

	/* Get formatting information for numeric */
	setlocale(LC_NUMERIC, locale_numeric);
	extlconv = localeconv();

	/* Must copy data now in case setlocale() overwrites it */
	worklconv.decimal_point = strdup(extlconv->decimal_point);
	worklconv.thousands_sep = strdup(extlconv->thousands_sep);
	worklconv.grouping = strdup(extlconv->grouping);

#ifdef WIN32
	/* use monetary to set the ctype */
	setlocale(LC_CTYPE, locale_monetary);
#endif

	/* Get formatting information for monetary */
	setlocale(LC_MONETARY, locale_monetary);
	extlconv = localeconv();

	/* Must copy data now in case setlocale() overwrites it */
	worklconv.int_curr_symbol = strdup(extlconv->int_curr_symbol);
	worklconv.currency_symbol = strdup(extlconv->currency_symbol);
	worklconv.mon_decimal_point = strdup(extlconv->mon_decimal_point);
	worklconv.mon_thousands_sep = strdup(extlconv->mon_thousands_sep);
	worklconv.mon_grouping = strdup(extlconv->mon_grouping);
	worklconv.positive_sign = strdup(extlconv->positive_sign);
	worklconv.negative_sign = strdup(extlconv->negative_sign);
	/* Copy scalar fields as well (these come from the monetary call) */
	worklconv.int_frac_digits = extlconv->int_frac_digits;
	worklconv.frac_digits = extlconv->frac_digits;
	worklconv.p_cs_precedes = extlconv->p_cs_precedes;
	worklconv.p_sep_by_space = extlconv->p_sep_by_space;
	worklconv.n_cs_precedes = extlconv->n_cs_precedes;
	worklconv.n_sep_by_space = extlconv->n_sep_by_space;
	worklconv.p_sign_posn = extlconv->p_sign_posn;
	worklconv.n_sign_posn = extlconv->n_sign_posn;

	/*
	 * Restore the prevailing locale settings; failure to do so is fatal.
	 * Possibly we could limp along with nondefault LC_MONETARY or LC_NUMERIC,
	 * but proceeding with the wrong value of LC_CTYPE would certainly be bad
	 * news; and considering that the prevailing LC_MONETARY and LC_NUMERIC
	 * are almost certainly "C", there's really no reason that restoring those
	 * should fail.
	 */
#ifdef WIN32
	if (!setlocale(LC_CTYPE, save_lc_ctype))
		elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
#endif
	if (!setlocale(LC_MONETARY, save_lc_monetary))
		elog(FATAL, "failed to restore LC_MONETARY to \"%s\"", save_lc_monetary);
	if (!setlocale(LC_NUMERIC, save_lc_numeric))
		elog(FATAL, "failed to restore LC_NUMERIC to \"%s\"", save_lc_numeric);

	/*
	 * At this point we've done our best to clean up, and can call functions
	 * that might possibly throw errors with a clean conscience.  But let's
	 * make sure we don't leak any already-strdup'd fields in worklconv.
	 */
	PG_TRY();
	{
		int			encoding;

		/* Release the pstrdup'd locale names */
		pfree(save_lc_monetary);
		pfree(save_lc_numeric);
#ifdef WIN32
		pfree(save_lc_ctype);
#endif

		/* If any of the preceding strdup calls failed, complain now. */
		if (!struct_lconv_is_valid(&worklconv))
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));

		/*
		 * Now we must perform encoding conversion from whatever's associated
		 * with the locales into the database encoding.  If we can't identify
		 * the encoding implied by LC_NUMERIC or LC_MONETARY (ie we get -1),
		 * use PG_SQL_ASCII, which will result in just validating that the
		 * strings are OK in the database encoding.
		 */
		encoding = pg_get_encoding_from_locale(locale_numeric, true);
		if (encoding < 0)
			encoding = PG_SQL_ASCII;

		db_encoding_convert(encoding, &worklconv.decimal_point);
		db_encoding_convert(encoding, &worklconv.thousands_sep);
		/* grouping is not text and does not require conversion */

		encoding = pg_get_encoding_from_locale(locale_monetary, true);
		if (encoding < 0)
			encoding = PG_SQL_ASCII;

		db_encoding_convert(encoding, &worklconv.int_curr_symbol);
		db_encoding_convert(encoding, &worklconv.currency_symbol);
		db_encoding_convert(encoding, &worklconv.mon_decimal_point);
		db_encoding_convert(encoding, &worklconv.mon_thousands_sep);
		/* mon_grouping is not text and does not require conversion */
		db_encoding_convert(encoding, &worklconv.positive_sign);
		db_encoding_convert(encoding, &worklconv.negative_sign);
	}
	PG_CATCH();
	{
		/* Don't leak the malloc'd strings if anything above threw */
		free_struct_lconv(&worklconv);
		PG_RE_THROW();
	}
	PG_END_TRY();

	/*
	 * Everything is good, so save the results.  The struct assignment hands
	 * ownership of the malloc'd strings over to the static copy.
	 */
	CurrentLocaleConv = worklconv;
	CurrentLocaleConvAllocated = true;
	CurrentLocaleConvValid = true;
	return &CurrentLocaleConv;
}
717 :
718 : #ifdef WIN32
719 : /*
720 : * On Windows, strftime() returns its output in encoding CP_ACP (the default
721 : * operating system codepage for the computer), which is likely different
722 : * from SERVER_ENCODING. This is especially important in Japanese versions
723 : * of Windows which will use SJIS encoding, which we don't support as a
724 : * server encoding.
725 : *
726 : * So, instead of using strftime(), use wcsftime() to return the value in
727 : * wide characters (internally UTF16) and then convert to UTF8, which we
728 : * know how to handle directly.
729 : *
730 : * Note that this only affects the calls to strftime() in this file, which are
731 : * used to get the locale-aware strings. Other parts of the backend use
732 : * pg_strftime(), which isn't locale-aware and does not need to be replaced.
733 : */
734 : static size_t
735 : strftime_win32(char *dst, size_t dstlen,
736 : const char *format, const struct tm *tm)
737 : {
738 : size_t len;
739 : wchar_t wformat[8]; /* formats used below need 3 chars */
740 : wchar_t wbuf[MAX_L10N_DATA];
741 :
742 : /*
743 : * Get a wchar_t version of the format string. We only actually use
744 : * plain-ASCII formats in this file, so we can say that they're UTF8.
745 : */
746 : len = MultiByteToWideChar(CP_UTF8, 0, format, -1,
747 : wformat, lengthof(wformat));
748 : if (len == 0)
749 : elog(ERROR, "could not convert format string from UTF-8: error code %lu",
750 : GetLastError());
751 :
752 : len = wcsftime(wbuf, MAX_L10N_DATA, wformat, tm);
753 : if (len == 0)
754 : {
755 : /*
756 : * wcsftime failed, possibly because the result would not fit in
757 : * MAX_L10N_DATA. Return 0 with the contents of dst unspecified.
758 : */
759 : return 0;
760 : }
761 :
762 : len = WideCharToMultiByte(CP_UTF8, 0, wbuf, len, dst, dstlen - 1,
763 : NULL, NULL);
764 : if (len == 0)
765 : elog(ERROR, "could not convert string to UTF-8: error code %lu",
766 : GetLastError());
767 :
768 : dst[len] = '\0';
769 :
770 : return len;
771 : }
772 :
773 : /* redefine strftime() */
774 : #define strftime(a,b,c,d) strftime_win32(a,b,c,d)
775 : #endif /* WIN32 */
776 :
777 : /*
778 : * Subroutine for cache_locale_time().
779 : * Convert the given string from encoding "encoding" to the database
780 : * encoding, and store the result at *dst, replacing any previous value.
781 : */
782 : static void
783 1748 : cache_single_string(char **dst, const char *src, int encoding)
784 : {
785 : char *ptr;
786 : char *olddst;
787 :
788 : /* Convert the string to the database encoding, or validate it's OK */
789 1748 : ptr = pg_any_to_server(src, strlen(src), encoding);
790 :
791 : /* Store the string in long-lived storage, replacing any previous value */
792 1748 : olddst = *dst;
793 1748 : *dst = MemoryContextStrdup(TopMemoryContext, ptr);
794 1748 : if (olddst)
795 0 : pfree(olddst);
796 :
797 : /* Might as well clean up any palloc'd conversion result, too */
798 1748 : if (ptr != src)
799 0 : pfree(ptr);
800 1748 : }
801 :
/*
 * Update the lc_time localization cache variables if needed.
 *
 * When CurrentLCTimeValid is false, this refills localized_abbrev_days[],
 * localized_full_days[], localized_abbrev_months[] and
 * localized_full_months[] with the day and month names that strftime()
 * produces under the locale named by locale_time, converted to the database
 * encoding, and then marks the cache valid.  Otherwise it returns
 * immediately.
 */
void
cache_locale_time(void)
{
	/* one MAX_L10N_DATA-sized slot per abbreviated/full day and month name */
	char		buf[(2 * 7 + 2 * 12) * MAX_L10N_DATA];
	char	   *bufptr;
	time_t		timenow;
	struct tm  *timeinfo;
	bool		strftimefail = false;
	int			encoding;
	int			i;
	char	   *save_lc_time;
#ifdef WIN32
	char	   *save_lc_ctype;
#endif

	/* did we do this already? */
	if (CurrentLCTimeValid)
		return;

	elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);

	/*
	 * As in PGLC_localeconv(), it's critical that we not throw error while
	 * libc's locale settings have nondefault values.  Hence, we just call
	 * strftime() within the critical section, and then convert and save its
	 * results afterwards.
	 */

	/*
	 * Save prevailing value of time locale.  It is copied right away because
	 * later setlocale() calls may overwrite what setlocale(NULL) returned.
	 */
	save_lc_time = setlocale(LC_TIME, NULL);
	if (!save_lc_time)
		elog(ERROR, "setlocale(NULL) failed");
	save_lc_time = pstrdup(save_lc_time);

#ifdef WIN32

	/*
	 * On Windows, it appears that wcsftime() internally uses LC_CTYPE, so we
	 * must set it here.  This code looks the same as what PGLC_localeconv()
	 * does, but the underlying reason is different: this does NOT determine
	 * the encoding we'll get back from strftime_win32().
	 */

	/* Save prevailing value of ctype locale */
	save_lc_ctype = setlocale(LC_CTYPE, NULL);
	if (!save_lc_ctype)
		elog(ERROR, "setlocale(NULL) failed");
	save_lc_ctype = pstrdup(save_lc_ctype);

	/* use lc_time to set the ctype */
	setlocale(LC_CTYPE, locale_time);
#endif

	/* Critical section starts here: no errors until the locale is restored */
	setlocale(LC_TIME, locale_time);

	/* We use times close to current time as data for strftime(). */
	timenow = time(NULL);
	timeinfo = localtime(&timenow);

	/* Store the strftime results in MAX_L10N_DATA-sized portions of buf[] */
	bufptr = buf;

	/*
	 * MAX_L10N_DATA is sufficient buffer space for every known locale, and
	 * POSIX defines no strftime() errors.  (Buffer space exhaustion is not an
	 * error.)  An implementation might report errors (e.g. ENOMEM) by
	 * returning 0 (or, less plausibly, a negative value) and setting errno.
	 * Report errno just in case the implementation did that, but clear it in
	 * advance of the calls so we don't emit a stale, unrelated errno.
	 */
	errno = 0;

	/*
	 * localized days: slots are laid out as abbreviated name, then full name,
	 * for each weekday in turn.  Failures are only noted here and reported
	 * after the locale has been restored.
	 */
	for (i = 0; i < 7; i++)
	{
		timeinfo->tm_wday = i;
		if (strftime(bufptr, MAX_L10N_DATA, "%a", timeinfo) <= 0)
			strftimefail = true;
		bufptr += MAX_L10N_DATA;
		if (strftime(bufptr, MAX_L10N_DATA, "%A", timeinfo) <= 0)
			strftimefail = true;
		bufptr += MAX_L10N_DATA;
	}

	/* localized months: same abbreviated/full layout, following the days */
	for (i = 0; i < 12; i++)
	{
		timeinfo->tm_mon = i;
		timeinfo->tm_mday = 1;	/* make sure we don't have invalid date */
		if (strftime(bufptr, MAX_L10N_DATA, "%b", timeinfo) <= 0)
			strftimefail = true;
		bufptr += MAX_L10N_DATA;
		if (strftime(bufptr, MAX_L10N_DATA, "%B", timeinfo) <= 0)
			strftimefail = true;
		bufptr += MAX_L10N_DATA;
	}

	/*
	 * Restore the prevailing locale settings; as in PGLC_localeconv(),
	 * failure to do so is fatal.
	 */
#ifdef WIN32
	if (!setlocale(LC_CTYPE, save_lc_ctype))
		elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
#endif
	if (!setlocale(LC_TIME, save_lc_time))
		elog(FATAL, "failed to restore LC_TIME to \"%s\"", save_lc_time);

	/*
	 * At this point we've done our best to clean up, and can throw errors, or
	 * call functions that might throw errors, with a clean conscience.
	 */
	if (strftimefail)
		elog(ERROR, "strftime() failed: %m");

	/* Release the pstrdup'd locale names */
	pfree(save_lc_time);
#ifdef WIN32
	pfree(save_lc_ctype);
#endif

#ifndef WIN32

	/*
	 * As in PGLC_localeconv(), we must convert strftime()'s output from the
	 * encoding implied by LC_TIME to the database encoding.  If we can't
	 * identify the LC_TIME encoding, just perform encoding validation.
	 */
	encoding = pg_get_encoding_from_locale(locale_time, true);
	if (encoding < 0)
		encoding = PG_SQL_ASCII;

#else

	/*
	 * On Windows, strftime_win32() always returns UTF8 data, so convert from
	 * that if necessary.
	 */
	encoding = PG_UTF8;

#endif							/* WIN32 */

	/* Second pass: walk buf[] in the same order it was filled above */
	bufptr = buf;

	/* localized days */
	for (i = 0; i < 7; i++)
	{
		cache_single_string(&localized_abbrev_days[i], bufptr, encoding);
		bufptr += MAX_L10N_DATA;
		cache_single_string(&localized_full_days[i], bufptr, encoding);
		bufptr += MAX_L10N_DATA;
	}
	/* keep the arrays NULL-terminated for code that scans them */
	localized_abbrev_days[7] = NULL;
	localized_full_days[7] = NULL;

	/* localized months */
	for (i = 0; i < 12; i++)
	{
		cache_single_string(&localized_abbrev_months[i], bufptr, encoding);
		bufptr += MAX_L10N_DATA;
		cache_single_string(&localized_full_months[i], bufptr, encoding);
		bufptr += MAX_L10N_DATA;
	}
	/* keep the arrays NULL-terminated for code that scans them */
	localized_abbrev_months[12] = NULL;
	localized_full_months[12] = NULL;

	/* Only now, with every entry stored, is the cache usable */
	CurrentLCTimeValid = true;
}
973 :
974 :
975 : #if defined(WIN32) && defined(LC_MESSAGES)
976 : /*
977 : * Convert a Windows setlocale() argument to a Unix-style one.
978 : *
979 : * Regardless of platform, we install message catalogs under a Unix-style
980 : * LL[_CC][.ENCODING][@VARIANT] naming convention. Only LC_MESSAGES settings
981 : * following that style will elicit localized interface strings.
982 : *
983 : * Before Visual Studio 2012 (msvcr110.dll), Windows setlocale() accepted "C"
984 : * (but not "c") and strings of the form <Language>[_<Country>][.<CodePage>],
985 : * case-insensitive. setlocale() returns the fully-qualified form; for
986 : * example, setlocale("thaI") returns "Thai_Thailand.874". Internally,
987 : * setlocale() and _create_locale() select a "locale identifier"[1] and store
988 : * it in an undocumented _locale_t field. From that LCID, we can retrieve the
989 : * ISO 639 language and the ISO 3166 country. Character encoding does not
990 : * matter, because the server and client encodings govern that.
991 : *
992 : * Windows Vista introduced the "locale name" concept[2], closely following
993 : * RFC 4646. Locale identifiers are now deprecated. Starting with Visual
994 : * Studio 2012, setlocale() accepts locale names in addition to the strings it
995 : * accepted historically. It does not standardize them; setlocale("Th-tH")
996 : * returns "Th-tH". setlocale(category, "") still returns a traditional
997 : * string. Furthermore, msvcr110.dll changed the undocumented _locale_t
998 : * content to carry locale names instead of locale identifiers.
999 : *
1000 : * Visual Studio 2015 should still be able to do the same as Visual Studio
1001 : * 2012, but the declaration of locale_name is missing in _locale_t, causing
1002 : * this code compilation to fail, hence this falls back instead on to
1003 : * enumerating all system locales by using EnumSystemLocalesEx to find the
1004 : * required locale name. If the input argument is in Unix-style then we can
1005 : * get ISO Locale name directly by using GetLocaleInfoEx() with LCType as
1006 : * LOCALE_SNAME.
1007 : *
1008 : * MinGW headers declare _create_locale(), but msvcrt.dll lacks that symbol in
1009 : * releases before Windows 8. IsoLocaleName() always fails in a MinGW-built
1010 : * postgres.exe, so only Unix-style values of the lc_messages GUC can elicit
1011 : * localized messages. In particular, every lc_messages setting that initdb
1012 : * can select automatically will yield only C-locale messages. XXX This could
1013 : * be fixed by running the fully-qualified locale name through a lookup table.
1014 : *
1015 : * This function returns a pointer to a static buffer bearing the converted
1016 : * name or NULL if conversion fails.
1017 : *
1018 : * [1] https://docs.microsoft.com/en-us/windows/win32/intl/locale-identifiers
1019 : * [2] https://docs.microsoft.com/en-us/windows/win32/intl/locale-names
1020 : */
1021 :
1022 : #if defined(_MSC_VER)
1023 :
1024 : /*
1025 : * Callback function for EnumSystemLocalesEx() in get_iso_localename().
1026 : *
1027 : * This function enumerates all system locales, searching for one that matches
1028 : * an input with the format: <Language>[_<Country>], e.g.
1029 : * English[_United States]
1030 : *
1031 : * The input is a three wchar_t array as an LPARAM. The first element is the
1032 : * locale_name we want to match, the second element is an allocated buffer
1033 : * where the Unix-style locale is copied if a match is found, and the third
1034 : * element is the search status, 1 if a match was found, 0 otherwise.
1035 : */
1036 : static BOOL CALLBACK
1037 : search_locale_enum(LPWSTR pStr, DWORD dwFlags, LPARAM lparam)
1038 : {
1039 : wchar_t test_locale[LOCALE_NAME_MAX_LENGTH];
1040 : wchar_t **argv;
1041 :
1042 : (void) (dwFlags);
1043 :
1044 : argv = (wchar_t **) lparam;
1045 : *argv[2] = (wchar_t) 0;
1046 :
1047 : memset(test_locale, 0, sizeof(test_locale));
1048 :
1049 : /* Get the name of the <Language> in English */
1050 : if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHLANGUAGENAME,
1051 : test_locale, LOCALE_NAME_MAX_LENGTH))
1052 : {
1053 : /*
1054 : * If the enumerated locale does not have a hyphen ("en") OR the
1055 : * locale_name input does not have an underscore ("English"), we only
1056 : * need to compare the <Language> tags.
1057 : */
1058 : if (wcsrchr(pStr, '-') == NULL || wcsrchr(argv[0], '_') == NULL)
1059 : {
1060 : if (_wcsicmp(argv[0], test_locale) == 0)
1061 : {
1062 : wcscpy(argv[1], pStr);
1063 : *argv[2] = (wchar_t) 1;
1064 : return FALSE;
1065 : }
1066 : }
1067 :
1068 : /*
1069 : * We have to compare a full <Language>_<Country> tag, so we append
1070 : * the underscore and name of the country/region in English, e.g.
1071 : * "English_United States".
1072 : */
1073 : else
1074 : {
1075 : size_t len;
1076 :
1077 : wcscat(test_locale, L"_");
1078 : len = wcslen(test_locale);
1079 : if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHCOUNTRYNAME,
1080 : test_locale + len,
1081 : LOCALE_NAME_MAX_LENGTH - len))
1082 : {
1083 : if (_wcsicmp(argv[0], test_locale) == 0)
1084 : {
1085 : wcscpy(argv[1], pStr);
1086 : *argv[2] = (wchar_t) 1;
1087 : return FALSE;
1088 : }
1089 : }
1090 : }
1091 : }
1092 :
1093 : return TRUE;
1094 : }
1095 :
1096 : /*
1097 : * This function converts a Windows locale name to an ISO formatted version
1098 : * for Visual Studio 2015 or greater.
1099 : *
1100 : * Returns NULL, if no valid conversion was found.
1101 : */
1102 : static char *
1103 : get_iso_localename(const char *winlocname)
1104 : {
1105 : wchar_t wc_locale_name[LOCALE_NAME_MAX_LENGTH];
1106 : wchar_t buffer[LOCALE_NAME_MAX_LENGTH];
1107 : static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1108 : char *period;
1109 : int len;
1110 : int ret_val;
1111 :
1112 : /*
1113 : * Valid locales have the following syntax:
1114 : * <Language>[_<Country>[.<CodePage>]]
1115 : *
1116 : * GetLocaleInfoEx can only take locale name without code-page and for the
1117 : * purpose of this API the code-page doesn't matter.
1118 : */
1119 : period = strchr(winlocname, '.');
1120 : if (period != NULL)
1121 : len = period - winlocname;
1122 : else
1123 : len = pg_mbstrlen(winlocname);
1124 :
1125 : memset(wc_locale_name, 0, sizeof(wc_locale_name));
1126 : memset(buffer, 0, sizeof(buffer));
1127 : MultiByteToWideChar(CP_ACP, 0, winlocname, len, wc_locale_name,
1128 : LOCALE_NAME_MAX_LENGTH);
1129 :
1130 : /*
1131 : * If the lc_messages is already a Unix-style string, we have a direct
1132 : * match with LOCALE_SNAME, e.g. en-US, en_US.
1133 : */
1134 : ret_val = GetLocaleInfoEx(wc_locale_name, LOCALE_SNAME, (LPWSTR) &buffer,
1135 : LOCALE_NAME_MAX_LENGTH);
1136 : if (!ret_val)
1137 : {
1138 : /*
1139 : * Search for a locale in the system that matches language and country
1140 : * name.
1141 : */
1142 : wchar_t *argv[3];
1143 :
1144 : argv[0] = wc_locale_name;
1145 : argv[1] = buffer;
1146 : argv[2] = (wchar_t *) &ret_val;
1147 : EnumSystemLocalesEx(search_locale_enum, LOCALE_WINDOWS, (LPARAM) argv,
1148 : NULL);
1149 : }
1150 :
1151 : if (ret_val)
1152 : {
1153 : size_t rc;
1154 : char *hyphen;
1155 :
1156 : /* Locale names use only ASCII, any conversion locale suffices. */
1157 : rc = wchar2char(iso_lc_messages, buffer, sizeof(iso_lc_messages), NULL);
1158 : if (rc == -1 || rc == sizeof(iso_lc_messages))
1159 : return NULL;
1160 :
1161 : /*
1162 : * Since the message catalogs sit on a case-insensitive filesystem, we
1163 : * need not standardize letter case here. So long as we do not ship
1164 : * message catalogs for which it would matter, we also need not
1165 : * translate the script/variant portion, e.g. uz-Cyrl-UZ to
1166 : * uz_UZ@cyrillic. Simply replace the hyphen with an underscore.
1167 : */
1168 : hyphen = strchr(iso_lc_messages, '-');
1169 : if (hyphen)
1170 : *hyphen = '_';
1171 : return iso_lc_messages;
1172 : }
1173 :
1174 : return NULL;
1175 : }
1176 :
1177 : static char *
1178 : IsoLocaleName(const char *winlocname)
1179 : {
1180 : static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1181 :
1182 : if (pg_strcasecmp("c", winlocname) == 0 ||
1183 : pg_strcasecmp("posix", winlocname) == 0)
1184 : {
1185 : strcpy(iso_lc_messages, "C");
1186 : return iso_lc_messages;
1187 : }
1188 : else
1189 : return get_iso_localename(winlocname);
1190 : }
1191 :
1192 : #else /* !defined(_MSC_VER) */
1193 :
/*
 * Non-MSVC (MinGW) stub: conversion is not supported, so always report
 * failure by returning NULL regardless of the input.
 */
static char *
IsoLocaleName(const char *winlocname)
{
	(void) winlocname;			/* intentionally unused */

	return NULL;				/* Not supported on MinGW */
}
1199 :
1200 : #endif /* defined(_MSC_VER) */
1201 :
1202 : #endif /* WIN32 && LC_MESSAGES */
1203 :
1204 :
1205 : /*
1206 : * Cache mechanism for collation information.
1207 : *
1208 : * We cache two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
1209 : * (or POSIX), so we can optimize a few code paths in various places.
1210 : * For the built-in C and POSIX collations, we can know that without even
1211 : * doing a cache lookup, but we want to support aliases for C/POSIX too.
1212 : * For the "default" collation, there are separate static cache variables,
1213 : * since consulting the pg_collation catalog doesn't tell us what we need.
1214 : *
1215 : * Also, if a pg_locale_t has been requested for a collation, we cache that
1216 : * for the life of a backend.
1217 : *
1218 : * Note that some code relies on the flags not reporting false negatives
1219 : * (that is, saying it's not C when it is). For example, char2wchar()
1220 : * could fail if the locale is C, so str_tolower() shouldn't call it
1221 : * in that case.
1222 : *
1223 : * Note that we currently lack any way to flush the cache. Since we don't
1224 : * support ALTER COLLATION, this is OK. The worst case is that someone
1225 : * drops a collation, and a useless cache entry hangs around in existing
1226 : * backends.
1227 : */
1228 :
1229 : static collation_cache_entry *
1230 41752 : lookup_collation_cache(Oid collation, bool set_flags)
1231 : {
1232 : collation_cache_entry *cache_entry;
1233 : bool found;
1234 :
1235 : Assert(OidIsValid(collation));
1236 : Assert(collation != DEFAULT_COLLATION_OID);
1237 :
1238 41752 : if (collation_cache == NULL)
1239 : {
1240 : /* First time through, initialize the hash table */
1241 : HASHCTL ctl;
1242 :
1243 46 : ctl.keysize = sizeof(Oid);
1244 46 : ctl.entrysize = sizeof(collation_cache_entry);
1245 46 : collation_cache = hash_create("Collation cache", 100, &ctl,
1246 : HASH_ELEM | HASH_BLOBS);
1247 : }
1248 :
1249 41752 : cache_entry = hash_search(collation_cache, &collation, HASH_ENTER, &found);
1250 41752 : if (!found)
1251 : {
1252 : /*
1253 : * Make sure cache entry is marked invalid, in case we fail before
1254 : * setting things.
1255 : */
1256 262 : cache_entry->flags_valid = false;
1257 262 : cache_entry->locale = 0;
1258 : }
1259 :
1260 41752 : if (set_flags && !cache_entry->flags_valid)
1261 : {
1262 : /* Attempt to set the flags */
1263 : HeapTuple tp;
1264 : Form_pg_collation collform;
1265 :
1266 262 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation));
1267 262 : if (!HeapTupleIsValid(tp))
1268 0 : elog(ERROR, "cache lookup failed for collation %u", collation);
1269 262 : collform = (Form_pg_collation) GETSTRUCT(tp);
1270 :
1271 262 : if (collform->collprovider == COLLPROVIDER_BUILTIN)
1272 : {
1273 : Datum datum;
1274 : const char *colllocale;
1275 :
1276 32 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
1277 32 : colllocale = TextDatumGetCString(datum);
1278 :
1279 32 : cache_entry->collate_is_c = true;
1280 32 : cache_entry->ctype_is_c = (strcmp(colllocale, "C") == 0);
1281 : }
1282 230 : else if (collform->collprovider == COLLPROVIDER_LIBC)
1283 : {
1284 : Datum datum;
1285 : const char *collcollate;
1286 : const char *collctype;
1287 :
1288 46 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
1289 46 : collcollate = TextDatumGetCString(datum);
1290 46 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
1291 46 : collctype = TextDatumGetCString(datum);
1292 :
1293 66 : cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) ||
1294 20 : (strcmp(collcollate, "POSIX") == 0));
1295 66 : cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) ||
1296 20 : (strcmp(collctype, "POSIX") == 0));
1297 : }
1298 : else
1299 : {
1300 184 : cache_entry->collate_is_c = false;
1301 184 : cache_entry->ctype_is_c = false;
1302 : }
1303 :
1304 262 : cache_entry->flags_valid = true;
1305 :
1306 262 : ReleaseSysCache(tp);
1307 : }
1308 :
1309 41752 : return cache_entry;
1310 : }
1311 :
1312 :
1313 : /*
1314 : * Detect whether collation's LC_COLLATE property is C
1315 : */
1316 : bool
1317 14101168 : lc_collate_is_c(Oid collation)
1318 : {
1319 : /*
1320 : * If we're asked about "collation 0", return false, so that the code will
1321 : * go into the non-C path and report that the collation is bogus.
1322 : */
1323 14101168 : if (!OidIsValid(collation))
1324 0 : return false;
1325 :
1326 : /*
1327 : * If we're asked about the default collation, we have to inquire of the C
1328 : * library. Cache the result so we only have to compute it once.
1329 : */
1330 14101168 : if (collation == DEFAULT_COLLATION_OID)
1331 : {
1332 : static int result = -1;
1333 : const char *localeptr;
1334 :
1335 11942154 : if (result >= 0)
1336 11937332 : return (bool) result;
1337 :
1338 4822 : if (default_locale.provider == COLLPROVIDER_BUILTIN)
1339 : {
1340 316 : result = true;
1341 316 : return (bool) result;
1342 : }
1343 4506 : else if (default_locale.provider == COLLPROVIDER_ICU)
1344 : {
1345 8 : result = false;
1346 8 : return (bool) result;
1347 : }
1348 4498 : else if (default_locale.provider == COLLPROVIDER_LIBC)
1349 : {
1350 4498 : localeptr = setlocale(LC_CTYPE, NULL);
1351 4498 : if (!localeptr)
1352 0 : elog(ERROR, "invalid LC_CTYPE setting");
1353 : }
1354 : else
1355 0 : elog(ERROR, "unexpected collation provider '%c'",
1356 : default_locale.provider);
1357 :
1358 4498 : if (strcmp(localeptr, "C") == 0)
1359 112 : result = true;
1360 4386 : else if (strcmp(localeptr, "POSIX") == 0)
1361 0 : result = true;
1362 : else
1363 4386 : result = false;
1364 4498 : return (bool) result;
1365 : }
1366 :
1367 : /*
1368 : * If we're asked about the built-in C/POSIX collations, we know that.
1369 : */
1370 2159014 : if (collation == C_COLLATION_OID ||
1371 : collation == POSIX_COLLATION_OID)
1372 2140364 : return true;
1373 :
1374 : /*
1375 : * Otherwise, we have to consult pg_collation, but we cache that.
1376 : */
1377 18650 : return (lookup_collation_cache(collation, true))->collate_is_c;
1378 : }
1379 :
1380 : /*
1381 : * Detect whether collation's LC_CTYPE property is C
1382 : */
1383 : bool
1384 4164110 : lc_ctype_is_c(Oid collation)
1385 : {
1386 : /*
1387 : * If we're asked about "collation 0", return false, so that the code will
1388 : * go into the non-C path and report that the collation is bogus.
1389 : */
1390 4164110 : if (!OidIsValid(collation))
1391 0 : return false;
1392 :
1393 : /*
1394 : * If we're asked about the default collation, we have to inquire of the C
1395 : * library. Cache the result so we only have to compute it once.
1396 : */
1397 4164110 : if (collation == DEFAULT_COLLATION_OID)
1398 : {
1399 : static int result = -1;
1400 : const char *localeptr;
1401 :
1402 3229208 : if (result >= 0)
1403 3228404 : return (bool) result;
1404 :
1405 804 : if (default_locale.provider == COLLPROVIDER_BUILTIN)
1406 : {
1407 178 : localeptr = default_locale.info.builtin.locale;
1408 : }
1409 626 : else if (default_locale.provider == COLLPROVIDER_ICU)
1410 : {
1411 6 : result = false;
1412 6 : return (bool) result;
1413 : }
1414 620 : else if (default_locale.provider == COLLPROVIDER_LIBC)
1415 : {
1416 620 : localeptr = setlocale(LC_CTYPE, NULL);
1417 620 : if (!localeptr)
1418 0 : elog(ERROR, "invalid LC_CTYPE setting");
1419 : }
1420 : else
1421 0 : elog(ERROR, "unexpected collation provider '%c'",
1422 : default_locale.provider);
1423 :
1424 798 : if (strcmp(localeptr, "C") == 0)
1425 36 : result = true;
1426 762 : else if (strcmp(localeptr, "POSIX") == 0)
1427 0 : result = true;
1428 : else
1429 762 : result = false;
1430 798 : return (bool) result;
1431 : }
1432 :
1433 : /*
1434 : * If we're asked about the built-in C/POSIX collations, we know that.
1435 : */
1436 934902 : if (collation == C_COLLATION_OID ||
1437 : collation == POSIX_COLLATION_OID)
1438 932574 : return true;
1439 :
1440 : /*
1441 : * Otherwise, we have to consult pg_collation, but we cache that.
1442 : */
1443 2328 : return (lookup_collation_cache(collation, true))->ctype_is_c;
1444 : }
1445 :
1446 : struct pg_locale_struct default_locale;
1447 :
1448 : void
1449 210 : make_icu_collator(const char *iculocstr,
1450 : const char *icurules,
1451 : struct pg_locale_struct *resultp)
1452 : {
1453 : #ifdef USE_ICU
1454 : UCollator *collator;
1455 :
1456 210 : collator = pg_ucol_open(iculocstr);
1457 :
1458 : /*
1459 : * If rules are specified, we extract the rules of the standard collation,
1460 : * add our own rules, and make a new collator with the combined rules.
1461 : */
1462 206 : if (icurules)
1463 : {
1464 : const UChar *default_rules;
1465 : UChar *agg_rules;
1466 : UChar *my_rules;
1467 : UErrorCode status;
1468 : int32_t length;
1469 :
1470 12 : default_rules = ucol_getRules(collator, &length);
1471 12 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
1472 :
1473 12 : agg_rules = palloc_array(UChar, u_strlen(default_rules) + u_strlen(my_rules) + 1);
1474 12 : u_strcpy(agg_rules, default_rules);
1475 12 : u_strcat(agg_rules, my_rules);
1476 :
1477 12 : ucol_close(collator);
1478 :
1479 12 : status = U_ZERO_ERROR;
1480 12 : collator = ucol_openRules(agg_rules, u_strlen(agg_rules),
1481 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, NULL, &status);
1482 12 : if (U_FAILURE(status))
1483 6 : ereport(ERROR,
1484 : (errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
1485 : iculocstr, icurules, u_errorName(status))));
1486 : }
1487 :
1488 : /* We will leak this string if the caller errors later :-( */
1489 200 : resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
1490 200 : resultp->info.icu.ucol = collator;
1491 : #else /* not USE_ICU */
1492 : /* could get here if a collation was created by a build with ICU */
1493 : ereport(ERROR,
1494 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1495 : errmsg("ICU is not supported in this build")));
1496 : #endif /* not USE_ICU */
1497 200 : }
1498 :
1499 :
1500 : /* simple subroutine for reporting errors from newlocale() */
1501 : static void
1502 0 : report_newlocale_failure(const char *localename)
1503 : {
1504 : int save_errno;
1505 :
1506 : /*
1507 : * Windows doesn't provide any useful error indication from
1508 : * _create_locale(), and BSD-derived platforms don't seem to feel they
1509 : * need to set errno either (even though POSIX is pretty clear that
1510 : * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1511 : * is what to report.
1512 : */
1513 0 : if (errno == 0)
1514 0 : errno = ENOENT;
1515 :
1516 : /*
1517 : * ENOENT means "no such locale", not "no such file", so clarify that
1518 : * errno with an errdetail message.
1519 : */
1520 0 : save_errno = errno; /* auxiliary funcs might change errno */
1521 0 : ereport(ERROR,
1522 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1523 : errmsg("could not create locale \"%s\": %m",
1524 : localename),
1525 : (save_errno == ENOENT ?
1526 : errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1527 : localename) : 0)));
1528 : }
1529 :
1530 : bool
1531 8547278 : pg_locale_deterministic(pg_locale_t locale)
1532 : {
1533 : /* default locale must always be deterministic */
1534 8547278 : if (locale == NULL)
1535 8293208 : return true;
1536 : else
1537 254070 : return locale->deterministic;
1538 : }
1539 :
1540 : /*
1541 : * Create a locale_t from a collation OID. Results are cached for the
1542 : * lifetime of the backend. Thus, do not free the result with freelocale().
1543 : *
1544 : * As a special optimization, the default/database collation returns 0.
1545 : *
1546 : * For simplicity, we always generate COLLATE + CTYPE even though we
1547 : * might only need one of them. Since this is called only once per session,
1548 : * it shouldn't cost much.
1549 : */
1550 : pg_locale_t
1551 12644928 : pg_newlocale_from_collation(Oid collid)
1552 : {
1553 : collation_cache_entry *cache_entry;
1554 :
1555 : /* Callers must pass a valid OID */
1556 : Assert(OidIsValid(collid));
1557 :
1558 12644928 : if (collid == DEFAULT_COLLATION_OID)
1559 : {
1560 12624154 : if (default_locale.provider == COLLPROVIDER_LIBC)
1561 12055030 : return (pg_locale_t) 0;
1562 : else
1563 569124 : return &default_locale;
1564 : }
1565 :
1566 20774 : cache_entry = lookup_collation_cache(collid, false);
1567 :
1568 20774 : if (cache_entry->locale == 0)
1569 : {
1570 : /* We haven't computed this yet in this session, so do it */
1571 : HeapTuple tp;
1572 : Form_pg_collation collform;
1573 : struct pg_locale_struct result;
1574 : pg_locale_t resultp;
1575 : Datum datum;
1576 : bool isnull;
1577 :
1578 204 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
1579 204 : if (!HeapTupleIsValid(tp))
1580 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
1581 204 : collform = (Form_pg_collation) GETSTRUCT(tp);
1582 :
1583 : /* We'll fill in the result struct locally before allocating memory */
1584 204 : memset(&result, 0, sizeof(result));
1585 204 : result.provider = collform->collprovider;
1586 204 : result.deterministic = collform->collisdeterministic;
1587 :
1588 204 : if (collform->collprovider == COLLPROVIDER_BUILTIN)
1589 : {
1590 : const char *locstr;
1591 :
1592 20 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
1593 20 : locstr = TextDatumGetCString(datum);
1594 :
1595 20 : builtin_validate_locale(GetDatabaseEncoding(), locstr);
1596 :
1597 20 : result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext,
1598 : locstr);
1599 : }
1600 184 : else if (collform->collprovider == COLLPROVIDER_LIBC)
1601 : {
1602 : const char *collcollate;
1603 : const char *collctype pg_attribute_unused();
1604 : locale_t loc;
1605 :
1606 0 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
1607 0 : collcollate = TextDatumGetCString(datum);
1608 0 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
1609 0 : collctype = TextDatumGetCString(datum);
1610 :
1611 0 : if (strcmp(collcollate, collctype) == 0)
1612 : {
1613 : /* Normal case where they're the same */
1614 0 : errno = 0;
1615 : #ifndef WIN32
1616 0 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate,
1617 : NULL);
1618 : #else
1619 : loc = _create_locale(LC_ALL, collcollate);
1620 : #endif
1621 0 : if (!loc)
1622 0 : report_newlocale_failure(collcollate);
1623 : }
1624 : else
1625 : {
1626 : #ifndef WIN32
1627 : /* We need two newlocale() steps */
1628 : locale_t loc1;
1629 :
1630 0 : errno = 0;
1631 0 : loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL);
1632 0 : if (!loc1)
1633 0 : report_newlocale_failure(collcollate);
1634 0 : errno = 0;
1635 0 : loc = newlocale(LC_CTYPE_MASK, collctype, loc1);
1636 0 : if (!loc)
1637 0 : report_newlocale_failure(collctype);
1638 : #else
1639 :
1640 : /*
1641 : * XXX The _create_locale() API doesn't appear to support
1642 : * this. Could perhaps be worked around by changing
1643 : * pg_locale_t to contain two separate fields.
1644 : */
1645 : ereport(ERROR,
1646 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1647 : errmsg("collations with different collate and ctype values are not supported on this platform")));
1648 : #endif
1649 : }
1650 :
1651 0 : result.info.lt = loc;
1652 : }
1653 184 : else if (collform->collprovider == COLLPROVIDER_ICU)
1654 : {
1655 : const char *iculocstr;
1656 : const char *icurules;
1657 :
1658 184 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
1659 184 : iculocstr = TextDatumGetCString(datum);
1660 :
1661 184 : datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
1662 184 : if (!isnull)
1663 12 : icurules = TextDatumGetCString(datum);
1664 : else
1665 172 : icurules = NULL;
1666 :
1667 184 : make_icu_collator(iculocstr, icurules, &result);
1668 : }
1669 :
1670 198 : datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
1671 : &isnull);
1672 198 : if (!isnull)
1673 : {
1674 : char *actual_versionstr;
1675 : char *collversionstr;
1676 :
1677 198 : collversionstr = TextDatumGetCString(datum);
1678 :
1679 198 : if (collform->collprovider == COLLPROVIDER_LIBC)
1680 0 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
1681 : else
1682 198 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
1683 :
1684 198 : actual_versionstr = get_collation_actual_version(collform->collprovider,
1685 198 : TextDatumGetCString(datum));
1686 198 : if (!actual_versionstr)
1687 : {
1688 : /*
1689 : * This could happen when specifying a version in CREATE
1690 : * COLLATION but the provider does not support versioning, or
1691 : * manually creating a mess in the catalogs.
1692 : */
1693 0 : ereport(ERROR,
1694 : (errmsg("collation \"%s\" has no actual version, but a version was recorded",
1695 : NameStr(collform->collname))));
1696 : }
1697 :
1698 198 : if (strcmp(actual_versionstr, collversionstr) != 0)
1699 0 : ereport(WARNING,
1700 : (errmsg("collation \"%s\" has version mismatch",
1701 : NameStr(collform->collname)),
1702 : errdetail("The collation in the database was created using version %s, "
1703 : "but the operating system provides version %s.",
1704 : collversionstr, actual_versionstr),
1705 : errhint("Rebuild all objects affected by this collation and run "
1706 : "ALTER COLLATION %s REFRESH VERSION, "
1707 : "or build PostgreSQL with the right library version.",
1708 : quote_qualified_identifier(get_namespace_name(collform->collnamespace),
1709 : NameStr(collform->collname)))));
1710 : }
1711 :
1712 198 : ReleaseSysCache(tp);
1713 :
1714 : /* We'll keep the pg_locale_t structures in TopMemoryContext */
1715 198 : resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp));
1716 198 : *resultp = result;
1717 :
1718 198 : cache_entry->locale = resultp;
1719 : }
1720 :
1721 20768 : return cache_entry->locale;
1722 : }
1723 :
1724 : /*
1725 : * Get provider-specific collation version string for the given collation from
1726 : * the operating system/library.
1727 : */
1728 : char *
1729 83324 : get_collation_actual_version(char collprovider, const char *collcollate)
1730 : {
1731 83324 : char *collversion = NULL;
1732 :
1733 : /*
1734 : * The only two supported locales (C and C.UTF-8) are both based on memcmp
1735 : * and are not expected to change, but track the version anyway.
1736 : *
1737 : * Note that the character semantics may change for some locales, but the
1738 : * collation version only tracks changes to sort order.
1739 : */
1740 83324 : if (collprovider == COLLPROVIDER_BUILTIN)
1741 : {
1742 1810 : if (strcmp(collcollate, "C") == 0)
1743 24 : return "1";
1744 1786 : else if (strcmp(collcollate, "C.UTF-8") == 0)
1745 1786 : return "1";
1746 : else
1747 0 : ereport(ERROR,
1748 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1749 : errmsg("invalid locale name \"%s\" for builtin provider",
1750 : collcollate)));
1751 : }
1752 :
1753 : #ifdef USE_ICU
1754 81514 : if (collprovider == COLLPROVIDER_ICU)
1755 : {
1756 : UCollator *collator;
1757 : UVersionInfo versioninfo;
1758 : char buf[U_MAX_VERSION_STRING_LENGTH];
1759 :
1760 59952 : collator = pg_ucol_open(collcollate);
1761 :
1762 59952 : ucol_getVersion(collator, versioninfo);
1763 59952 : ucol_close(collator);
1764 :
1765 59952 : u_versionToString(versioninfo, buf);
1766 59952 : collversion = pstrdup(buf);
1767 : }
1768 : else
1769 : #endif
1770 43124 : if (collprovider == COLLPROVIDER_LIBC &&
1771 42948 : pg_strcasecmp("C", collcollate) != 0 &&
1772 42620 : pg_strncasecmp("C.", collcollate, 2) != 0 &&
1773 21234 : pg_strcasecmp("POSIX", collcollate) != 0)
1774 : {
1775 : #if defined(__GLIBC__)
1776 : /* Use the glibc version because we don't have anything better. */
1777 21210 : collversion = pstrdup(gnu_get_libc_version());
1778 : #elif defined(LC_VERSION_MASK)
1779 : locale_t loc;
1780 :
1781 : /* Look up FreeBSD collation version. */
1782 : loc = newlocale(LC_COLLATE, collcollate, NULL);
1783 : if (loc)
1784 : {
1785 : collversion =
1786 : pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
1787 : freelocale(loc);
1788 : }
1789 : else
1790 : ereport(ERROR,
1791 : (errmsg("could not load locale \"%s\"", collcollate)));
1792 : #elif defined(WIN32)
1793 : /*
1794 : * If we are targeting Windows Vista and above, we can ask for a name
1795 : * given a collation name (earlier versions required a location code
1796 : * that we don't have).
1797 : */
1798 : NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
1799 : WCHAR wide_collcollate[LOCALE_NAME_MAX_LENGTH];
1800 :
1801 : MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
1802 : LOCALE_NAME_MAX_LENGTH);
1803 : if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
1804 : {
1805 : /*
1806 : * GetNLSVersionEx() wants a language tag such as "en-US", not a
1807 : * locale name like "English_United States.1252". Until those
1808 : * values can be prevented from entering the system, or 100%
1809 : * reliably converted to the more useful tag format, tolerate the
1810 : * resulting error and report that we have no version data.
1811 : */
1812 : if (GetLastError() == ERROR_INVALID_PARAMETER)
1813 : return NULL;
1814 :
1815 : ereport(ERROR,
1816 : (errmsg("could not get collation version for locale \"%s\": error code %lu",
1817 : collcollate,
1818 : GetLastError())));
1819 : }
1820 : collversion = psprintf("%lu.%lu,%lu.%lu",
1821 : (version.dwNLSVersion >> 8) & 0xFFFF,
1822 : version.dwNLSVersion & 0xFF,
1823 : (version.dwDefinedVersion >> 8) & 0xFFFF,
1824 : version.dwDefinedVersion & 0xFF);
1825 : #endif
1826 : }
1827 :
1828 81514 : return collversion;
1829 : }
1830 :
/*
 * pg_strncoll_libc_win32_utf8
 *
 * Windows-only comparison path for UTF-8 databases.  Both arguments are
 * converted from UTF-8 to wide characters with MultiByteToWideChar() and then
 * compared using wcscoll() (when 'locale' is NULL) or wcscoll_l().
 *
 * 'arg1'/'arg2' need not be nul-terminated; 'len1'/'len2' are their lengths
 * in bytes.  An ERROR is raised if a conversion fails or if the comparison
 * function reports failure.
 */
#ifdef WIN32
static int
pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
							size_t len2, pg_locale_t locale)
{
	char		sbuf[TEXTBUFLEN];
	char	   *buf = sbuf;
	char	   *a1p,
			   *a2p;

	/*
	 * Workspace sizes in bytes: two bytes per input byte, plus two more for
	 * the terminating wide character.
	 */
	int			a1len = len1 * 2 + 2;
	int			a2len = len2 * 2 + 2;
	int			r;
	int			result;

	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
	Assert(GetDatabaseEncoding() == PG_UTF8);

	/* fall back to a palloc'd workspace if the stack buffer is too small */
	if (a1len + a2len > TEXTBUFLEN)
		buf = palloc(a1len + a2len);

	a1p = buf;
	a2p = buf + a1len;

	/* API does not work for zero-length input */
	if (len1 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
								(LPWSTR) a1p, a1len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a1p)[r] = 0;

	if (len2 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
								(LPWSTR) a2p, a2len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a2p)[r] = 0;

	/* clear errno so that %m below reflects only the comparison call */
	errno = 0;
	if (locale)
		result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
	else
		result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (buf != sbuf)
		pfree(buf);

	return result;
}
#endif							/* WIN32 */
1905 :
1906 : /*
1907 : * pg_strcoll_libc
1908 : *
1909 : * Call strcoll(), strcoll_l(), wcscoll(), or wcscoll_l() as appropriate for
1910 : * the given locale, platform, and database encoding. If the locale is NULL,
1911 : * use the database collation.
1912 : *
1913 : * Arguments must be encoded in the database encoding and nul-terminated.
1914 : */
1915 : static int
1916 17059290 : pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
1917 : {
1918 : int result;
1919 :
1920 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
1921 : #ifdef WIN32
1922 : if (GetDatabaseEncoding() == PG_UTF8)
1923 : {
1924 : size_t len1 = strlen(arg1);
1925 : size_t len2 = strlen(arg2);
1926 :
1927 : result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1928 : }
1929 : else
1930 : #endif /* WIN32 */
1931 17059290 : if (locale)
1932 0 : result = strcoll_l(arg1, arg2, locale->info.lt);
1933 : else
1934 17059290 : result = strcoll(arg1, arg2);
1935 :
1936 17059290 : return result;
1937 : }
1938 :
1939 : /*
1940 : * pg_strncoll_libc
1941 : *
1942 : * Nul-terminate the arguments and call pg_strcoll_libc().
1943 : */
1944 : static int
1945 2039332 : pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
1946 : pg_locale_t locale)
1947 : {
1948 : char sbuf[TEXTBUFLEN];
1949 2039332 : char *buf = sbuf;
1950 2039332 : size_t bufsize1 = len1 + 1;
1951 2039332 : size_t bufsize2 = len2 + 1;
1952 : char *arg1n;
1953 : char *arg2n;
1954 : int result;
1955 :
1956 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
1957 :
1958 : #ifdef WIN32
1959 : /* check for this case before doing the work for nul-termination */
1960 : if (GetDatabaseEncoding() == PG_UTF8)
1961 : return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1962 : #endif /* WIN32 */
1963 :
1964 2039332 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
1965 360 : buf = palloc(bufsize1 + bufsize2);
1966 :
1967 2039332 : arg1n = buf;
1968 2039332 : arg2n = buf + bufsize1;
1969 :
1970 : /* nul-terminate arguments */
1971 2039332 : memcpy(arg1n, arg1, len1);
1972 2039332 : arg1n[len1] = '\0';
1973 2039332 : memcpy(arg2n, arg2, len2);
1974 2039332 : arg2n[len2] = '\0';
1975 :
1976 2039332 : result = pg_strcoll_libc(arg1n, arg2n, locale);
1977 :
1978 2039332 : if (buf != sbuf)
1979 360 : pfree(buf);
1980 :
1981 2039332 : return result;
1982 : }
1983 :
1984 : #ifdef USE_ICU
1985 :
1986 : /*
1987 : * pg_strncoll_icu_no_utf8
1988 : *
1989 : * Convert the arguments from the database encoding to UChar strings, then
1990 : * call ucol_strcoll(). An argument length of -1 means that the string is
1991 : * NUL-terminated.
1992 : *
1993 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1994 : * caller should call that instead.
1995 : */
1996 : static int
1997 0 : pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
1998 : const char *arg2, int32_t len2, pg_locale_t locale)
1999 : {
2000 : char sbuf[TEXTBUFLEN];
2001 0 : char *buf = sbuf;
2002 : int32_t ulen1;
2003 : int32_t ulen2;
2004 : size_t bufsize1;
2005 : size_t bufsize2;
2006 : UChar *uchar1,
2007 : *uchar2;
2008 : int result;
2009 :
2010 : Assert(locale->provider == COLLPROVIDER_ICU);
2011 : #ifdef HAVE_UCOL_STRCOLLUTF8
2012 : Assert(GetDatabaseEncoding() != PG_UTF8);
2013 : #endif
2014 :
2015 0 : init_icu_converter();
2016 :
2017 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
2018 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
2019 :
2020 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
2021 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
2022 :
2023 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
2024 0 : buf = palloc(bufsize1 + bufsize2);
2025 :
2026 0 : uchar1 = (UChar *) buf;
2027 0 : uchar2 = (UChar *) (buf + bufsize1);
2028 :
2029 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
2030 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
2031 :
2032 0 : result = ucol_strcoll(locale->info.icu.ucol,
2033 : uchar1, ulen1,
2034 : uchar2, ulen2);
2035 :
2036 0 : if (buf != sbuf)
2037 0 : pfree(buf);
2038 :
2039 0 : return result;
2040 : }
2041 :
2042 : /*
2043 : * pg_strncoll_icu
2044 : *
2045 : * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
2046 : * database encoding. An argument length of -1 means the string is
2047 : * NUL-terminated.
2048 : *
2049 : * Arguments must be encoded in the database encoding.
2050 : */
2051 : static int
2052 16150 : pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
2053 : pg_locale_t locale)
2054 : {
2055 : int result;
2056 :
2057 : Assert(locale->provider == COLLPROVIDER_ICU);
2058 :
2059 : #ifdef HAVE_UCOL_STRCOLLUTF8
2060 16150 : if (GetDatabaseEncoding() == PG_UTF8)
2061 : {
2062 : UErrorCode status;
2063 :
2064 16150 : status = U_ZERO_ERROR;
2065 16150 : result = ucol_strcollUTF8(locale->info.icu.ucol,
2066 : arg1, len1,
2067 : arg2, len2,
2068 : &status);
2069 16150 : if (U_FAILURE(status))
2070 0 : ereport(ERROR,
2071 : (errmsg("collation failed: %s", u_errorName(status))));
2072 : }
2073 : else
2074 : #endif
2075 : {
2076 0 : result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
2077 : }
2078 :
2079 16150 : return result;
2080 : }
2081 :
2082 : #endif /* USE_ICU */
2083 :
2084 : /*
2085 : * pg_strcoll
2086 : *
2087 : * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
2088 : * or wcscoll_l() as appropriate for the given locale, platform, and database
2089 : * encoding. If the locale is not specified, use the database collation.
2090 : *
2091 : * Arguments must be encoded in the database encoding and nul-terminated.
2092 : *
2093 : * The caller is responsible for breaking ties if the collation is
2094 : * deterministic; this maintains consistency with pg_strxfrm(), which cannot
2095 : * easily account for deterministic collations.
2096 : */
2097 : int
2098 15020414 : pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
2099 : {
2100 : int result;
2101 :
2102 15020414 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2103 15019958 : result = pg_strcoll_libc(arg1, arg2, locale);
2104 : #ifdef USE_ICU
2105 456 : else if (locale->provider == COLLPROVIDER_ICU)
2106 456 : result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
2107 : #endif
2108 : else
2109 : /* shouldn't happen */
2110 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2111 :
2112 15020414 : return result;
2113 : }
2114 :
2115 : /*
2116 : * pg_strncoll
2117 : *
2118 : * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
2119 : * or wcscoll_l() as appropriate for the given locale, platform, and database
2120 : * encoding. If the locale is not specified, use the database collation.
2121 : *
2122 : * Arguments must be encoded in the database encoding.
2123 : *
2124 : * This function may need to nul-terminate the arguments for libc functions;
2125 : * so if the caller already has nul-terminated strings, it should call
2126 : * pg_strcoll() instead.
2127 : *
2128 : * The caller is responsible for breaking ties if the collation is
2129 : * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
2130 : * easily account for deterministic collations.
2131 : */
2132 : int
2133 2055026 : pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
2134 : pg_locale_t locale)
2135 : {
2136 : int result;
2137 :
2138 2055026 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2139 2039332 : result = pg_strncoll_libc(arg1, len1, arg2, len2, locale);
2140 : #ifdef USE_ICU
2141 15694 : else if (locale->provider == COLLPROVIDER_ICU)
2142 15694 : result = pg_strncoll_icu(arg1, len1, arg2, len2, locale);
2143 : #endif
2144 : else
2145 : /* shouldn't happen */
2146 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2147 :
2148 2055026 : return result;
2149 : }
2150 :
2151 :
2152 : static size_t
2153 0 : pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
2154 : pg_locale_t locale)
2155 : {
2156 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2157 :
2158 : #ifdef TRUST_STRXFRM
2159 : if (locale)
2160 : return strxfrm_l(dest, src, destsize, locale->info.lt);
2161 : else
2162 : return strxfrm(dest, src, destsize);
2163 : #else
2164 : /* shouldn't happen */
2165 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2166 : return 0; /* keep compiler quiet */
2167 : #endif
2168 : }
2169 :
2170 : static size_t
2171 0 : pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
2172 : pg_locale_t locale)
2173 : {
2174 : char sbuf[TEXTBUFLEN];
2175 0 : char *buf = sbuf;
2176 0 : size_t bufsize = srclen + 1;
2177 : size_t result;
2178 :
2179 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2180 :
2181 0 : if (bufsize > TEXTBUFLEN)
2182 0 : buf = palloc(bufsize);
2183 :
2184 : /* nul-terminate arguments */
2185 0 : memcpy(buf, src, srclen);
2186 0 : buf[srclen] = '\0';
2187 :
2188 0 : result = pg_strxfrm_libc(dest, buf, destsize, locale);
2189 :
2190 0 : if (buf != sbuf)
2191 0 : pfree(buf);
2192 :
2193 : /* if dest is defined, it should be nul-terminated */
2194 : Assert(result >= destsize || dest[result] == '\0');
2195 :
2196 0 : return result;
2197 : }
2198 :
2199 : #ifdef USE_ICU
2200 :
2201 : /* 'srclen' of -1 means the strings are NUL-terminated */
2202 : static size_t
2203 756 : pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
2204 : pg_locale_t locale)
2205 : {
2206 : char sbuf[TEXTBUFLEN];
2207 756 : char *buf = sbuf;
2208 : UChar *uchar;
2209 : int32_t ulen;
2210 : size_t uchar_bsize;
2211 : Size result_bsize;
2212 :
2213 : Assert(locale->provider == COLLPROVIDER_ICU);
2214 :
2215 756 : init_icu_converter();
2216 :
2217 756 : ulen = uchar_length(icu_converter, src, srclen);
2218 :
2219 756 : uchar_bsize = (ulen + 1) * sizeof(UChar);
2220 :
2221 756 : if (uchar_bsize > TEXTBUFLEN)
2222 0 : buf = palloc(uchar_bsize);
2223 :
2224 756 : uchar = (UChar *) buf;
2225 :
2226 756 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2227 :
2228 756 : result_bsize = ucol_getSortKey(locale->info.icu.ucol,
2229 : uchar, ulen,
2230 : (uint8_t *) dest, destsize);
2231 :
2232 : /*
2233 : * ucol_getSortKey() counts the nul-terminator in the result length, but
2234 : * this function should not.
2235 : */
2236 : Assert(result_bsize > 0);
2237 756 : result_bsize--;
2238 :
2239 756 : if (buf != sbuf)
2240 0 : pfree(buf);
2241 :
2242 : /* if dest is defined, it should be nul-terminated */
2243 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
2244 :
2245 756 : return result_bsize;
2246 : }
2247 :
2248 : /* 'srclen' of -1 means the strings are NUL-terminated */
2249 : static size_t
2250 0 : pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
2251 : int32_t destsize, pg_locale_t locale)
2252 : {
2253 : char sbuf[TEXTBUFLEN];
2254 0 : char *buf = sbuf;
2255 : UCharIterator iter;
2256 : uint32_t state[2];
2257 : UErrorCode status;
2258 0 : int32_t ulen = -1;
2259 0 : UChar *uchar = NULL;
2260 : size_t uchar_bsize;
2261 : Size result_bsize;
2262 :
2263 : Assert(locale->provider == COLLPROVIDER_ICU);
2264 : Assert(GetDatabaseEncoding() != PG_UTF8);
2265 :
2266 0 : init_icu_converter();
2267 :
2268 0 : ulen = uchar_length(icu_converter, src, srclen);
2269 :
2270 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
2271 :
2272 0 : if (uchar_bsize > TEXTBUFLEN)
2273 0 : buf = palloc(uchar_bsize);
2274 :
2275 0 : uchar = (UChar *) buf;
2276 :
2277 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2278 :
2279 0 : uiter_setString(&iter, uchar, ulen);
2280 0 : state[0] = state[1] = 0; /* won't need that again */
2281 0 : status = U_ZERO_ERROR;
2282 0 : result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
2283 : &iter,
2284 : state,
2285 : (uint8_t *) dest,
2286 : destsize,
2287 : &status);
2288 0 : if (U_FAILURE(status))
2289 0 : ereport(ERROR,
2290 : (errmsg("sort key generation failed: %s",
2291 : u_errorName(status))));
2292 :
2293 0 : return result_bsize;
2294 : }
2295 :
2296 : /* 'srclen' of -1 means the strings are NUL-terminated */
2297 : static size_t
2298 1626 : pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
2299 : int32_t destsize, pg_locale_t locale)
2300 : {
2301 : size_t result;
2302 :
2303 : Assert(locale->provider == COLLPROVIDER_ICU);
2304 :
2305 1626 : if (GetDatabaseEncoding() == PG_UTF8)
2306 : {
2307 : UCharIterator iter;
2308 : uint32_t state[2];
2309 : UErrorCode status;
2310 :
2311 1626 : uiter_setUTF8(&iter, src, srclen);
2312 1626 : state[0] = state[1] = 0; /* won't need that again */
2313 1626 : status = U_ZERO_ERROR;
2314 1626 : result = ucol_nextSortKeyPart(locale->info.icu.ucol,
2315 : &iter,
2316 : state,
2317 : (uint8_t *) dest,
2318 : destsize,
2319 : &status);
2320 1626 : if (U_FAILURE(status))
2321 0 : ereport(ERROR,
2322 : (errmsg("sort key generation failed: %s",
2323 : u_errorName(status))));
2324 : }
2325 : else
2326 0 : result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize,
2327 : locale);
2328 :
2329 1626 : return result;
2330 : }
2331 :
2332 : #endif
2333 :
2334 : /*
2335 : * Return true if the collation provider supports pg_strxfrm() and
2336 : * pg_strnxfrm(); otherwise false.
2337 : *
2338 : * Unfortunately, it seems that strxfrm() for non-C collations is broken on
2339 : * many common platforms; testing of multiple versions of glibc reveals that,
2340 : * for many locales, strcoll() and strxfrm() do not return consistent
2341 : * results. While no other libc other than Cygwin has so far been shown to
2342 : * have a problem, we take the conservative course of action for right now and
2343 : * disable this categorically. (Users who are certain this isn't a problem on
2344 : * their system can define TRUST_STRXFRM.)
2345 : *
2346 : * No similar problem is known for the ICU provider.
2347 : */
2348 : bool
2349 35304 : pg_strxfrm_enabled(pg_locale_t locale)
2350 : {
2351 35304 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2352 : #ifdef TRUST_STRXFRM
2353 : return true;
2354 : #else
2355 34742 : return false;
2356 : #endif
2357 562 : else if (locale->provider == COLLPROVIDER_ICU)
2358 562 : return true;
2359 : else
2360 : /* shouldn't happen */
2361 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2362 :
2363 : return false; /* keep compiler quiet */
2364 : }
2365 :
2366 : /*
2367 : * pg_strxfrm
2368 : *
2369 : * Transforms 'src' to a nul-terminated string stored in 'dest' such that
2370 : * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
2371 : * untransformed strings.
2372 : *
2373 : * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
2374 : * may be NULL.
2375 : *
2376 : * Returns the number of bytes needed to store the transformed string,
2377 : * excluding the terminating nul byte. If the value returned is 'destsize' or
2378 : * greater, the resulting contents of 'dest' are undefined.
2379 : */
2380 : size_t
2381 0 : pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
2382 : {
2383 0 : size_t result = 0; /* keep compiler quiet */
2384 :
2385 0 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2386 0 : result = pg_strxfrm_libc(dest, src, destsize, locale);
2387 : #ifdef USE_ICU
2388 0 : else if (locale->provider == COLLPROVIDER_ICU)
2389 0 : result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
2390 : #endif
2391 : else
2392 : /* shouldn't happen */
2393 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2394 :
2395 0 : return result;
2396 : }
2397 :
2398 : /*
2399 : * pg_strnxfrm
2400 : *
2401 : * Transforms 'src' to a nul-terminated string stored in 'dest' such that
2402 : * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
2403 : * untransformed strings.
2404 : *
2405 : * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
2406 : * be NULL.
2407 : *
2408 : * Returns the number of bytes needed to store the transformed string,
2409 : * excluding the terminating nul byte. If the value returned is 'destsize' or
2410 : * greater, the resulting contents of 'dest' are undefined.
2411 : *
2412 : * This function may need to nul-terminate the argument for libc functions;
2413 : * so if the caller already has a nul-terminated string, it should call
2414 : * pg_strxfrm() instead.
2415 : */
2416 : size_t
2417 756 : pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
2418 : pg_locale_t locale)
2419 : {
2420 756 : size_t result = 0; /* keep compiler quiet */
2421 :
2422 756 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2423 0 : result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale);
2424 : #ifdef USE_ICU
2425 756 : else if (locale->provider == COLLPROVIDER_ICU)
2426 756 : result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale);
2427 : #endif
2428 : else
2429 : /* shouldn't happen */
2430 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2431 :
2432 756 : return result;
2433 : }
2434 :
2435 : /*
2436 : * Return true if the collation provider supports pg_strxfrm_prefix() and
2437 : * pg_strnxfrm_prefix(); otherwise false.
2438 : */
2439 : bool
2440 1626 : pg_strxfrm_prefix_enabled(pg_locale_t locale)
2441 : {
2442 1626 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2443 0 : return false;
2444 1626 : else if (locale->provider == COLLPROVIDER_ICU)
2445 1626 : return true;
2446 : else
2447 : /* shouldn't happen */
2448 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2449 :
2450 : return false; /* keep compiler quiet */
2451 : }
2452 :
2453 : /*
2454 : * pg_strxfrm_prefix
2455 : *
2456 : * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
2457 : * memcmp() on the byte sequence is equivalent to pg_strcoll() on
2458 : * untransformed strings. The result is not nul-terminated.
2459 : *
2460 : * The provided 'src' must be nul-terminated.
2461 : *
2462 : * If destsize is not large enough to hold the resulting byte sequence, stores
2463 : * only the first destsize bytes in 'dest'. Returns the number of bytes
2464 : * actually copied to 'dest'.
2465 : */
2466 : size_t
2467 1626 : pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
2468 : pg_locale_t locale)
2469 : {
2470 1626 : size_t result = 0; /* keep compiler quiet */
2471 :
2472 1626 : if (!locale)
2473 0 : PGLOCALE_SUPPORT_ERROR(COLLPROVIDER_LIBC);
2474 : #ifdef USE_ICU
2475 1626 : else if (locale->provider == COLLPROVIDER_ICU)
2476 1626 : result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
2477 : #endif
2478 : else
2479 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2480 :
2481 1626 : return result;
2482 : }
2483 :
2484 : /*
2485 : * pg_strnxfrm_prefix
2486 : *
2487 : * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
2488 : * memcmp() on the byte sequence is equivalent to pg_strcoll() on
2489 : * untransformed strings. The result is not nul-terminated.
2490 : *
2491 : * The provided 'src' must be nul-terminated.
2492 : *
2493 : * If destsize is not large enough to hold the resulting byte sequence, stores
2494 : * only the first destsize bytes in 'dest'. Returns the number of bytes
2495 : * actually copied to 'dest'.
2496 : *
2497 : * This function may need to nul-terminate the argument for libc functions;
2498 : * so if the caller already has a nul-terminated string, it should call
2499 : * pg_strxfrm_prefix() instead.
2500 : */
2501 : size_t
2502 0 : pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
2503 : size_t srclen, pg_locale_t locale)
2504 : {
2505 0 : size_t result = 0; /* keep compiler quiet */
2506 :
2507 0 : if (!locale)
2508 0 : PGLOCALE_SUPPORT_ERROR(COLLPROVIDER_LIBC);
2509 : #ifdef USE_ICU
2510 0 : else if (locale->provider == COLLPROVIDER_ICU)
2511 0 : result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
2512 : #endif
2513 : else
2514 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2515 :
2516 0 : return result;
2517 : }
2518 :
2519 : /*
2520 : * Return required encoding ID for the given locale, or -1 if any encoding is
2521 : * valid for the locale.
2522 : *
2523 : * The only supported locale for the builtin provider is "C", and it's
2524 : * available for any encoding.
2525 : */
2526 : int
2527 1848 : builtin_locale_encoding(const char *locale)
2528 : {
2529 1848 : if (strcmp(locale, "C") == 0)
2530 32 : return -1;
2531 1816 : if (strcmp(locale, "C.UTF-8") == 0)
2532 1816 : return PG_UTF8;
2533 :
2534 0 : ereport(ERROR,
2535 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
2536 : errmsg("invalid locale name \"%s\" for builtin provider",
2537 : locale)));
2538 :
2539 : return 0; /* keep compiler quiet */
2540 : }
2541 :
2542 :
2543 : /*
2544 : * Validate the locale and encoding combination, and return the canonical form
2545 : * of the locale name.
2546 : *
2547 : * The only supported locale for the builtin provider is "C", and it's
2548 : * available for any encoding.
2549 : */
2550 : const char *
2551 1840 : builtin_validate_locale(int encoding, const char *locale)
2552 : {
2553 1840 : const char *canonical_name = NULL;
2554 : int required_encoding;
2555 :
2556 1840 : if (strcmp(locale, "C") == 0)
2557 26 : canonical_name = "C";
2558 1814 : else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)
2559 1802 : canonical_name = "C.UTF-8";
2560 :
2561 1840 : if (!canonical_name)
2562 12 : ereport(ERROR,
2563 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
2564 : errmsg("invalid locale name \"%s\" for builtin provider",
2565 : locale)));
2566 :
2567 1828 : required_encoding = builtin_locale_encoding(canonical_name);
2568 1828 : if (required_encoding >= 0 && encoding != required_encoding)
2569 2 : ereport(ERROR,
2570 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
2571 : errmsg("encoding \"%s\" does not match locale \"%s\"",
2572 : pg_encoding_to_char(encoding), locale)));
2573 :
2574 1826 : return canonical_name;
2575 : }
2576 :
2577 :
2578 : #ifdef USE_ICU
2579 :
2580 : /*
2581 : * Wrapper around ucol_open() to handle API differences for older ICU
2582 : * versions.
2583 : */
2584 : static UCollator *
2585 60310 : pg_ucol_open(const char *loc_str)
2586 : {
2587 : UCollator *collator;
2588 : UErrorCode status;
2589 60310 : const char *orig_str = loc_str;
2590 60310 : char *fixed_str = NULL;
2591 :
2592 : /*
2593 : * Must never open default collator, because it depends on the environment
2594 : * and may change at any time. Should not happen, but check here to catch
2595 : * bugs that might be hard to catch otherwise.
2596 : *
2597 : * NB: the default collator is not the same as the collator for the root
2598 : * locale. The root locale may be specified as the empty string, "und", or
2599 : * "root". The default collator is opened by passing NULL to ucol_open().
2600 : */
2601 60310 : if (loc_str == NULL)
2602 0 : elog(ERROR, "opening default collator is not supported");
2603 :
2604 : /*
2605 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
2606 : * the root locale. If the first component of the locale is "und", replace
2607 : * with "root" before opening.
2608 : */
2609 : if (U_ICU_VERSION_MAJOR_NUM < 55)
2610 : {
2611 : char lang[ULOC_LANG_CAPACITY];
2612 :
2613 : status = U_ZERO_ERROR;
2614 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2615 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2616 : {
2617 : ereport(ERROR,
2618 : (errmsg("could not get language from locale \"%s\": %s",
2619 : loc_str, u_errorName(status))));
2620 : }
2621 :
2622 : if (strcmp(lang, "und") == 0)
2623 : {
2624 : const char *remainder = loc_str + strlen("und");
2625 :
2626 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
2627 : strcpy(fixed_str, "root");
2628 : strcat(fixed_str, remainder);
2629 :
2630 : loc_str = fixed_str;
2631 : }
2632 : }
2633 :
2634 60310 : status = U_ZERO_ERROR;
2635 60310 : collator = ucol_open(loc_str, &status);
2636 60310 : if (U_FAILURE(status))
2637 12 : ereport(ERROR,
2638 : /* use original string for error report */
2639 : (errmsg("could not open collator for locale \"%s\": %s",
2640 : orig_str, u_errorName(status))));
2641 :
2642 : if (U_ICU_VERSION_MAJOR_NUM < 54)
2643 : {
2644 : status = U_ZERO_ERROR;
2645 : icu_set_collation_attributes(collator, loc_str, &status);
2646 :
2647 : /*
2648 : * Pretend the error came from ucol_open(), for consistent error
2649 : * message across ICU versions.
2650 : */
2651 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2652 : {
2653 : ucol_close(collator);
2654 : ereport(ERROR,
2655 : (errmsg("could not open collator for locale \"%s\": %s",
2656 : orig_str, u_errorName(status))));
2657 : }
2658 : }
2659 :
2660 60298 : if (fixed_str != NULL)
2661 0 : pfree(fixed_str);
2662 :
2663 60298 : return collator;
2664 : }
2665 :
2666 : static void
2667 1800 : init_icu_converter(void)
2668 : {
2669 : const char *icu_encoding_name;
2670 : UErrorCode status;
2671 : UConverter *conv;
2672 :
2673 1800 : if (icu_converter)
2674 1794 : return; /* already done */
2675 :
2676 6 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
2677 6 : if (!icu_encoding_name)
2678 0 : ereport(ERROR,
2679 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2680 : errmsg("encoding \"%s\" not supported by ICU",
2681 : pg_encoding_to_char(GetDatabaseEncoding()))));
2682 :
2683 6 : status = U_ZERO_ERROR;
2684 6 : conv = ucnv_open(icu_encoding_name, &status);
2685 6 : if (U_FAILURE(status))
2686 0 : ereport(ERROR,
2687 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
2688 : icu_encoding_name, u_errorName(status))));
2689 :
2690 6 : icu_converter = conv;
2691 : }
2692 :
2693 : /*
2694 : * Find length, in UChars, of given string if converted to UChar string.
2695 : */
2696 : static size_t
2697 1284 : uchar_length(UConverter *converter, const char *str, int32_t len)
2698 : {
2699 1284 : UErrorCode status = U_ZERO_ERROR;
2700 : int32_t ulen;
2701 :
2702 1284 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
2703 1284 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
2704 0 : ereport(ERROR,
2705 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
2706 1284 : return ulen;
2707 : }
2708 :
2709 : /*
2710 : * Convert the given source string into a UChar string, stored in dest, and
2711 : * return the length (in UChars).
2712 : */
2713 : static int32_t
2714 1284 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
2715 : const char *src, int32_t srclen)
2716 : {
2717 1284 : UErrorCode status = U_ZERO_ERROR;
2718 : int32_t ulen;
2719 :
2720 1284 : status = U_ZERO_ERROR;
2721 1284 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
2722 1284 : if (U_FAILURE(status))
2723 0 : ereport(ERROR,
2724 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
2725 1284 : return ulen;
2726 : }
2727 :
2728 : /*
2729 : * Convert a string in the database encoding into a string of UChars.
2730 : *
2731 : * The source string at buff is of length nbytes
2732 : * (it needn't be nul-terminated)
2733 : *
2734 : * *buff_uchar receives a pointer to the palloc'd result string, and
2735 : * the function's result is the number of UChars generated.
2736 : *
2737 : * The result string is nul-terminated, though most callers rely on the
2738 : * result length instead.
2739 : */
2740 : int32_t
2741 528 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
2742 : {
2743 : int32_t len_uchar;
2744 :
2745 528 : init_icu_converter();
2746 :
2747 528 : len_uchar = uchar_length(icu_converter, buff, nbytes);
2748 :
2749 528 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
2750 528 : len_uchar = uchar_convert(icu_converter,
2751 : *buff_uchar, len_uchar + 1, buff, nbytes);
2752 :
2753 528 : return len_uchar;
2754 : }
2755 :
2756 : /*
2757 : * Convert a string of UChars into the database encoding.
2758 : *
2759 : * The source string at buff_uchar is of length len_uchar
2760 : * (it needn't be nul-terminated)
2761 : *
2762 : * *result receives a pointer to the palloc'd result string, and the
2763 : * function's result is the number of bytes generated (not counting nul).
2764 : *
2765 : * The result string is nul-terminated.
2766 : */
2767 : int32_t
2768 516 : icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
2769 : {
2770 : UErrorCode status;
2771 : int32_t len_result;
2772 :
2773 516 : init_icu_converter();
2774 :
2775 516 : status = U_ZERO_ERROR;
2776 516 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
2777 : buff_uchar, len_uchar, &status);
2778 516 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
2779 0 : ereport(ERROR,
2780 : (errmsg("%s failed: %s", "ucnv_fromUChars",
2781 : u_errorName(status))));
2782 :
2783 516 : *result = palloc(len_result + 1);
2784 :
2785 516 : status = U_ZERO_ERROR;
2786 516 : len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
2787 : buff_uchar, len_uchar, &status);
2788 516 : if (U_FAILURE(status) ||
2789 516 : status == U_STRING_NOT_TERMINATED_WARNING)
2790 0 : ereport(ERROR,
2791 : (errmsg("%s failed: %s", "ucnv_fromUChars",
2792 : u_errorName(status))));
2793 :
2794 516 : return len_result;
2795 : }
2796 :
2797 : /*
2798 : * Parse collation attributes from the given locale string and apply them to
2799 : * the open collator.
2800 : *
2801 : * First, the locale string is canonicalized to an ICU format locale ID such
2802 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
2803 : * the key-value arguments.
2804 : *
2805 : * Starting with ICU version 54, the attributes are processed automatically by
2806 : * ucol_open(), so this is only necessary for emulating this behavior on older
2807 : * versions.
2808 : */
2809 : pg_attribute_unused()
2810 : static void
2811 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
2812 : UErrorCode *status)
2813 : {
2814 : int32_t len;
2815 : char *icu_locale_id;
2816 : char *lower_str;
2817 : char *str;
2818 :
2819 : /*
2820 : * The input locale may be a BCP 47 language tag, e.g.
2821 : * "und-u-kc-ks-level1", which expresses the same attributes in a
2822 : * different form. It will be converted to the equivalent ICU format
2823 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
2824 : * uloc_canonicalize().
2825 : */
2826 0 : *status = U_ZERO_ERROR;
2827 0 : len = uloc_canonicalize(loc, NULL, 0, status);
2828 0 : icu_locale_id = palloc(len + 1);
2829 0 : *status = U_ZERO_ERROR;
2830 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
2831 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
2832 0 : return;
2833 :
2834 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
2835 :
2836 0 : pfree(icu_locale_id);
2837 :
2838 0 : str = strchr(lower_str, '@');
2839 0 : if (!str)
2840 0 : return;
2841 0 : str++;
2842 :
2843 0 : for (char *token = strtok(str, ";"); token; token = strtok(NULL, ";"))
2844 : {
2845 0 : char *e = strchr(token, '=');
2846 :
2847 0 : if (e)
2848 : {
2849 : char *name;
2850 : char *value;
2851 : UColAttribute uattr;
2852 : UColAttributeValue uvalue;
2853 :
2854 0 : *status = U_ZERO_ERROR;
2855 :
2856 0 : *e = '\0';
2857 0 : name = token;
2858 0 : value = e + 1;
2859 :
2860 : /*
2861 : * See attribute name and value lists in ICU i18n/coll.cpp
2862 : */
2863 0 : if (strcmp(name, "colstrength") == 0)
2864 0 : uattr = UCOL_STRENGTH;
2865 0 : else if (strcmp(name, "colbackwards") == 0)
2866 0 : uattr = UCOL_FRENCH_COLLATION;
2867 0 : else if (strcmp(name, "colcaselevel") == 0)
2868 0 : uattr = UCOL_CASE_LEVEL;
2869 0 : else if (strcmp(name, "colcasefirst") == 0)
2870 0 : uattr = UCOL_CASE_FIRST;
2871 0 : else if (strcmp(name, "colalternate") == 0)
2872 0 : uattr = UCOL_ALTERNATE_HANDLING;
2873 0 : else if (strcmp(name, "colnormalization") == 0)
2874 0 : uattr = UCOL_NORMALIZATION_MODE;
2875 0 : else if (strcmp(name, "colnumeric") == 0)
2876 0 : uattr = UCOL_NUMERIC_COLLATION;
2877 : else
2878 : /* ignore if unknown */
2879 0 : continue;
2880 :
2881 0 : if (strcmp(value, "primary") == 0)
2882 0 : uvalue = UCOL_PRIMARY;
2883 0 : else if (strcmp(value, "secondary") == 0)
2884 0 : uvalue = UCOL_SECONDARY;
2885 0 : else if (strcmp(value, "tertiary") == 0)
2886 0 : uvalue = UCOL_TERTIARY;
2887 0 : else if (strcmp(value, "quaternary") == 0)
2888 0 : uvalue = UCOL_QUATERNARY;
2889 0 : else if (strcmp(value, "identical") == 0)
2890 0 : uvalue = UCOL_IDENTICAL;
2891 0 : else if (strcmp(value, "no") == 0)
2892 0 : uvalue = UCOL_OFF;
2893 0 : else if (strcmp(value, "yes") == 0)
2894 0 : uvalue = UCOL_ON;
2895 0 : else if (strcmp(value, "shifted") == 0)
2896 0 : uvalue = UCOL_SHIFTED;
2897 0 : else if (strcmp(value, "non-ignorable") == 0)
2898 0 : uvalue = UCOL_NON_IGNORABLE;
2899 0 : else if (strcmp(value, "lower") == 0)
2900 0 : uvalue = UCOL_LOWER_FIRST;
2901 0 : else if (strcmp(value, "upper") == 0)
2902 0 : uvalue = UCOL_UPPER_FIRST;
2903 : else
2904 : {
2905 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
2906 0 : break;
2907 : }
2908 :
2909 0 : ucol_setAttribute(collator, uattr, uvalue, status);
2910 : }
2911 : }
2912 :
2913 0 : pfree(lower_str);
2914 : }
2915 : #endif
2916 :
2917 : /*
2918 : * Return the BCP47 language tag representation of the requested locale.
2919 : *
2920 : * This function should be called before passing the string to ucol_open(),
2921 : * because conversion to a language tag also performs "level 2
2922 : * canonicalization". In addition to producing a consistent format, level 2
2923 : * canonicalization is able to more accurately interpret different input
2924 : * locale string formats, such as POSIX and .NET IDs.
2925 : */
2926 : char *
2927 59666 : icu_language_tag(const char *loc_str, int elevel)
2928 : {
2929 : #ifdef USE_ICU
2930 : UErrorCode status;
2931 : char *langtag;
2932 59666 : size_t buflen = 32; /* arbitrary starting buffer size */
2933 59666 : const bool strict = true;
2934 :
2935 : /*
2936 : * A BCP47 language tag doesn't have a clearly-defined upper limit (cf.
2937 : * RFC5646 section 4.4). Additionally, in older ICU versions,
2938 : * uloc_toLanguageTag() doesn't always return the ultimate length on the
2939 : * first call, necessitating a loop.
2940 : */
2941 59666 : langtag = palloc(buflen);
2942 : while (true)
2943 : {
2944 59666 : status = U_ZERO_ERROR;
2945 59666 : uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
2946 :
2947 : /* try again if the buffer is not large enough */
2948 59666 : if ((status == U_BUFFER_OVERFLOW_ERROR ||
2949 59666 : status == U_STRING_NOT_TERMINATED_WARNING) &&
2950 : buflen < MaxAllocSize)
2951 : {
2952 0 : buflen = Min(buflen * 2, MaxAllocSize);
2953 0 : langtag = repalloc(langtag, buflen);
2954 0 : continue;
2955 : }
2956 :
2957 59666 : break;
2958 : }
2959 :
2960 59666 : if (U_FAILURE(status))
2961 : {
2962 18 : pfree(langtag);
2963 :
2964 18 : if (elevel > 0)
2965 14 : ereport(elevel,
2966 : (errmsg("could not convert locale name \"%s\" to language tag: %s",
2967 : loc_str, u_errorName(status))));
2968 12 : return NULL;
2969 : }
2970 :
2971 59648 : return langtag;
2972 : #else /* not USE_ICU */
2973 : ereport(ERROR,
2974 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2975 : errmsg("ICU is not supported in this build")));
2976 : return NULL; /* keep compiler quiet */
2977 : #endif /* not USE_ICU */
2978 : }
2979 :
2980 : /*
2981 : * Perform best-effort check that the locale is a valid one.
2982 : */
2983 : void
2984 166 : icu_validate_locale(const char *loc_str)
2985 : {
2986 : #ifdef USE_ICU
2987 : UCollator *collator;
2988 : UErrorCode status;
2989 : char lang[ULOC_LANG_CAPACITY];
2990 166 : bool found = false;
2991 166 : int elevel = icu_validation_level;
2992 :
2993 : /* no validation */
2994 166 : if (elevel < 0)
2995 12 : return;
2996 :
2997 : /* downgrade to WARNING during pg_upgrade */
2998 154 : if (IsBinaryUpgrade && elevel > WARNING)
2999 0 : elevel = WARNING;
3000 :
3001 : /* validate that we can extract the language */
3002 154 : status = U_ZERO_ERROR;
3003 154 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
3004 154 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
3005 : {
3006 0 : ereport(elevel,
3007 : (errmsg("could not get language from ICU locale \"%s\": %s",
3008 : loc_str, u_errorName(status)),
3009 : errhint("To disable ICU locale validation, set the parameter %s to \"%s\".",
3010 : "icu_validation_level", "disabled")));
3011 0 : return;
3012 : }
3013 :
3014 : /* check for special language name */
3015 154 : if (strcmp(lang, "") == 0 ||
3016 46 : strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0)
3017 108 : found = true;
3018 :
3019 : /* search for matching language within ICU */
3020 15138 : for (int32_t i = 0; !found && i < uloc_countAvailable(); i++)
3021 : {
3022 14984 : const char *otherloc = uloc_getAvailable(i);
3023 : char otherlang[ULOC_LANG_CAPACITY];
3024 :
3025 14984 : status = U_ZERO_ERROR;
3026 14984 : uloc_getLanguage(otherloc, otherlang, ULOC_LANG_CAPACITY, &status);
3027 14984 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
3028 0 : continue;
3029 :
3030 14984 : if (strcmp(lang, otherlang) == 0)
3031 32 : found = true;
3032 : }
3033 :
3034 154 : if (!found)
3035 14 : ereport(elevel,
3036 : (errmsg("ICU locale \"%s\" has unknown language \"%s\"",
3037 : loc_str, lang),
3038 : errhint("To disable ICU locale validation, set the parameter %s to \"%s\".",
3039 : "icu_validation_level", "disabled")));
3040 :
3041 : /* check that it can be opened */
3042 148 : collator = pg_ucol_open(loc_str);
3043 140 : ucol_close(collator);
3044 : #else /* not USE_ICU */
3045 : /* could get here if a collation was created by a build with ICU */
3046 : ereport(ERROR,
3047 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3048 : errmsg("ICU is not supported in this build")));
3049 : #endif /* not USE_ICU */
3050 : }
3051 :
3052 : /*
3053 : * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
3054 : * Therefore we keep them here rather than with the mbutils code.
3055 : */
3056 :
3057 : /*
3058 : * wchar2char --- convert wide characters to multibyte format
3059 : *
3060 : * This has the same API as the standard wcstombs_l() function; in particular,
3061 : * tolen is the maximum number of bytes to store at *to, and *from must be
3062 : * zero-terminated. The output will be zero-terminated iff there is room.
3063 : */
3064 : size_t
3065 1136174 : wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
3066 : {
3067 : size_t result;
3068 :
3069 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
3070 :
3071 1136174 : if (tolen == 0)
3072 0 : return 0;
3073 :
3074 : #ifdef WIN32
3075 :
3076 : /*
3077 : * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
3078 : * for some reason mbstowcs and wcstombs won't do this for us, so we use
3079 : * MultiByteToWideChar().
3080 : */
3081 : if (GetDatabaseEncoding() == PG_UTF8)
3082 : {
3083 : result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
3084 : NULL, NULL);
3085 : /* A zero return is failure */
3086 : if (result <= 0)
3087 : result = -1;
3088 : else
3089 : {
3090 : Assert(result <= tolen);
3091 : /* Microsoft counts the zero terminator in the result */
3092 : result--;
3093 : }
3094 : }
3095 : else
3096 : #endif /* WIN32 */
3097 1136174 : if (locale == (pg_locale_t) 0)
3098 : {
3099 : /* Use wcstombs directly for the default locale */
3100 1136174 : result = wcstombs(to, from, tolen);
3101 : }
3102 : else
3103 : {
3104 : /* Use wcstombs_l for nondefault locales */
3105 0 : result = wcstombs_l(to, from, tolen, locale->info.lt);
3106 : }
3107 :
3108 1136174 : return result;
3109 : }
3110 :
3111 : /*
3112 : * char2wchar --- convert multibyte characters to wide characters
3113 : *
3114 : * This has almost the API of mbstowcs_l(), except that *from need not be
3115 : * null-terminated; instead, the number of input bytes is specified as
3116 : * fromlen. Also, we ereport() rather than returning -1 for invalid
3117 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
3118 : * The output will be zero-terminated iff there is room.
3119 : */
3120 : size_t
3121 1146830 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
3122 : pg_locale_t locale)
3123 : {
3124 : size_t result;
3125 :
3126 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
3127 :
3128 1146830 : if (tolen == 0)
3129 0 : return 0;
3130 :
3131 : #ifdef WIN32
3132 : /* See WIN32 "Unicode" comment above */
3133 : if (GetDatabaseEncoding() == PG_UTF8)
3134 : {
3135 : /* Win32 API does not work for zero-length input */
3136 : if (fromlen == 0)
3137 : result = 0;
3138 : else
3139 : {
3140 : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
3141 : /* A zero return is failure */
3142 : if (result == 0)
3143 : result = -1;
3144 : }
3145 :
3146 : if (result != -1)
3147 : {
3148 : Assert(result < tolen);
3149 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
3150 : to[result] = 0;
3151 : }
3152 : }
3153 : else
3154 : #endif /* WIN32 */
3155 : {
3156 : /* mbstowcs requires ending '\0' */
3157 1146830 : char *str = pnstrdup(from, fromlen);
3158 :
3159 1146830 : if (locale == (pg_locale_t) 0)
3160 : {
3161 : /* Use mbstowcs directly for the default locale */
3162 1146830 : result = mbstowcs(to, str, tolen);
3163 : }
3164 : else
3165 : {
3166 : /* Use mbstowcs_l for nondefault locales */
3167 0 : result = mbstowcs_l(to, str, tolen, locale->info.lt);
3168 : }
3169 :
3170 1146830 : pfree(str);
3171 : }
3172 :
3173 1146830 : if (result == -1)
3174 : {
3175 : /*
3176 : * Invalid multibyte character encountered. We try to give a useful
3177 : * error message by letting pg_verifymbstr check the string. But it's
3178 : * possible that the string is OK to us, and not OK to mbstowcs ---
3179 : * this suggests that the LC_CTYPE locale is different from the
3180 : * database encoding. Give a generic error message if pg_verifymbstr
3181 : * can't find anything wrong.
3182 : */
3183 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
3184 : /* but if it does ... */
3185 0 : ereport(ERROR,
3186 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
3187 : errmsg("invalid multibyte character for locale"),
3188 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
3189 : }
3190 :
3191 1146830 : return result;
3192 : }
|