Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities
4 : *
5 : * Portions Copyright (c) 2002-2023, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : /*----------
13 : * Here is how the locale stuff is handled: LC_COLLATE and LC_CTYPE
14 : * are fixed at CREATE DATABASE time, stored in pg_database, and cannot
15 : * be changed. Thus, the effects of strcoll(), strxfrm(), isupper(),
16 : * toupper(), etc. are always in the same fixed locale.
17 : *
18 : * LC_MESSAGES is settable at run time and will take effect
19 : * immediately.
20 : *
21 : * The other categories, LC_MONETARY, LC_NUMERIC, and LC_TIME are also
22 : * settable at run-time. However, we don't actually set those locale
23 : * categories permanently. This would have bizarre effects like no
24 : * longer accepting standard floating-point literals in some locales.
25 : * Instead, we only set these locale categories briefly when needed,
26 : * cache the required information obtained from localeconv() or
27 : * strftime(), and then set the locale categories back to "C".
28 : * The cached information is only used by the formatting functions
29 : * (to_char, etc.) and the money type. For the user, this should all be
30 : * transparent.
31 : *
32 : * !!! NOW HEAR THIS !!!
33 : *
34 : * We've been bitten repeatedly by this bug, so let's try to keep it in
35 : * mind in future: on some platforms, the locale functions return pointers
36 : * to static data that will be overwritten by any later locale function.
37 : * Thus, for example, the obvious-looking sequence
38 : * save = setlocale(category, NULL);
39 : * if (!setlocale(category, value))
40 : * fail = true;
41 : * setlocale(category, save);
42 : * DOES NOT WORK RELIABLY: on some platforms the second setlocale() call
43 : * will change the memory save is pointing at. To do this sort of thing
44 : * safely, you *must* pstrdup what setlocale returns the first time.
45 : *
46 : * The POSIX locale standard is available here:
47 : *
48 : * http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html
49 : *----------
50 : */
51 :
52 :
53 : #include "postgres.h"
54 :
55 : #include <time.h>
56 :
57 : #include "access/htup_details.h"
58 : #include "catalog/pg_collation.h"
59 : #include "catalog/pg_control.h"
60 : #include "mb/pg_wchar.h"
61 : #include "miscadmin.h"
62 : #include "utils/builtins.h"
63 : #include "utils/formatting.h"
64 : #include "utils/guc_hooks.h"
65 : #include "utils/hsearch.h"
66 : #include "utils/lsyscache.h"
67 : #include "utils/memutils.h"
68 : #include "utils/pg_locale.h"
69 : #include "utils/syscache.h"
70 :
71 : #ifdef USE_ICU
72 : #include <unicode/ucnv.h>
73 : #include <unicode/ustring.h>
74 : #endif
75 :
76 : #ifdef __GLIBC__
77 : #include <gnu/libc-version.h>
78 : #endif
79 :
80 : #ifdef WIN32
81 : #include <shlwapi.h>
82 : #endif
83 :
84 : /* Raise ERROR naming the calling function and the unsupported collation provider code */
85 : #define PGLOCALE_SUPPORT_ERROR(provider) \
86 : elog(ERROR, "unsupported collprovider for %s: %c", __func__, provider)
87 :
88 : /*
89 : * This should be large enough that most strings will fit, but small enough
90 : * that we feel comfortable putting it on the stack
91 : */
92 : #define TEXTBUFLEN 1024
93 :
94 : #define MAX_L10N_DATA 80 /* element count of each buffer slot that receives one strftime()/wcsftime() result */
95 :
96 :
97 : /* GUC settings */
98 : char *locale_messages; /* presumably the LC_MESSAGES setting; not read directly in the code visible here */
99 : char *locale_monetary; /* locale name PGLC_localeconv() applies to LC_MONETARY */
100 : char *locale_numeric; /* locale name PGLC_localeconv() applies to LC_NUMERIC */
101 : char *locale_time; /* locale name cache_locale_time() applies to LC_TIME */
102 :
103 : int icu_validation_level = WARNING; /* defaults to WARNING; its users are outside the code visible here */
104 :
105 : /*
106 : * lc_time localization cache, filled in by cache_locale_time().
107 : *
108 : * We use only the first 7 or 12 entries of these arrays. The last array
109 : * element is left as NULL for the convenience of outside code that wants
110 : * to sequentially scan these arrays.
111 : */
112 : char *localized_abbrev_days[7 + 1];
113 : char *localized_full_days[7 + 1];
114 : char *localized_abbrev_months[12 + 1];
115 : char *localized_full_months[12 + 1];
116 :
117 : /* is the database's LC_CTYPE the C locale? */
118 : bool database_ctype_is_c = false;
119 :
120 : /* indicate whether the cached locale information is still valid */
121 : static bool CurrentLocaleConvValid = false; /* guards the PGLC_localeconv() cache */
122 : static bool CurrentLCTimeValid = false; /* guards the cache_locale_time() cache */
123 :
124 : /* Cache for collation-related knowledge; entries are keyed by pg_collation OID */
125 :
126 : typedef struct
127 : {
128 : Oid collid; /* hash key: pg_collation OID */
129 : bool collate_is_c; /* is collation's LC_COLLATE C? */
130 : bool ctype_is_c; /* is collation's LC_CTYPE C? */
131 : bool flags_valid; /* true if above flags are valid */
132 : pg_locale_t locale; /* locale_t struct, or 0 if not valid */
133 : } collation_cache_entry;
134 :
135 : static HTAB *collation_cache = NULL; /* starts out NULL; NOTE(review): the code that creates it is outside this excerpt */
136 :
137 :
138 : #if defined(WIN32) && defined(LC_MESSAGES)
139 : static char *IsoLocaleName(const char *);
140 : #endif
141 :
142 : #ifdef USE_ICU
143 : /*
144 : * Converter object for converting between ICU's UChar strings and C strings
145 : * in database encoding. Since the database encoding doesn't change, we only
146 : * need one of these per session.
147 : */
148 : static UConverter *icu_converter = NULL;
149 :
150 : static UCollator *pg_ucol_open(const char *loc_str);
151 : static void init_icu_converter(void);
152 : static size_t uchar_length(UConverter *converter,
153 : const char *str, int32_t len);
154 : static int32_t uchar_convert(UConverter *converter,
155 : UChar *dest, int32_t destlen,
156 : const char *src, int32_t srclen);
157 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
158 : UErrorCode *status);
159 : #endif
160 :
/*
 * POSIX has no locale_t-taking ("_l") flavors of mbstowcs()/wcstombs(),
 * although a number of platforms supply them.  Where they are missing we
 * provide our own stand-ins.
 */
#ifndef HAVE_MBSTOWCS_L
/*
 * mbstowcs_l
 *		Convert the multibyte string "src" into at most "n" wide characters
 *		stored at "dest", interpreting it according to locale "loc".
 *
 * The result is whatever the underlying conversion routine returns.
 */
static size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _mbstowcs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nconverted;

	/* Install "loc" via uselocale(), convert, then put the old one back. */
	prev_locale = uselocale(loc);
	nconverted = mbstowcs(dest, src, n);
	uselocale(prev_locale);

	return nconverted;
#endif
}
#endif
#ifndef HAVE_WCSTOMBS_L
/*
 * wcstombs_l
 *		Convert the wide-character string "src" into at most "n" bytes of
 *		multibyte text stored at "dest", according to locale "loc".
 *
 * The result is whatever the underlying conversion routine returns.
 */
static size_t
wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
{
#ifdef WIN32
	return _wcstombs_l(dest, src, n, loc);
#else
	locale_t	prev_locale;
	size_t		nbytes;

	/* Install "loc" via uselocale(), convert, then put the old one back. */
	prev_locale = uselocale(loc);
	nbytes = wcstombs(dest, src, n);
	uselocale(prev_locale);

	return nbytes;
#endif
}
#endif
197 :
198 : /*
199 : * pg_perm_setlocale
200 : *
201 : * This wraps the libc function setlocale(), with two additions. First, when
202 : * changing LC_CTYPE, update gettext's encoding for the current message
203 : * domain. GNU gettext automatically tracks LC_CTYPE on most platforms, but
204 : * not on Windows. Second, if the operation is successful, the corresponding
205 : * LC_XXX environment variable is set to match. By setting the environment
206 : * variable, we ensure that any subsequent use of setlocale(..., "") will
207 : * preserve the settings made through this routine. Of course, LC_ALL must
208 : * also be unset to fully ensure that, but that has to be done elsewhere after
209 : * all the individual LC_XXX variables have been set correctly. (Thank you
210 : * Perl for making this kluge necessary.)
211 : */
212 : char *
213 73588 : pg_perm_setlocale(int category, const char *locale)
214 : {
215 : char *result;
216 : const char *envvar;
217 :
218 : #ifndef WIN32
219 73588 : result = setlocale(category, locale);
220 : #else
221 :
222 : /*
223 : * On Windows, setlocale(LC_MESSAGES) does not work, so just assume that
224 : * the given value is good and set it in the environment variables. We
225 : * must ignore attempts to set to "", which means "keep using the old
226 : * environment value".
227 : */
228 : #ifdef LC_MESSAGES
229 : if (category == LC_MESSAGES)
230 : {
231 : result = (char *) locale;
232 : if (locale == NULL || locale[0] == '\0')
233 : return result;
234 : }
235 : else
236 : #endif
237 : result = setlocale(category, locale);
238 : #endif /* WIN32 */
239 :
240 73588 : if (result == NULL)
241 0 : return result; /* fall out immediately on failure */
242 :
243 : /*
244 : * Use the right encoding in translated messages. Under ENABLE_NLS, let
245 : * pg_bind_textdomain_codeset() figure it out. Under !ENABLE_NLS, message
246 : * format strings are ASCII, but database-encoding strings may enter the
247 : * message via %s. This makes the overall message encoding equal to the
248 : * database encoding.
249 : */
250 73588 : if (category == LC_CTYPE)
251 : {
252 : static char save_lc_ctype[LOCALE_NAME_BUFLEN];
253 :
254 : /* copy setlocale() return value before callee invokes it again */
255 24782 : strlcpy(save_lc_ctype, result, sizeof(save_lc_ctype));
256 24782 : result = save_lc_ctype;
257 :
258 : #ifdef ENABLE_NLS
259 24782 : SetMessageEncoding(pg_bind_textdomain_codeset(textdomain(NULL)));
260 : #else
261 : SetMessageEncoding(GetDatabaseEncoding());
262 : #endif
263 : }
264 :
265 73588 : switch (category)
266 : {
267 24782 : case LC_COLLATE:
268 24782 : envvar = "LC_COLLATE";
269 24782 : break;
270 24782 : case LC_CTYPE:
271 24782 : envvar = "LC_CTYPE";
272 24782 : break;
273 : #ifdef LC_MESSAGES
274 15870 : case LC_MESSAGES:
275 15870 : envvar = "LC_MESSAGES";
276 : #ifdef WIN32
277 : result = IsoLocaleName(locale);
278 : if (result == NULL)
279 : result = (char *) locale;
280 : elog(DEBUG3, "IsoLocaleName() executed; locale: \"%s\"", result);
281 : #endif /* WIN32 */
282 15870 : break;
283 : #endif /* LC_MESSAGES */
284 2718 : case LC_MONETARY:
285 2718 : envvar = "LC_MONETARY";
286 2718 : break;
287 2718 : case LC_NUMERIC:
288 2718 : envvar = "LC_NUMERIC";
289 2718 : break;
290 2718 : case LC_TIME:
291 2718 : envvar = "LC_TIME";
292 2718 : break;
293 0 : default:
294 0 : elog(FATAL, "unrecognized LC category: %d", category);
295 : return NULL; /* keep compiler quiet */
296 : }
297 :
298 73588 : if (setenv(envvar, result, 1) != 0)
299 0 : return NULL;
300 :
301 73588 : return result;
302 : }
303 :
304 :
305 : /*
306 : * Is the locale name valid for the locale category?
307 : *
308 : * If successful, and canonname isn't NULL, a palloc'd copy of the locale's
309 : * canonical name is stored there. This is especially useful for figuring out
310 : * what locale name "" means (ie, the server environment value). (Actually,
311 : * it seems that on most implementations that's the only thing it's good for;
312 : * we could wish that setlocale gave back a canonically spelled version of
313 : * the locale name, but typically it doesn't.)
314 : */
315 : bool
316 50716 : check_locale(int category, const char *locale, char **canonname)
317 : {
318 : char *save;
319 : char *res;
320 :
321 50716 : if (canonname)
322 1102 : *canonname = NULL; /* in case of failure */
323 :
324 50716 : save = setlocale(category, NULL);
325 50716 : if (!save)
326 0 : return false; /* won't happen, we hope */
327 :
328 : /* save may be pointing at a modifiable scratch variable, see above. */
329 50716 : save = pstrdup(save);
330 :
331 : /* set the locale with setlocale, to see if it accepts it. */
332 50716 : res = setlocale(category, locale);
333 :
334 : /* save canonical name if requested. */
335 50716 : if (res && canonname)
336 1098 : *canonname = pstrdup(res);
337 :
338 : /* restore old value. */
339 50716 : if (!setlocale(category, save))
340 0 : elog(WARNING, "failed to restore old locale \"%s\"", save);
341 50716 : pfree(save);
342 :
343 50716 : return (res != NULL);
344 : }
345 :
346 :
347 : /*
348 : * GUC check/assign hooks
349 : *
350 : * For most locale categories, the assign hook doesn't actually set the locale
351 : * permanently, just reset flags so that the next use will cache the
352 : * appropriate values. (See explanation at the top of this file.)
353 : *
354 : * Note: we accept value = "" as selecting the postmaster's environment
355 : * value, whatever it was (so long as the environment setting is legal).
356 : * This will have been locked down by an earlier call to pg_perm_setlocale.
357 : */
358 : bool
359 13498 : check_locale_monetary(char **newval, void **extra, GucSource source)
360 : {
361 13498 : return check_locale(LC_MONETARY, *newval, NULL);
362 : }
363 :
364 : void
365 13326 : assign_locale_monetary(const char *newval, void *extra)
366 : {
367 13326 : CurrentLocaleConvValid = false;
368 13326 : }
369 :
370 : bool
371 13504 : check_locale_numeric(char **newval, void **extra, GucSource source)
372 : {
373 13504 : return check_locale(LC_NUMERIC, *newval, NULL);
374 : }
375 :
376 : void
377 13338 : assign_locale_numeric(const char *newval, void *extra)
378 : {
379 13338 : CurrentLocaleConvValid = false;
380 13338 : }
381 :
382 : bool
383 13498 : check_locale_time(char **newval, void **extra, GucSource source)
384 : {
385 13498 : return check_locale(LC_TIME, *newval, NULL);
386 : }
387 :
388 : void
389 13326 : assign_locale_time(const char *newval, void *extra)
390 : {
391 13326 : CurrentLCTimeValid = false;
392 13326 : }
393 :
394 : /*
395 : * We allow LC_MESSAGES to actually be set globally.
396 : *
397 : * Note: we normally disallow value = "" because it wouldn't have consistent
398 : * semantics (it'd effectively just use the previous value). However, this
399 : * is the value passed for PGC_S_DEFAULT, so don't complain in that case,
400 : * not even if the attempted setting fails due to invalid environment value.
401 : * The idea there is just to accept the environment setting *if possible*
402 : * during startup, until we can read the proper value from postgresql.conf.
403 : */
404 : bool
405 13326 : check_locale_messages(char **newval, void **extra, GucSource source)
406 : {
407 13326 : if (**newval == '\0')
408 : {
409 4212 : if (source == PGC_S_DEFAULT)
410 4212 : return true;
411 : else
412 0 : return false;
413 : }
414 :
415 : /*
416 : * LC_MESSAGES category does not exist everywhere, but accept it anyway
417 : *
418 : * On Windows, we can't even check the value, so accept blindly
419 : */
420 : #if defined(LC_MESSAGES) && !defined(WIN32)
421 9114 : return check_locale(LC_MESSAGES, *newval, NULL);
422 : #else
423 : return true;
424 : #endif
425 : }
426 :
/*
 * GUC assign hook for the messages locale: install the new value through
 * pg_perm_setlocale() where the platform has an LC_MESSAGES category.
 */
void
assign_locale_messages(const char *newval, void *extra)
{
#ifdef LC_MESSAGES

	/*
	 * Not every platform has LC_MESSAGES, but the setting is accepted
	 * regardless.  A failure here is deliberately ignored, in line with the
	 * policy described for check_locale_messages().
	 */
	(void) pg_perm_setlocale(LC_MESSAGES, newval);
#endif
}
438 :
439 :
/*
 * Release the malloc'd strings hanging off a struct lconv, leaving the
 * struct itself alone.  The pointers are not reset.  This must never throw
 * elog(ERROR); it only calls free().
 *
 * The set of fields handled here must stay in sync with
 * struct_lconv_is_valid().
 */
static void
free_struct_lconv(struct lconv *s)
{
	char	   *owned[] = {
		s->decimal_point,
		s->thousands_sep,
		s->grouping,
		s->int_curr_symbol,
		s->currency_symbol,
		s->mon_decimal_point,
		s->mon_thousands_sep,
		s->mon_grouping,
		s->positive_sign,
		s->negative_sign
	};
	size_t		nowned = sizeof(owned) / sizeof(owned[0]);
	size_t		k;

	for (k = 0; k < nowned; k++)
		free(owned[k]);
}
458 :
/*
 * Report whether every string field of a struct lconv that we care about is
 * non-NULL.  The fields examined must be the same ones that
 * free_struct_lconv() releases.
 */
static bool
struct_lconv_is_valid(struct lconv *s)
{
	return s->decimal_point != NULL &&
		s->thousands_sep != NULL &&
		s->grouping != NULL &&
		s->int_curr_symbol != NULL &&
		s->currency_symbol != NULL &&
		s->mon_decimal_point != NULL &&
		s->mon_thousands_sep != NULL &&
		s->mon_grouping != NULL &&
		s->positive_sign != NULL &&
		s->negative_sign != NULL;
}
488 :
489 :
490 : /*
491 : * Convert the strdup'd string at *str from the specified encoding to the
492 : * database encoding.
493 : */
494 : static void
495 816 : db_encoding_convert(int encoding, char **str)
496 : {
497 : char *pstr;
498 : char *mstr;
499 :
500 : /* convert the string to the database encoding */
501 816 : pstr = pg_any_to_server(*str, strlen(*str), encoding);
502 816 : if (pstr == *str)
503 816 : return; /* no conversion happened */
504 :
505 : /* need it malloc'd not palloc'd */
506 0 : mstr = strdup(pstr);
507 0 : if (mstr == NULL)
508 0 : ereport(ERROR,
509 : (errcode(ERRCODE_OUT_OF_MEMORY),
510 : errmsg("out of memory")));
511 :
512 : /* replace old string */
513 0 : free(*str);
514 0 : *str = mstr;
515 :
516 0 : pfree(pstr);
517 : }
518 :
519 :
520 : /*
521 : * Return the POSIX lconv struct (contains number/money formatting
522 : * information) with locale information for all categories.
523 : */
524 : struct lconv *
525 15936 : PGLC_localeconv(void)
526 : {
527 : static struct lconv CurrentLocaleConv;
528 : static bool CurrentLocaleConvAllocated = false;
529 : struct lconv *extlconv;
530 : struct lconv worklconv;
531 : char *save_lc_monetary;
532 : char *save_lc_numeric;
533 : #ifdef WIN32
534 : char *save_lc_ctype;
535 : #endif
536 :
537 : /* Did we do it already? */
538 15936 : if (CurrentLocaleConvValid)
539 15834 : return &CurrentLocaleConv;
540 :
541 : /* Free any already-allocated storage */
542 102 : if (CurrentLocaleConvAllocated)
543 : {
544 6 : free_struct_lconv(&CurrentLocaleConv);
545 6 : CurrentLocaleConvAllocated = false;
546 : }
547 :
548 : /*
549 : * This is tricky because we really don't want to risk throwing error
550 : * while the locale is set to other than our usual settings. Therefore,
551 : * the process is: collect the usual settings, set locale to special
552 : * setting, copy relevant data into worklconv using strdup(), restore
553 : * normal settings, convert data to desired encoding, and finally stash
554 : * the collected data in CurrentLocaleConv. This makes it safe if we
555 : * throw an error during encoding conversion or run out of memory anywhere
556 : * in the process. All data pointed to by struct lconv members is
557 : * allocated with strdup, to avoid premature elog(ERROR) and to allow
558 : * using a single cleanup routine.
559 : */
560 102 : memset(&worklconv, 0, sizeof(worklconv));
561 :
562 : /* Save prevailing values of monetary and numeric locales */
563 102 : save_lc_monetary = setlocale(LC_MONETARY, NULL);
564 102 : if (!save_lc_monetary)
565 0 : elog(ERROR, "setlocale(NULL) failed");
566 102 : save_lc_monetary = pstrdup(save_lc_monetary);
567 :
568 102 : save_lc_numeric = setlocale(LC_NUMERIC, NULL);
569 102 : if (!save_lc_numeric)
570 0 : elog(ERROR, "setlocale(NULL) failed");
571 102 : save_lc_numeric = pstrdup(save_lc_numeric);
572 :
573 : #ifdef WIN32
574 :
575 : /*
576 : * The POSIX standard explicitly says that it is undefined what happens if
577 : * LC_MONETARY or LC_NUMERIC imply an encoding (codeset) different from
578 : * that implied by LC_CTYPE. In practice, all Unix-ish platforms seem to
579 : * believe that localeconv() should return strings that are encoded in the
580 : * codeset implied by the LC_MONETARY or LC_NUMERIC locale name. Hence,
581 : * once we have successfully collected the localeconv() results, we will
582 : * convert them from that codeset to the desired server encoding.
583 : *
584 : * Windows, of course, resolutely does things its own way; on that
585 : * platform LC_CTYPE has to match LC_MONETARY/LC_NUMERIC to get sane
586 : * results. Hence, we must temporarily set that category as well.
587 : */
588 :
589 : /* Save prevailing value of ctype locale */
590 : save_lc_ctype = setlocale(LC_CTYPE, NULL);
591 : if (!save_lc_ctype)
592 : elog(ERROR, "setlocale(NULL) failed");
593 : save_lc_ctype = pstrdup(save_lc_ctype);
594 :
595 : /* Here begins the critical section where we must not throw error */
596 :
597 : /* use numeric to set the ctype */
598 : setlocale(LC_CTYPE, locale_numeric);
599 : #endif
600 :
601 : /* Get formatting information for numeric */
602 102 : setlocale(LC_NUMERIC, locale_numeric);
603 102 : extlconv = localeconv();
604 :
605 : /* Must copy data now in case setlocale() overwrites it */
606 102 : worklconv.decimal_point = strdup(extlconv->decimal_point);
607 102 : worklconv.thousands_sep = strdup(extlconv->thousands_sep);
608 102 : worklconv.grouping = strdup(extlconv->grouping);
609 :
610 : #ifdef WIN32
611 : /* use monetary to set the ctype */
612 : setlocale(LC_CTYPE, locale_monetary);
613 : #endif
614 :
615 : /* Get formatting information for monetary */
616 102 : setlocale(LC_MONETARY, locale_monetary);
617 102 : extlconv = localeconv();
618 :
619 : /* Must copy data now in case setlocale() overwrites it */
620 102 : worklconv.int_curr_symbol = strdup(extlconv->int_curr_symbol);
621 102 : worklconv.currency_symbol = strdup(extlconv->currency_symbol);
622 102 : worklconv.mon_decimal_point = strdup(extlconv->mon_decimal_point);
623 102 : worklconv.mon_thousands_sep = strdup(extlconv->mon_thousands_sep);
624 102 : worklconv.mon_grouping = strdup(extlconv->mon_grouping);
625 102 : worklconv.positive_sign = strdup(extlconv->positive_sign);
626 102 : worklconv.negative_sign = strdup(extlconv->negative_sign);
627 : /* Copy scalar fields as well */
628 102 : worklconv.int_frac_digits = extlconv->int_frac_digits;
629 102 : worklconv.frac_digits = extlconv->frac_digits;
630 102 : worklconv.p_cs_precedes = extlconv->p_cs_precedes;
631 102 : worklconv.p_sep_by_space = extlconv->p_sep_by_space;
632 102 : worklconv.n_cs_precedes = extlconv->n_cs_precedes;
633 102 : worklconv.n_sep_by_space = extlconv->n_sep_by_space;
634 102 : worklconv.p_sign_posn = extlconv->p_sign_posn;
635 102 : worklconv.n_sign_posn = extlconv->n_sign_posn;
636 :
637 : /*
638 : * Restore the prevailing locale settings; failure to do so is fatal.
639 : * Possibly we could limp along with nondefault LC_MONETARY or LC_NUMERIC,
640 : * but proceeding with the wrong value of LC_CTYPE would certainly be bad
641 : * news; and considering that the prevailing LC_MONETARY and LC_NUMERIC
642 : * are almost certainly "C", there's really no reason that restoring those
643 : * should fail.
644 : */
645 : #ifdef WIN32
646 : if (!setlocale(LC_CTYPE, save_lc_ctype))
647 : elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
648 : #endif
649 102 : if (!setlocale(LC_MONETARY, save_lc_monetary))
650 0 : elog(FATAL, "failed to restore LC_MONETARY to \"%s\"", save_lc_monetary);
651 102 : if (!setlocale(LC_NUMERIC, save_lc_numeric))
652 0 : elog(FATAL, "failed to restore LC_NUMERIC to \"%s\"", save_lc_numeric);
653 :
654 : /*
655 : * At this point we've done our best to clean up, and can call functions
656 : * that might possibly throw errors with a clean conscience. But let's
657 : * make sure we don't leak any already-strdup'd fields in worklconv.
658 : */
659 102 : PG_TRY();
660 : {
661 : int encoding;
662 :
663 : /* Release the pstrdup'd locale names */
664 102 : pfree(save_lc_monetary);
665 102 : pfree(save_lc_numeric);
666 : #ifdef WIN32
667 : pfree(save_lc_ctype);
668 : #endif
669 :
670 : /* If any of the preceding strdup calls failed, complain now. */
671 102 : if (!struct_lconv_is_valid(&worklconv))
672 0 : ereport(ERROR,
673 : (errcode(ERRCODE_OUT_OF_MEMORY),
674 : errmsg("out of memory")));
675 :
676 : /*
677 : * Now we must perform encoding conversion from whatever's associated
678 : * with the locales into the database encoding. If we can't identify
679 : * the encoding implied by LC_NUMERIC or LC_MONETARY (ie we get -1),
680 : * use PG_SQL_ASCII, which will result in just validating that the
681 : * strings are OK in the database encoding.
682 : */
683 102 : encoding = pg_get_encoding_from_locale(locale_numeric, true);
684 102 : if (encoding < 0)
685 0 : encoding = PG_SQL_ASCII;
686 :
687 102 : db_encoding_convert(encoding, &worklconv.decimal_point);
688 102 : db_encoding_convert(encoding, &worklconv.thousands_sep);
689 : /* grouping is not text and does not require conversion */
690 :
691 102 : encoding = pg_get_encoding_from_locale(locale_monetary, true);
692 102 : if (encoding < 0)
693 0 : encoding = PG_SQL_ASCII;
694 :
695 102 : db_encoding_convert(encoding, &worklconv.int_curr_symbol);
696 102 : db_encoding_convert(encoding, &worklconv.currency_symbol);
697 102 : db_encoding_convert(encoding, &worklconv.mon_decimal_point);
698 102 : db_encoding_convert(encoding, &worklconv.mon_thousands_sep);
699 : /* mon_grouping is not text and does not require conversion */
700 102 : db_encoding_convert(encoding, &worklconv.positive_sign);
701 102 : db_encoding_convert(encoding, &worklconv.negative_sign);
702 : }
703 0 : PG_CATCH();
704 : {
705 0 : free_struct_lconv(&worklconv);
706 0 : PG_RE_THROW();
707 : }
708 102 : PG_END_TRY();
709 :
710 : /*
711 : * Everything is good, so save the results.
712 : */
713 102 : CurrentLocaleConv = worklconv;
714 102 : CurrentLocaleConvAllocated = true;
715 102 : CurrentLocaleConvValid = true;
716 102 : return &CurrentLocaleConv;
717 : }
718 :
#ifdef WIN32
/*
 * strftime() replacement for Windows.
 *
 * On Windows, strftime() produces output in encoding CP_ACP (the computer's
 * default operating system codepage), which is likely to differ from
 * SERVER_ENCODING.  This matters especially on Japanese versions of Windows,
 * which use SJIS, an encoding we do not support as a server encoding.
 *
 * Therefore we call wcsftime() to obtain wide characters (internally UTF16)
 * and convert those to UTF8, which we know how to handle directly.
 *
 * Only the strftime() calls in this file, which fetch locale-aware strings,
 * are affected.  Other parts of the backend use pg_strftime(), which is not
 * locale-aware and needs no replacement.
 *
 * Returns the number of bytes stored at dst, not counting the terminating
 * NUL, or 0 if wcsftime() produced nothing.
 */
static size_t
strftime_win32(char *dst, size_t dstlen,
			   const char *format, const struct tm *tm)
{
	wchar_t		wide_fmt[8];	/* formats used below need 3 chars */
	wchar_t		wide_out[MAX_L10N_DATA];
	size_t		nchars;

	/*
	 * Widen the format string.  Only plain-ASCII formats are used in this
	 * file, so treating them as UTF8 is correct.
	 */
	nchars = MultiByteToWideChar(CP_UTF8, 0, format, -1,
								 wide_fmt, lengthof(wide_fmt));
	if (nchars == 0)
		elog(ERROR, "could not convert format string from UTF-8: error code %lu",
			 GetLastError());

	nchars = wcsftime(wide_out, MAX_L10N_DATA, wide_fmt, tm);

	/*
	 * A zero result means wcsftime failed, possibly because the output would
	 * not fit in MAX_L10N_DATA.  Report 0 and leave dst unspecified.
	 */
	if (nchars == 0)
		return 0;

	/* Narrow the result to UTF8, reserving one byte for the terminator. */
	nchars = WideCharToMultiByte(CP_UTF8, 0, wide_out, nchars, dst, dstlen - 1,
								 NULL, NULL);
	if (nchars == 0)
		elog(ERROR, "could not convert string to UTF-8: error code %lu",
			 GetLastError());

	dst[nchars] = '\0';

	return nchars;
}

/* route this file's strftime() calls to the replacement above */
#define strftime(a,b,c,d) strftime_win32(a,b,c,d)
#endif							/* WIN32 */
777 :
778 : /*
779 : * Subroutine for cache_locale_time().
780 : * Convert the given string from encoding "encoding" to the database
781 : * encoding, and store the result at *dst, replacing any previous value.
782 : */
783 : static void
784 1748 : cache_single_string(char **dst, const char *src, int encoding)
785 : {
786 : char *ptr;
787 : char *olddst;
788 :
789 : /* Convert the string to the database encoding, or validate it's OK */
790 1748 : ptr = pg_any_to_server(src, strlen(src), encoding);
791 :
792 : /* Store the string in long-lived storage, replacing any previous value */
793 1748 : olddst = *dst;
794 1748 : *dst = MemoryContextStrdup(TopMemoryContext, ptr);
795 1748 : if (olddst)
796 0 : pfree(olddst);
797 :
798 : /* Might as well clean up any palloc'd conversion result, too */
799 1748 : if (ptr != src)
800 0 : pfree(ptr);
801 1748 : }
802 :
803 : /*
804 : * Update the lc_time localization cache variables if needed.
805 : */
806 : void
807 25270 : cache_locale_time(void)
808 : {
809 : char buf[(2 * 7 + 2 * 12) * MAX_L10N_DATA];
810 : char *bufptr;
811 : time_t timenow;
812 : struct tm *timeinfo;
813 25270 : bool strftimefail = false;
814 : int encoding;
815 : int i;
816 : char *save_lc_time;
817 : #ifdef WIN32
818 : char *save_lc_ctype;
819 : #endif
820 :
821 : /* did we do this already? */
822 25270 : if (CurrentLCTimeValid)
823 25224 : return;
824 :
825 46 : elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);
826 :
827 : /*
828 : * As in PGLC_localeconv(), it's critical that we not throw error while
829 : * libc's locale settings have nondefault values. Hence, we just call
830 : * strftime() within the critical section, and then convert and save its
831 : * results afterwards.
832 : */
833 :
834 : /* Save prevailing value of time locale */
835 46 : save_lc_time = setlocale(LC_TIME, NULL);
836 46 : if (!save_lc_time)
837 0 : elog(ERROR, "setlocale(NULL) failed");
838 46 : save_lc_time = pstrdup(save_lc_time);
839 :
840 : #ifdef WIN32
841 :
842 : /*
843 : * On Windows, it appears that wcsftime() internally uses LC_CTYPE, so we
844 : * must set it here. This code looks the same as what PGLC_localeconv()
845 : * does, but the underlying reason is different: this does NOT determine
846 : * the encoding we'll get back from strftime_win32().
847 : */
848 :
849 : /* Save prevailing value of ctype locale */
850 : save_lc_ctype = setlocale(LC_CTYPE, NULL);
851 : if (!save_lc_ctype)
852 : elog(ERROR, "setlocale(NULL) failed");
853 : save_lc_ctype = pstrdup(save_lc_ctype);
854 :
855 : /* use lc_time to set the ctype */
856 : setlocale(LC_CTYPE, locale_time);
857 : #endif
858 :
859 46 : setlocale(LC_TIME, locale_time);
860 :
861 : /* We use times close to current time as data for strftime(). */
862 46 : timenow = time(NULL);
863 46 : timeinfo = localtime(&timenow);
864 :
865 : /* Store the strftime results in MAX_L10N_DATA-sized portions of buf[] */
866 46 : bufptr = buf;
867 :
868 : /*
869 : * MAX_L10N_DATA is sufficient buffer space for every known locale, and
870 : * POSIX defines no strftime() errors. (Buffer space exhaustion is not an
871 : * error.) An implementation might report errors (e.g. ENOMEM) by
872 : * returning 0 (or, less plausibly, a negative value) and setting errno.
873 : * Report errno just in case the implementation did that, but clear it in
874 : * advance of the calls so we don't emit a stale, unrelated errno.
875 : */
876 46 : errno = 0;
877 :
878 : /* localized days */
879 368 : for (i = 0; i < 7; i++)
880 : {
881 322 : timeinfo->tm_wday = i;
882 322 : if (strftime(bufptr, MAX_L10N_DATA, "%a", timeinfo) <= 0)
883 0 : strftimefail = true;
884 322 : bufptr += MAX_L10N_DATA;
885 322 : if (strftime(bufptr, MAX_L10N_DATA, "%A", timeinfo) <= 0)
886 0 : strftimefail = true;
887 322 : bufptr += MAX_L10N_DATA;
888 : }
889 :
890 : /* localized months */
891 598 : for (i = 0; i < 12; i++)
892 : {
893 552 : timeinfo->tm_mon = i;
894 552 : timeinfo->tm_mday = 1; /* make sure we don't have invalid date */
895 552 : if (strftime(bufptr, MAX_L10N_DATA, "%b", timeinfo) <= 0)
896 0 : strftimefail = true;
897 552 : bufptr += MAX_L10N_DATA;
898 552 : if (strftime(bufptr, MAX_L10N_DATA, "%B", timeinfo) <= 0)
899 0 : strftimefail = true;
900 552 : bufptr += MAX_L10N_DATA;
901 : }
902 :
903 : /*
904 : * Restore the prevailing locale settings; as in PGLC_localeconv(),
905 : * failure to do so is fatal.
906 : */
907 : #ifdef WIN32
908 : if (!setlocale(LC_CTYPE, save_lc_ctype))
909 : elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
910 : #endif
911 46 : if (!setlocale(LC_TIME, save_lc_time))
912 0 : elog(FATAL, "failed to restore LC_TIME to \"%s\"", save_lc_time);
913 :
914 : /*
915 : * At this point we've done our best to clean up, and can throw errors, or
916 : * call functions that might throw errors, with a clean conscience.
917 : */
918 46 : if (strftimefail)
919 0 : elog(ERROR, "strftime() failed: %m");
920 :
921 : /* Release the pstrdup'd locale names */
922 46 : pfree(save_lc_time);
923 : #ifdef WIN32
924 : pfree(save_lc_ctype);
925 : #endif
926 :
927 : #ifndef WIN32
928 :
929 : /*
930 : * As in PGLC_localeconv(), we must convert strftime()'s output from the
931 : * encoding implied by LC_TIME to the database encoding. If we can't
932 : * identify the LC_TIME encoding, just perform encoding validation.
933 : */
934 46 : encoding = pg_get_encoding_from_locale(locale_time, true);
935 46 : if (encoding < 0)
936 0 : encoding = PG_SQL_ASCII;
937 :
938 : #else
939 :
940 : /*
941 : * On Windows, strftime_win32() always returns UTF8 data, so convert from
942 : * that if necessary.
943 : */
944 : encoding = PG_UTF8;
945 :
946 : #endif /* WIN32 */
947 :
948 46 : bufptr = buf;
949 :
950 : /* localized days */
951 368 : for (i = 0; i < 7; i++)
952 : {
953 322 : cache_single_string(&localized_abbrev_days[i], bufptr, encoding);
954 322 : bufptr += MAX_L10N_DATA;
955 322 : cache_single_string(&localized_full_days[i], bufptr, encoding);
956 322 : bufptr += MAX_L10N_DATA;
957 : }
958 46 : localized_abbrev_days[7] = NULL;
959 46 : localized_full_days[7] = NULL;
960 :
961 : /* localized months */
962 598 : for (i = 0; i < 12; i++)
963 : {
964 552 : cache_single_string(&localized_abbrev_months[i], bufptr, encoding);
965 552 : bufptr += MAX_L10N_DATA;
966 552 : cache_single_string(&localized_full_months[i], bufptr, encoding);
967 552 : bufptr += MAX_L10N_DATA;
968 : }
969 46 : localized_abbrev_months[12] = NULL;
970 46 : localized_full_months[12] = NULL;
971 :
972 46 : CurrentLCTimeValid = true;
973 : }
974 :
975 :
976 : #if defined(WIN32) && defined(LC_MESSAGES)
977 : /*
978 : * Convert a Windows setlocale() argument to a Unix-style one.
979 : *
980 : * Regardless of platform, we install message catalogs under a Unix-style
981 : * LL[_CC][.ENCODING][@VARIANT] naming convention. Only LC_MESSAGES settings
982 : * following that style will elicit localized interface strings.
983 : *
984 : * Before Visual Studio 2012 (msvcr110.dll), Windows setlocale() accepted "C"
985 : * (but not "c") and strings of the form <Language>[_<Country>][.<CodePage>],
986 : * case-insensitive. setlocale() returns the fully-qualified form; for
987 : * example, setlocale("thaI") returns "Thai_Thailand.874". Internally,
988 : * setlocale() and _create_locale() select a "locale identifier"[1] and store
989 : * it in an undocumented _locale_t field. From that LCID, we can retrieve the
990 : * ISO 639 language and the ISO 3166 country. Character encoding does not
991 : * matter, because the server and client encodings govern that.
992 : *
993 : * Windows Vista introduced the "locale name" concept[2], closely following
994 : * RFC 4646. Locale identifiers are now deprecated. Starting with Visual
995 : * Studio 2012, setlocale() accepts locale names in addition to the strings it
996 : * accepted historically. It does not standardize them; setlocale("Th-tH")
997 : * returns "Th-tH". setlocale(category, "") still returns a traditional
998 : * string. Furthermore, msvcr110.dll changed the undocumented _locale_t
999 : * content to carry locale names instead of locale identifiers.
1000 : *
1001 : * Visual Studio 2015 should still be able to do the same as Visual Studio
1002 : * 2012, but the declaration of locale_name is missing in _locale_t, causing
1003 : * this code compilation to fail, hence this falls back instead on to
1004 : * enumerating all system locales by using EnumSystemLocalesEx to find the
1005 : * required locale name. If the input argument is in Unix-style then we can
1006 : * get ISO Locale name directly by using GetLocaleInfoEx() with LCType as
1007 : * LOCALE_SNAME.
1008 : *
1009 : * MinGW headers declare _create_locale(), but msvcrt.dll lacks that symbol in
1010 : * releases before Windows 8. IsoLocaleName() always fails in a MinGW-built
1011 : * postgres.exe, so only Unix-style values of the lc_messages GUC can elicit
1012 : * localized messages. In particular, every lc_messages setting that initdb
1013 : * can select automatically will yield only C-locale messages. XXX This could
1014 : * be fixed by running the fully-qualified locale name through a lookup table.
1015 : *
1016 : * This function returns a pointer to a static buffer bearing the converted
1017 : * name or NULL if conversion fails.
1018 : *
1019 : * [1] https://docs.microsoft.com/en-us/windows/win32/intl/locale-identifiers
1020 : * [2] https://docs.microsoft.com/en-us/windows/win32/intl/locale-names
1021 : */
1022 :
1023 : #if defined(_MSC_VER)
1024 :
1025 : /*
1026 : * Callback function for EnumSystemLocalesEx() in get_iso_localename().
1027 : *
1028 : * This function enumerates all system locales, searching for one that matches
1029 : * an input with the format: <Language>[_<Country>], e.g.
1030 : * English[_United States]
1031 : *
1032 : * The input is a three wchar_t array as an LPARAM. The first element is the
1033 : * locale_name we want to match, the second element is an allocated buffer
1034 : * where the Unix-style locale is copied if a match is found, and the third
1035 : * element is the search status, 1 if a match was found, 0 otherwise.
1036 : */
1037 : static BOOL CALLBACK
1038 : search_locale_enum(LPWSTR pStr, DWORD dwFlags, LPARAM lparam)
1039 : {
1040 : wchar_t test_locale[LOCALE_NAME_MAX_LENGTH];
1041 : wchar_t **argv;
1042 :
1043 : (void) (dwFlags);
1044 :
1045 : argv = (wchar_t **) lparam;
1046 : *argv[2] = (wchar_t) 0;
1047 :
1048 : memset(test_locale, 0, sizeof(test_locale));
1049 :
1050 : /* Get the name of the <Language> in English */
1051 : if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHLANGUAGENAME,
1052 : test_locale, LOCALE_NAME_MAX_LENGTH))
1053 : {
1054 : /*
1055 : * If the enumerated locale does not have a hyphen ("en") OR the
1056 : * locale_name input does not have an underscore ("English"), we only
1057 : * need to compare the <Language> tags.
1058 : */
1059 : if (wcsrchr(pStr, '-') == NULL || wcsrchr(argv[0], '_') == NULL)
1060 : {
1061 : if (_wcsicmp(argv[0], test_locale) == 0)
1062 : {
1063 : wcscpy(argv[1], pStr);
1064 : *argv[2] = (wchar_t) 1;
1065 : return FALSE;
1066 : }
1067 : }
1068 :
1069 : /*
1070 : * We have to compare a full <Language>_<Country> tag, so we append
1071 : * the underscore and name of the country/region in English, e.g.
1072 : * "English_United States".
1073 : */
1074 : else
1075 : {
1076 : size_t len;
1077 :
1078 : wcscat(test_locale, L"_");
1079 : len = wcslen(test_locale);
1080 : if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHCOUNTRYNAME,
1081 : test_locale + len,
1082 : LOCALE_NAME_MAX_LENGTH - len))
1083 : {
1084 : if (_wcsicmp(argv[0], test_locale) == 0)
1085 : {
1086 : wcscpy(argv[1], pStr);
1087 : *argv[2] = (wchar_t) 1;
1088 : return FALSE;
1089 : }
1090 : }
1091 : }
1092 : }
1093 :
1094 : return TRUE;
1095 : }
1096 :
1097 : /*
1098 : * This function converts a Windows locale name to an ISO formatted version
1099 : * for Visual Studio 2015 or greater.
1100 : *
1101 : * Returns NULL, if no valid conversion was found.
1102 : */
1103 : static char *
1104 : get_iso_localename(const char *winlocname)
1105 : {
1106 : wchar_t wc_locale_name[LOCALE_NAME_MAX_LENGTH];
1107 : wchar_t buffer[LOCALE_NAME_MAX_LENGTH];
1108 : static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1109 : char *period;
1110 : int len;
1111 : int ret_val;
1112 :
1113 : /*
1114 : * Valid locales have the following syntax:
1115 : * <Language>[_<Country>[.<CodePage>]]
1116 : *
1117 : * GetLocaleInfoEx can only take locale name without code-page and for the
1118 : * purpose of this API the code-page doesn't matter.
1119 : */
1120 : period = strchr(winlocname, '.');
1121 : if (period != NULL)
1122 : len = period - winlocname;
1123 : else
1124 : len = pg_mbstrlen(winlocname);
1125 :
1126 : memset(wc_locale_name, 0, sizeof(wc_locale_name));
1127 : memset(buffer, 0, sizeof(buffer));
1128 : MultiByteToWideChar(CP_ACP, 0, winlocname, len, wc_locale_name,
1129 : LOCALE_NAME_MAX_LENGTH);
1130 :
1131 : /*
1132 : * If the lc_messages is already a Unix-style string, we have a direct
1133 : * match with LOCALE_SNAME, e.g. en-US, en_US.
1134 : */
1135 : ret_val = GetLocaleInfoEx(wc_locale_name, LOCALE_SNAME, (LPWSTR) &buffer,
1136 : LOCALE_NAME_MAX_LENGTH);
1137 : if (!ret_val)
1138 : {
1139 : /*
1140 : * Search for a locale in the system that matches language and country
1141 : * name.
1142 : */
1143 : wchar_t *argv[3];
1144 :
1145 : argv[0] = wc_locale_name;
1146 : argv[1] = buffer;
1147 : argv[2] = (wchar_t *) &ret_val;
1148 : EnumSystemLocalesEx(search_locale_enum, LOCALE_WINDOWS, (LPARAM) argv,
1149 : NULL);
1150 : }
1151 :
1152 : if (ret_val)
1153 : {
1154 : size_t rc;
1155 : char *hyphen;
1156 :
1157 : /* Locale names use only ASCII, any conversion locale suffices. */
1158 : rc = wchar2char(iso_lc_messages, buffer, sizeof(iso_lc_messages), NULL);
1159 : if (rc == -1 || rc == sizeof(iso_lc_messages))
1160 : return NULL;
1161 :
1162 : /*
1163 : * Since the message catalogs sit on a case-insensitive filesystem, we
1164 : * need not standardize letter case here. So long as we do not ship
1165 : * message catalogs for which it would matter, we also need not
1166 : * translate the script/variant portion, e.g. uz-Cyrl-UZ to
1167 : * uz_UZ@cyrillic. Simply replace the hyphen with an underscore.
1168 : */
1169 : hyphen = strchr(iso_lc_messages, '-');
1170 : if (hyphen)
1171 : *hyphen = '_';
1172 : return iso_lc_messages;
1173 : }
1174 :
1175 : return NULL;
1176 : }
1177 :
1178 : static char *
1179 : IsoLocaleName(const char *winlocname)
1180 : {
1181 : static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1182 :
1183 : if (pg_strcasecmp("c", winlocname) == 0 ||
1184 : pg_strcasecmp("posix", winlocname) == 0)
1185 : {
1186 : strcpy(iso_lc_messages, "C");
1187 : return iso_lc_messages;
1188 : }
1189 : else
1190 : return get_iso_localename(winlocname);
1191 : }
1192 :
1193 : #else /* !defined(_MSC_VER) */
1194 :
/*
 * MinGW flavor of IsoLocaleName(): conversion is not supported there, so
 * every input is reported as unconvertible.
 */
static char *
IsoLocaleName(const char *winlocname)
{
	(void) winlocname;			/* deliberately unused */

	return NULL;
}
1200 :
1201 : #endif /* defined(_MSC_VER) */
1202 :
1203 : #endif /* WIN32 && LC_MESSAGES */
1204 :
1205 :
1206 : /*
1207 : * Cache mechanism for collation information.
1208 : *
1209 : * We cache two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
1210 : * (or POSIX), so we can optimize a few code paths in various places.
1211 : * For the built-in C and POSIX collations, we can know that without even
1212 : * doing a cache lookup, but we want to support aliases for C/POSIX too.
1213 : * For the "default" collation, there are separate static cache variables,
1214 : * since consulting the pg_collation catalog doesn't tell us what we need.
1215 : *
1216 : * Also, if a pg_locale_t has been requested for a collation, we cache that
1217 : * for the life of a backend.
1218 : *
1219 : * Note that some code relies on the flags not reporting false negatives
1220 : * (that is, saying it's not C when it is). For example, char2wchar()
1221 : * could fail if the locale is C, so str_tolower() shouldn't call it
1222 : * in that case.
1223 : *
1224 : * Note that we currently lack any way to flush the cache. Since we don't
1225 : * support ALTER COLLATION, this is OK. The worst case is that someone
1226 : * drops a collation, and a useless cache entry hangs around in existing
1227 : * backends.
1228 : */
1229 :
1230 : static collation_cache_entry *
1231 13044 : lookup_collation_cache(Oid collation, bool set_flags)
1232 : {
1233 : collation_cache_entry *cache_entry;
1234 : bool found;
1235 :
1236 : Assert(OidIsValid(collation));
1237 : Assert(collation != DEFAULT_COLLATION_OID);
1238 :
1239 13044 : if (collation_cache == NULL)
1240 : {
1241 : /* First time through, initialize the hash table */
1242 : HASHCTL ctl;
1243 :
1244 36 : ctl.keysize = sizeof(Oid);
1245 36 : ctl.entrysize = sizeof(collation_cache_entry);
1246 36 : collation_cache = hash_create("Collation cache", 100, &ctl,
1247 : HASH_ELEM | HASH_BLOBS);
1248 : }
1249 :
1250 13044 : cache_entry = hash_search(collation_cache, &collation, HASH_ENTER, &found);
1251 13044 : if (!found)
1252 : {
1253 : /*
1254 : * Make sure cache entry is marked invalid, in case we fail before
1255 : * setting things.
1256 : */
1257 114 : cache_entry->flags_valid = false;
1258 114 : cache_entry->locale = 0;
1259 : }
1260 :
1261 13044 : if (set_flags && !cache_entry->flags_valid)
1262 : {
1263 : /* Attempt to set the flags */
1264 : HeapTuple tp;
1265 : Form_pg_collation collform;
1266 :
1267 114 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation));
1268 114 : if (!HeapTupleIsValid(tp))
1269 0 : elog(ERROR, "cache lookup failed for collation %u", collation);
1270 114 : collform = (Form_pg_collation) GETSTRUCT(tp);
1271 :
1272 114 : if (collform->collprovider == COLLPROVIDER_LIBC)
1273 : {
1274 : Datum datum;
1275 : const char *collcollate;
1276 : const char *collctype;
1277 :
1278 44 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
1279 44 : collcollate = TextDatumGetCString(datum);
1280 44 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
1281 44 : collctype = TextDatumGetCString(datum);
1282 :
1283 64 : cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) ||
1284 20 : (strcmp(collcollate, "POSIX") == 0));
1285 64 : cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) ||
1286 20 : (strcmp(collctype, "POSIX") == 0));
1287 : }
1288 : else
1289 : {
1290 70 : cache_entry->collate_is_c = false;
1291 70 : cache_entry->ctype_is_c = false;
1292 : }
1293 :
1294 114 : cache_entry->flags_valid = true;
1295 :
1296 114 : ReleaseSysCache(tp);
1297 : }
1298 :
1299 13044 : return cache_entry;
1300 : }
1301 :
1302 :
1303 : /*
1304 : * Detect whether collation's LC_COLLATE property is C
1305 : */
1306 : bool
1307 13621330 : lc_collate_is_c(Oid collation)
1308 : {
1309 : /*
1310 : * If we're asked about "collation 0", return false, so that the code will
1311 : * go into the non-C path and report that the collation is bogus.
1312 : */
1313 13621330 : if (!OidIsValid(collation))
1314 0 : return false;
1315 :
1316 : /*
1317 : * If we're asked about the default collation, we have to inquire of the C
1318 : * library. Cache the result so we only have to compute it once.
1319 : */
1320 13621330 : if (collation == DEFAULT_COLLATION_OID)
1321 : {
1322 : static int result = -1;
1323 : char *localeptr;
1324 :
1325 11639724 : if (default_locale.provider == COLLPROVIDER_ICU)
1326 2616776 : return false;
1327 :
1328 9022948 : if (result >= 0)
1329 9018832 : return (bool) result;
1330 4116 : localeptr = setlocale(LC_COLLATE, NULL);
1331 4116 : if (!localeptr)
1332 0 : elog(ERROR, "invalid LC_COLLATE setting");
1333 :
1334 4116 : if (strcmp(localeptr, "C") == 0)
1335 4054 : result = true;
1336 62 : else if (strcmp(localeptr, "POSIX") == 0)
1337 0 : result = true;
1338 : else
1339 62 : result = false;
1340 4116 : return (bool) result;
1341 : }
1342 :
1343 : /*
1344 : * If we're asked about the built-in C/POSIX collations, we know that.
1345 : */
1346 1981606 : if (collation == C_COLLATION_OID ||
1347 : collation == POSIX_COLLATION_OID)
1348 1975698 : return true;
1349 :
1350 : /*
1351 : * Otherwise, we have to consult pg_collation, but we cache that.
1352 : */
1353 5908 : return (lookup_collation_cache(collation, true))->collate_is_c;
1354 : }
1355 :
1356 : /*
1357 : * Detect whether collation's LC_CTYPE property is C
1358 : */
1359 : bool
1360 3885800 : lc_ctype_is_c(Oid collation)
1361 : {
1362 : /*
1363 : * If we're asked about "collation 0", return false, so that the code will
1364 : * go into the non-C path and report that the collation is bogus.
1365 : */
1366 3885800 : if (!OidIsValid(collation))
1367 0 : return false;
1368 :
1369 : /*
1370 : * If we're asked about the default collation, we have to inquire of the C
1371 : * library. Cache the result so we only have to compute it once.
1372 : */
1373 3885800 : if (collation == DEFAULT_COLLATION_OID)
1374 : {
1375 : static int result = -1;
1376 : char *localeptr;
1377 :
1378 3041062 : if (default_locale.provider == COLLPROVIDER_ICU)
1379 546022 : return false;
1380 :
1381 2495040 : if (result >= 0)
1382 2494466 : return (bool) result;
1383 574 : localeptr = setlocale(LC_CTYPE, NULL);
1384 574 : if (!localeptr)
1385 0 : elog(ERROR, "invalid LC_CTYPE setting");
1386 :
1387 574 : if (strcmp(localeptr, "C") == 0)
1388 544 : result = true;
1389 30 : else if (strcmp(localeptr, "POSIX") == 0)
1390 0 : result = true;
1391 : else
1392 30 : result = false;
1393 574 : return (bool) result;
1394 : }
1395 :
1396 : /*
1397 : * If we're asked about the built-in C/POSIX collations, we know that.
1398 : */
1399 844738 : if (collation == C_COLLATION_OID ||
1400 : collation == POSIX_COLLATION_OID)
1401 844062 : return true;
1402 :
1403 : /*
1404 : * Otherwise, we have to consult pg_collation, but we cache that.
1405 : */
1406 676 : return (lookup_collation_cache(collation, true))->ctype_is_c;
1407 : }
1408 :
/*
 * Locale information for the database default collation.
 *
 * Its provider field is consulted by lc_collate_is_c(), lc_ctype_is_c() and
 * pg_newlocale_from_collation() whenever they are asked about
 * DEFAULT_COLLATION_OID; the latter returns a pointer to this struct when the
 * provider is ICU.  NOTE(review): it is filled in elsewhere, presumably at
 * database startup -- confirm against the caller.
 */
struct pg_locale_struct default_locale;
1410 :
1411 : void
1412 1762 : make_icu_collator(const char *iculocstr,
1413 : const char *icurules,
1414 : struct pg_locale_struct *resultp)
1415 : {
1416 : #ifdef USE_ICU
1417 : UCollator *collator;
1418 :
1419 1762 : collator = pg_ucol_open(iculocstr);
1420 :
1421 : /*
1422 : * If rules are specified, we extract the rules of the standard collation,
1423 : * add our own rules, and make a new collator with the combined rules.
1424 : */
1425 1758 : if (icurules)
1426 : {
1427 : const UChar *default_rules;
1428 : UChar *agg_rules;
1429 : UChar *my_rules;
1430 : UErrorCode status;
1431 : int32_t length;
1432 :
1433 4 : default_rules = ucol_getRules(collator, &length);
1434 4 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
1435 :
1436 4 : agg_rules = palloc_array(UChar, u_strlen(default_rules) + u_strlen(my_rules) + 1);
1437 4 : u_strcpy(agg_rules, default_rules);
1438 4 : u_strcat(agg_rules, my_rules);
1439 :
1440 4 : ucol_close(collator);
1441 :
1442 4 : status = U_ZERO_ERROR;
1443 4 : collator = ucol_openRules(agg_rules, u_strlen(agg_rules),
1444 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, NULL, &status);
1445 4 : if (U_FAILURE(status))
1446 2 : ereport(ERROR,
1447 : (errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
1448 : iculocstr, icurules, u_errorName(status))));
1449 : }
1450 :
1451 : /* We will leak this string if the caller errors later :-( */
1452 1756 : resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
1453 1756 : resultp->info.icu.ucol = collator;
1454 : #else /* not USE_ICU */
1455 : /* could get here if a collation was created by a build with ICU */
1456 : ereport(ERROR,
1457 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1458 : errmsg("ICU is not supported in this build")));
1459 : #endif /* not USE_ICU */
1460 1756 : }
1461 :
1462 :
1463 : /* simple subroutine for reporting errors from newlocale() */
1464 : static void
1465 0 : report_newlocale_failure(const char *localename)
1466 : {
1467 : int save_errno;
1468 :
1469 : /*
1470 : * Windows doesn't provide any useful error indication from
1471 : * _create_locale(), and BSD-derived platforms don't seem to feel they
1472 : * need to set errno either (even though POSIX is pretty clear that
1473 : * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1474 : * is what to report.
1475 : */
1476 0 : if (errno == 0)
1477 0 : errno = ENOENT;
1478 :
1479 : /*
1480 : * ENOENT means "no such locale", not "no such file", so clarify that
1481 : * errno with an errdetail message.
1482 : */
1483 0 : save_errno = errno; /* auxiliary funcs might change errno */
1484 0 : ereport(ERROR,
1485 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1486 : errmsg("could not create locale \"%s\": %m",
1487 : localename),
1488 : (save_errno == ENOENT ?
1489 : errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1490 : localename) : 0)));
1491 : }
1492 :
1493 : bool
1494 2891574 : pg_locale_deterministic(pg_locale_t locale)
1495 : {
1496 : /* default locale must always be deterministic */
1497 2891574 : if (locale == NULL)
1498 1255980 : return true;
1499 : else
1500 1635594 : return locale->deterministic;
1501 : }
1502 :
1503 : /*
1504 : * Create a locale_t from a collation OID. Results are cached for the
1505 : * lifetime of the backend. Thus, do not free the result with freelocale().
1506 : *
1507 : * As a special optimization, the default/database collation returns 0.
1508 : *
1509 : * For simplicity, we always generate COLLATE + CTYPE even though we
1510 : * might only need one of them. Since this is called only once per session,
1511 : * it shouldn't cost much.
1512 : */
1513 : pg_locale_t
1514 3200178 : pg_newlocale_from_collation(Oid collid)
1515 : {
1516 : collation_cache_entry *cache_entry;
1517 :
1518 : /* Callers must pass a valid OID */
1519 : Assert(OidIsValid(collid));
1520 :
1521 3200178 : if (collid == DEFAULT_COLLATION_OID)
1522 : {
1523 3193718 : if (default_locale.provider == COLLPROVIDER_ICU)
1524 3161476 : return &default_locale;
1525 : else
1526 32242 : return (pg_locale_t) 0;
1527 : }
1528 :
1529 6460 : cache_entry = lookup_collation_cache(collid, false);
1530 :
1531 6460 : if (cache_entry->locale == 0)
1532 : {
1533 : /* We haven't computed this yet in this session, so do it */
1534 : HeapTuple tp;
1535 : Form_pg_collation collform;
1536 : struct pg_locale_struct result;
1537 : pg_locale_t resultp;
1538 : Datum datum;
1539 : bool isnull;
1540 :
1541 70 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
1542 70 : if (!HeapTupleIsValid(tp))
1543 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
1544 70 : collform = (Form_pg_collation) GETSTRUCT(tp);
1545 :
1546 : /* We'll fill in the result struct locally before allocating memory */
1547 70 : memset(&result, 0, sizeof(result));
1548 70 : result.provider = collform->collprovider;
1549 70 : result.deterministic = collform->collisdeterministic;
1550 :
1551 70 : if (collform->collprovider == COLLPROVIDER_LIBC)
1552 : {
1553 : const char *collcollate;
1554 : const char *collctype pg_attribute_unused();
1555 : locale_t loc;
1556 :
1557 0 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
1558 0 : collcollate = TextDatumGetCString(datum);
1559 0 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
1560 0 : collctype = TextDatumGetCString(datum);
1561 :
1562 0 : if (strcmp(collcollate, collctype) == 0)
1563 : {
1564 : /* Normal case where they're the same */
1565 0 : errno = 0;
1566 : #ifndef WIN32
1567 0 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate,
1568 : NULL);
1569 : #else
1570 : loc = _create_locale(LC_ALL, collcollate);
1571 : #endif
1572 0 : if (!loc)
1573 0 : report_newlocale_failure(collcollate);
1574 : }
1575 : else
1576 : {
1577 : #ifndef WIN32
1578 : /* We need two newlocale() steps */
1579 : locale_t loc1;
1580 :
1581 0 : errno = 0;
1582 0 : loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL);
1583 0 : if (!loc1)
1584 0 : report_newlocale_failure(collcollate);
1585 0 : errno = 0;
1586 0 : loc = newlocale(LC_CTYPE_MASK, collctype, loc1);
1587 0 : if (!loc)
1588 0 : report_newlocale_failure(collctype);
1589 : #else
1590 :
1591 : /*
1592 : * XXX The _create_locale() API doesn't appear to support
1593 : * this. Could perhaps be worked around by changing
1594 : * pg_locale_t to contain two separate fields.
1595 : */
1596 : ereport(ERROR,
1597 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1598 : errmsg("collations with different collate and ctype values are not supported on this platform")));
1599 : #endif
1600 : }
1601 :
1602 0 : result.info.lt = loc;
1603 : }
1604 70 : else if (collform->collprovider == COLLPROVIDER_ICU)
1605 : {
1606 : const char *iculocstr;
1607 : const char *icurules;
1608 :
1609 70 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colliculocale);
1610 70 : iculocstr = TextDatumGetCString(datum);
1611 :
1612 70 : datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
1613 70 : if (!isnull)
1614 4 : icurules = TextDatumGetCString(datum);
1615 : else
1616 66 : icurules = NULL;
1617 :
1618 70 : make_icu_collator(iculocstr, icurules, &result);
1619 : }
1620 :
1621 68 : datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
1622 : &isnull);
1623 68 : if (!isnull)
1624 : {
1625 : char *actual_versionstr;
1626 : char *collversionstr;
1627 :
1628 68 : collversionstr = TextDatumGetCString(datum);
1629 :
1630 68 : datum = SysCacheGetAttrNotNull(COLLOID, tp, collform->collprovider == COLLPROVIDER_ICU ? Anum_pg_collation_colliculocale : Anum_pg_collation_collcollate);
1631 :
1632 68 : actual_versionstr = get_collation_actual_version(collform->collprovider,
1633 68 : TextDatumGetCString(datum));
1634 68 : if (!actual_versionstr)
1635 : {
1636 : /*
1637 : * This could happen when specifying a version in CREATE
1638 : * COLLATION but the provider does not support versioning, or
1639 : * manually creating a mess in the catalogs.
1640 : */
1641 0 : ereport(ERROR,
1642 : (errmsg("collation \"%s\" has no actual version, but a version was recorded",
1643 : NameStr(collform->collname))));
1644 : }
1645 :
1646 68 : if (strcmp(actual_versionstr, collversionstr) != 0)
1647 0 : ereport(WARNING,
1648 : (errmsg("collation \"%s\" has version mismatch",
1649 : NameStr(collform->collname)),
1650 : errdetail("The collation in the database was created using version %s, "
1651 : "but the operating system provides version %s.",
1652 : collversionstr, actual_versionstr),
1653 : errhint("Rebuild all objects affected by this collation and run "
1654 : "ALTER COLLATION %s REFRESH VERSION, "
1655 : "or build PostgreSQL with the right library version.",
1656 : quote_qualified_identifier(get_namespace_name(collform->collnamespace),
1657 : NameStr(collform->collname)))));
1658 : }
1659 :
1660 68 : ReleaseSysCache(tp);
1661 :
1662 : /* We'll keep the pg_locale_t structures in TopMemoryContext */
1663 68 : resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp));
1664 68 : *resultp = result;
1665 :
1666 68 : cache_entry->locale = resultp;
1667 : }
1668 :
1669 6458 : return cache_entry->locale;
1670 : }
1671 :
1672 : /*
1673 : * Get provider-specific collation version string for the given collation from
1674 : * the operating system/library.
1675 : */
1676 : char *
1677 50128 : get_collation_actual_version(char collprovider, const char *collcollate)
1678 : {
1679 50128 : char *collversion = NULL;
1680 :
1681 : #ifdef USE_ICU
1682 50128 : if (collprovider == COLLPROVIDER_ICU)
1683 : {
1684 : UCollator *collator;
1685 : UVersionInfo versioninfo;
1686 : char buf[U_MAX_VERSION_STRING_LENGTH];
1687 :
1688 48884 : collator = pg_ucol_open(collcollate);
1689 :
1690 48884 : ucol_getVersion(collator, versioninfo);
1691 48884 : ucol_close(collator);
1692 :
1693 48884 : u_versionToString(versioninfo, buf);
1694 48884 : collversion = pstrdup(buf);
1695 : }
1696 : else
1697 : #endif
1698 2488 : if (collprovider == COLLPROVIDER_LIBC &&
1699 1998 : pg_strcasecmp("C", collcollate) != 0 &&
1700 1388 : pg_strncasecmp("C.", collcollate, 2) != 0 &&
1701 634 : pg_strcasecmp("POSIX", collcollate) != 0)
1702 : {
1703 : #if defined(__GLIBC__)
1704 : /* Use the glibc version because we don't have anything better. */
1705 610 : collversion = pstrdup(gnu_get_libc_version());
1706 : #elif defined(LC_VERSION_MASK)
1707 : locale_t loc;
1708 :
1709 : /* Look up FreeBSD collation version. */
1710 : loc = newlocale(LC_COLLATE, collcollate, NULL);
1711 : if (loc)
1712 : {
1713 : collversion =
1714 : pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
1715 : freelocale(loc);
1716 : }
1717 : else
1718 : ereport(ERROR,
1719 : (errmsg("could not load locale \"%s\"", collcollate)));
1720 : #elif defined(WIN32)
1721 : /*
1722 : * If we are targeting Windows Vista and above, we can ask for a name
1723 : * given a collation name (earlier versions required a location code
1724 : * that we don't have).
1725 : */
1726 : NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
1727 : WCHAR wide_collcollate[LOCALE_NAME_MAX_LENGTH];
1728 :
1729 : MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
1730 : LOCALE_NAME_MAX_LENGTH);
1731 : if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
1732 : {
1733 : /*
1734 : * GetNLSVersionEx() wants a language tag such as "en-US", not a
1735 : * locale name like "English_United States.1252". Until those
1736 : * values can be prevented from entering the system, or 100%
1737 : * reliably converted to the more useful tag format, tolerate the
1738 : * resulting error and report that we have no version data.
1739 : */
1740 : if (GetLastError() == ERROR_INVALID_PARAMETER)
1741 : return NULL;
1742 :
1743 : ereport(ERROR,
1744 : (errmsg("could not get collation version for locale \"%s\": error code %lu",
1745 : collcollate,
1746 : GetLastError())));
1747 : }
1748 : collversion = psprintf("%lu.%lu,%lu.%lu",
1749 : (version.dwNLSVersion >> 8) & 0xFFFF,
1750 : version.dwNLSVersion & 0xFF,
1751 : (version.dwDefinedVersion >> 8) & 0xFFFF,
1752 : version.dwDefinedVersion & 0xFF);
1753 : #endif
1754 : }
1755 :
1756 50128 : return collversion;
1757 : }
1758 :
1759 : /*
1760 : * pg_strncoll_libc_win32_utf8
1761 : *
1762 : * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
1763 : * invoke wcscoll() or wcscoll_l().
1764 : */
1765 : #ifdef WIN32
1766 : static int
1767 : pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
1768 : size_t len2, pg_locale_t locale)
1769 : {
1770 : char sbuf[TEXTBUFLEN];
1771 : char *buf = sbuf;
1772 : char *a1p,
1773 : *a2p;
1774 : int a1len = len1 * 2 + 2;
1775 : int a2len = len2 * 2 + 2;
1776 : int r;
1777 : int result;
1778 :
1779 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
1780 : Assert(GetDatabaseEncoding() == PG_UTF8);
1781 : #ifndef WIN32
1782 : Assert(false);
1783 : #endif
1784 :
1785 : if (a1len + a2len > TEXTBUFLEN)
1786 : buf = palloc(a1len + a2len);
1787 :
1788 : a1p = buf;
1789 : a2p = buf + a1len;
1790 :
1791 : /* API does not work for zero-length input */
1792 : if (len1 == 0)
1793 : r = 0;
1794 : else
1795 : {
1796 : r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1797 : (LPWSTR) a1p, a1len / 2);
1798 : if (!r)
1799 : ereport(ERROR,
1800 : (errmsg("could not convert string to UTF-16: error code %lu",
1801 : GetLastError())));
1802 : }
1803 : ((LPWSTR) a1p)[r] = 0;
1804 :
1805 : if (len2 == 0)
1806 : r = 0;
1807 : else
1808 : {
1809 : r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1810 : (LPWSTR) a2p, a2len / 2);
1811 : if (!r)
1812 : ereport(ERROR,
1813 : (errmsg("could not convert string to UTF-16: error code %lu",
1814 : GetLastError())));
1815 : }
1816 : ((LPWSTR) a2p)[r] = 0;
1817 :
1818 : errno = 0;
1819 : if (locale)
1820 : result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
1821 : else
1822 : result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
1823 : if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1824 : ereport(ERROR,
1825 : (errmsg("could not compare Unicode strings: %m")));
1826 :
1827 : if (buf != sbuf)
1828 : pfree(buf);
1829 :
1830 : return result;
1831 : }
1832 : #endif /* WIN32 */
1833 :
1834 : /*
1835 : * pg_strcoll_libc
1836 : *
1837 : * Call strcoll(), strcoll_l(), wcscoll(), or wcscoll_l() as appropriate for
1838 : * the given locale, platform, and database encoding. If the locale is NULL,
1839 : * use the database collation.
1840 : *
1841 : * Arguments must be encoded in the database encoding and nul-terminated.
1842 : */
1843 : static int
1844 6 : pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
1845 : {
1846 : int result;
1847 :
1848 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
1849 : #ifdef WIN32
1850 : if (GetDatabaseEncoding() == PG_UTF8)
1851 : {
1852 : size_t len1 = strlen(arg1);
1853 : size_t len2 = strlen(arg2);
1854 :
1855 : result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1856 : }
1857 : else
1858 : #endif /* WIN32 */
1859 6 : if (locale)
1860 0 : result = strcoll_l(arg1, arg2, locale->info.lt);
1861 : else
1862 6 : result = strcoll(arg1, arg2);
1863 :
1864 6 : return result;
1865 : }
1866 :
1867 : /*
1868 : * pg_strncoll_libc
1869 : *
1870 : * Nul-terminate the arguments and call pg_strcoll_libc().
1871 : */
1872 : static int
1873 0 : pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
1874 : pg_locale_t locale)
1875 : {
1876 : char sbuf[TEXTBUFLEN];
1877 0 : char *buf = sbuf;
1878 0 : size_t bufsize1 = len1 + 1;
1879 0 : size_t bufsize2 = len2 + 1;
1880 : char *arg1n;
1881 : char *arg2n;
1882 : int result;
1883 :
1884 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
1885 :
1886 : #ifdef WIN32
1887 : /* check for this case before doing the work for nul-termination */
1888 : if (GetDatabaseEncoding() == PG_UTF8)
1889 : return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1890 : #endif /* WIN32 */
1891 :
1892 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
1893 0 : buf = palloc(bufsize1 + bufsize2);
1894 :
1895 0 : arg1n = buf;
1896 0 : arg2n = buf + bufsize1;
1897 :
1898 : /* nul-terminate arguments */
1899 0 : memcpy(arg1n, arg1, len1);
1900 0 : arg1n[len1] = '\0';
1901 0 : memcpy(arg2n, arg2, len2);
1902 0 : arg2n[len2] = '\0';
1903 :
1904 0 : result = pg_strcoll_libc(arg1n, arg2n, locale);
1905 :
1906 0 : if (buf != sbuf)
1907 0 : pfree(buf);
1908 :
1909 0 : return result;
1910 : }
1911 :
1912 : #ifdef USE_ICU
1913 :
1914 : /*
1915 : * pg_strncoll_icu_no_utf8
1916 : *
1917 : * Convert the arguments from the database encoding to UChar strings, then
1918 : * call ucol_strcoll(). An argument length of -1 means that the string is
1919 : * NUL-terminated.
1920 : *
1921 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1922 : * caller should call that instead.
1923 : */
1924 : static int
1925 0 : pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
1926 : const char *arg2, int32_t len2, pg_locale_t locale)
1927 : {
1928 : char sbuf[TEXTBUFLEN];
1929 0 : char *buf = sbuf;
1930 : int32_t ulen1;
1931 : int32_t ulen2;
1932 : size_t bufsize1;
1933 : size_t bufsize2;
1934 : UChar *uchar1,
1935 : *uchar2;
1936 : int result;
1937 :
1938 : Assert(locale->provider == COLLPROVIDER_ICU);
1939 : #ifdef HAVE_UCOL_STRCOLLUTF8
1940 : Assert(GetDatabaseEncoding() != PG_UTF8);
1941 : #endif
1942 :
1943 0 : init_icu_converter();
1944 :
1945 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
1946 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
1947 :
1948 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
1949 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
1950 :
1951 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
1952 0 : buf = palloc(bufsize1 + bufsize2);
1953 :
1954 0 : uchar1 = (UChar *) buf;
1955 0 : uchar2 = (UChar *) (buf + bufsize1);
1956 :
1957 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
1958 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
1959 :
1960 0 : result = ucol_strcoll(locale->info.icu.ucol,
1961 : uchar1, ulen1,
1962 : uchar2, ulen2);
1963 :
1964 0 : if (buf != sbuf)
1965 0 : pfree(buf);
1966 :
1967 0 : return result;
1968 : }
1969 :
1970 : /*
1971 : * pg_strncoll_icu
1972 : *
1973 : * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
1974 : * database encoding. An argument length of -1 means the string is
1975 : * NUL-terminated.
1976 : *
1977 : * Arguments must be encoded in the database encoding.
1978 : */
1979 : static int
1980 6455396 : pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
1981 : pg_locale_t locale)
1982 : {
1983 : int result;
1984 :
1985 : Assert(locale->provider == COLLPROVIDER_ICU);
1986 :
1987 : #ifdef HAVE_UCOL_STRCOLLUTF8
1988 6455396 : if (GetDatabaseEncoding() == PG_UTF8)
1989 : {
1990 : UErrorCode status;
1991 :
1992 6455396 : status = U_ZERO_ERROR;
1993 6455396 : result = ucol_strcollUTF8(locale->info.icu.ucol,
1994 : arg1, len1,
1995 : arg2, len2,
1996 : &status);
1997 6455396 : if (U_FAILURE(status))
1998 0 : ereport(ERROR,
1999 : (errmsg("collation failed: %s", u_errorName(status))));
2000 : }
2001 : else
2002 : #endif
2003 : {
2004 0 : result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
2005 : }
2006 :
2007 6455396 : return result;
2008 : }
2009 :
2010 : #endif /* USE_ICU */
2011 :
2012 : /*
2013 : * pg_strcoll
2014 : *
2015 : * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
2016 : * or wcscoll_l() as appropriate for the given locale, platform, and database
2017 : * encoding. If the locale is not specified, use the database collation.
2018 : *
2019 : * Arguments must be encoded in the database encoding and nul-terminated.
2020 : *
2021 : * The caller is responsible for breaking ties if the collation is
2022 : * deterministic; this maintains consistency with pg_strxfrm(), which cannot
2023 : * easily account for deterministic collations.
2024 : */
2025 : int
2026 5821132 : pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
2027 : {
2028 : int result;
2029 :
2030 5821132 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2031 6 : result = pg_strcoll_libc(arg1, arg2, locale);
2032 : #ifdef USE_ICU
2033 5821126 : else if (locale->provider == COLLPROVIDER_ICU)
2034 5821126 : result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
2035 : #endif
2036 : else
2037 : /* shouldn't happen */
2038 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2039 :
2040 5821132 : return result;
2041 : }
2042 :
2043 : /*
2044 : * pg_strncoll
2045 : *
2046 : * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
2047 : * or wcscoll_l() as appropriate for the given locale, platform, and database
2048 : * encoding. If the locale is not specified, use the database collation.
2049 : *
2050 : * Arguments must be encoded in the database encoding.
2051 : *
2052 : * This function may need to nul-terminate the arguments for libc functions;
2053 : * so if the caller already has nul-terminated strings, it should call
2054 : * pg_strcoll() instead.
2055 : *
2056 : * The caller is responsible for breaking ties if the collation is
2057 : * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
2058 : * easily account for deterministic collations.
2059 : */
2060 : int
2061 634270 : pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
2062 : pg_locale_t locale)
2063 : {
2064 : int result;
2065 :
2066 634270 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2067 0 : result = pg_strncoll_libc(arg1, len1, arg2, len2, locale);
2068 : #ifdef USE_ICU
2069 634270 : else if (locale->provider == COLLPROVIDER_ICU)
2070 634270 : result = pg_strncoll_icu(arg1, len1, arg2, len2, locale);
2071 : #endif
2072 : else
2073 : /* shouldn't happen */
2074 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2075 :
2076 634270 : return result;
2077 : }
2078 :
2079 :
2080 : static size_t
2081 0 : pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
2082 : pg_locale_t locale)
2083 : {
2084 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2085 :
2086 : #ifdef TRUST_STRXFRM
2087 : if (locale)
2088 : return strxfrm_l(dest, src, destsize, locale->info.lt);
2089 : else
2090 : return strxfrm(dest, src, destsize);
2091 : #else
2092 : /* shouldn't happen */
2093 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2094 : return 0; /* keep compiler quiet */
2095 : #endif
2096 : }
2097 :
2098 : static size_t
2099 0 : pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
2100 : pg_locale_t locale)
2101 : {
2102 : char sbuf[TEXTBUFLEN];
2103 0 : char *buf = sbuf;
2104 0 : size_t bufsize = srclen + 1;
2105 : size_t result;
2106 :
2107 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2108 :
2109 0 : if (bufsize > TEXTBUFLEN)
2110 0 : buf = palloc(bufsize);
2111 :
2112 : /* nul-terminate arguments */
2113 0 : memcpy(buf, src, srclen);
2114 0 : buf[srclen] = '\0';
2115 :
2116 0 : result = pg_strxfrm_libc(dest, buf, destsize, locale);
2117 :
2118 0 : if (buf != sbuf)
2119 0 : pfree(buf);
2120 :
2121 : /* if dest is defined, it should be nul-terminated */
2122 : Assert(result >= destsize || dest[result] == '\0');
2123 :
2124 0 : return result;
2125 : }
2126 :
2127 : #ifdef USE_ICU
2128 :
2129 : /* 'srclen' of -1 means the strings are NUL-terminated */
2130 : static size_t
2131 332 : pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
2132 : pg_locale_t locale)
2133 : {
2134 : char sbuf[TEXTBUFLEN];
2135 332 : char *buf = sbuf;
2136 : UChar *uchar;
2137 : int32_t ulen;
2138 : size_t uchar_bsize;
2139 : Size result_bsize;
2140 :
2141 : Assert(locale->provider == COLLPROVIDER_ICU);
2142 :
2143 332 : init_icu_converter();
2144 :
2145 332 : ulen = uchar_length(icu_converter, src, srclen);
2146 :
2147 332 : uchar_bsize = (ulen + 1) * sizeof(UChar);
2148 :
2149 332 : if (uchar_bsize > TEXTBUFLEN)
2150 0 : buf = palloc(uchar_bsize);
2151 :
2152 332 : uchar = (UChar *) buf;
2153 :
2154 332 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2155 :
2156 332 : result_bsize = ucol_getSortKey(locale->info.icu.ucol,
2157 : uchar, ulen,
2158 : (uint8_t *) dest, destsize);
2159 :
2160 : /*
2161 : * ucol_getSortKey() counts the nul-terminator in the result length, but
2162 : * this function should not.
2163 : */
2164 : Assert(result_bsize > 0);
2165 332 : result_bsize--;
2166 :
2167 332 : if (buf != sbuf)
2168 0 : pfree(buf);
2169 :
2170 : /* if dest is defined, it should be nul-terminated */
2171 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
2172 :
2173 332 : return result_bsize;
2174 : }
2175 :
2176 : /* 'srclen' of -1 means the strings are NUL-terminated */
2177 : static size_t
2178 0 : pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
2179 : int32_t destsize, pg_locale_t locale)
2180 : {
2181 : char sbuf[TEXTBUFLEN];
2182 0 : char *buf = sbuf;
2183 : UCharIterator iter;
2184 : uint32_t state[2];
2185 : UErrorCode status;
2186 0 : int32_t ulen = -1;
2187 0 : UChar *uchar = NULL;
2188 : size_t uchar_bsize;
2189 : Size result_bsize;
2190 :
2191 : Assert(locale->provider == COLLPROVIDER_ICU);
2192 : Assert(GetDatabaseEncoding() != PG_UTF8);
2193 :
2194 0 : init_icu_converter();
2195 :
2196 0 : ulen = uchar_length(icu_converter, src, srclen);
2197 :
2198 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
2199 :
2200 0 : if (uchar_bsize > TEXTBUFLEN)
2201 0 : buf = palloc(uchar_bsize);
2202 :
2203 0 : uchar = (UChar *) buf;
2204 :
2205 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2206 :
2207 0 : uiter_setString(&iter, uchar, ulen);
2208 0 : state[0] = state[1] = 0; /* won't need that again */
2209 0 : status = U_ZERO_ERROR;
2210 0 : result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
2211 : &iter,
2212 : state,
2213 : (uint8_t *) dest,
2214 : destsize,
2215 : &status);
2216 0 : if (U_FAILURE(status))
2217 0 : ereport(ERROR,
2218 : (errmsg("sort key generation failed: %s",
2219 : u_errorName(status))));
2220 :
2221 0 : return result_bsize;
2222 : }
2223 :
2224 : /* 'srclen' of -1 means the strings are NUL-terminated */
2225 : static size_t
2226 237192 : pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
2227 : int32_t destsize, pg_locale_t locale)
2228 : {
2229 : size_t result;
2230 :
2231 : Assert(locale->provider == COLLPROVIDER_ICU);
2232 :
2233 237192 : if (GetDatabaseEncoding() == PG_UTF8)
2234 : {
2235 : UCharIterator iter;
2236 : uint32_t state[2];
2237 : UErrorCode status;
2238 :
2239 237192 : uiter_setUTF8(&iter, src, srclen);
2240 237192 : state[0] = state[1] = 0; /* won't need that again */
2241 237192 : status = U_ZERO_ERROR;
2242 237192 : result = ucol_nextSortKeyPart(locale->info.icu.ucol,
2243 : &iter,
2244 : state,
2245 : (uint8_t *) dest,
2246 : destsize,
2247 : &status);
2248 237192 : if (U_FAILURE(status))
2249 0 : ereport(ERROR,
2250 : (errmsg("sort key generation failed: %s",
2251 : u_errorName(status))));
2252 : }
2253 : else
2254 0 : result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize,
2255 : locale);
2256 :
2257 237192 : return result;
2258 : }
2259 :
2260 : #endif
2261 :
2262 : /*
2263 : * Return true if the collation provider supports pg_strxfrm() and
2264 : * pg_strnxfrm(); otherwise false.
2265 : *
2266 : * Unfortunately, it seems that strxfrm() for non-C collations is broken on
2267 : * many common platforms; testing of multiple versions of glibc reveals that,
2268 : * for many locales, strcoll() and strxfrm() do not return consistent
2269 : * results. While no other libc other than Cygwin has so far been shown to
2270 : * have a problem, we take the conservative course of action for right now and
2271 : * disable this categorically. (Users who are certain this isn't a problem on
2272 : * their system can define TRUST_STRXFRM.)
2273 : *
2274 : * No similar problem is known for the ICU provider.
2275 : */
2276 : bool
2277 22294 : pg_strxfrm_enabled(pg_locale_t locale)
2278 : {
2279 22294 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2280 : #ifdef TRUST_STRXFRM
2281 : return true;
2282 : #else
2283 2 : return false;
2284 : #endif
2285 22292 : else if (locale->provider == COLLPROVIDER_ICU)
2286 22292 : return true;
2287 : else
2288 : /* shouldn't happen */
2289 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2290 :
2291 : return false; /* keep compiler quiet */
2292 : }
2293 :
2294 : /*
2295 : * pg_strxfrm
2296 : *
2297 : * Transforms 'src' to a nul-terminated string stored in 'dest' such that
2298 : * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
2299 : * untransformed strings.
2300 : *
2301 : * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
2302 : * may be NULL.
2303 : *
2304 : * Returns the number of bytes needed to store the transformed string,
2305 : * excluding the terminating nul byte. If the value returned is 'destsize' or
2306 : * greater, the resulting contents of 'dest' are undefined.
2307 : */
2308 : size_t
2309 0 : pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
2310 : {
2311 0 : size_t result = 0; /* keep compiler quiet */
2312 :
2313 0 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2314 0 : result = pg_strxfrm_libc(dest, src, destsize, locale);
2315 : #ifdef USE_ICU
2316 0 : else if (locale->provider == COLLPROVIDER_ICU)
2317 0 : result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
2318 : #endif
2319 : else
2320 : /* shouldn't happen */
2321 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2322 :
2323 0 : return result;
2324 : }
2325 :
2326 : /*
2327 : * pg_strnxfrm
2328 : *
2329 : * Transforms 'src' to a nul-terminated string stored in 'dest' such that
2330 : * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
2331 : * untransformed strings.
2332 : *
2333 : * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
2334 : * be NULL.
2335 : *
2336 : * Returns the number of bytes needed to store the transformed string,
2337 : * excluding the terminating nul byte. If the value returned is 'destsize' or
2338 : * greater, the resulting contents of 'dest' are undefined.
2339 : *
2340 : * This function may need to nul-terminate the argument for libc functions;
2341 : * so if the caller already has a nul-terminated string, it should call
2342 : * pg_strxfrm() instead.
2343 : */
2344 : size_t
2345 332 : pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
2346 : pg_locale_t locale)
2347 : {
2348 332 : size_t result = 0; /* keep compiler quiet */
2349 :
2350 332 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2351 0 : result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale);
2352 : #ifdef USE_ICU
2353 332 : else if (locale->provider == COLLPROVIDER_ICU)
2354 332 : result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale);
2355 : #endif
2356 : else
2357 : /* shouldn't happen */
2358 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2359 :
2360 332 : return result;
2361 : }
2362 :
2363 : /*
2364 : * Return true if the collation provider supports pg_strxfrm_prefix() and
2365 : * pg_strnxfrm_prefix(); otherwise false.
2366 : */
2367 : bool
2368 237192 : pg_strxfrm_prefix_enabled(pg_locale_t locale)
2369 : {
2370 237192 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2371 0 : return false;
2372 237192 : else if (locale->provider == COLLPROVIDER_ICU)
2373 237192 : return true;
2374 : else
2375 : /* shouldn't happen */
2376 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2377 :
2378 : return false; /* keep compiler quiet */
2379 : }
2380 :
2381 : /*
2382 : * pg_strxfrm_prefix
2383 : *
2384 : * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
2385 : * memcmp() on the byte sequence is equivalent to pg_strcoll() on
2386 : * untransformed strings. The result is not nul-terminated.
2387 : *
2388 : * The provided 'src' must be nul-terminated.
2389 : *
2390 : * If destsize is not large enough to hold the resulting byte sequence, stores
2391 : * only the first destsize bytes in 'dest'. Returns the number of bytes
2392 : * actually copied to 'dest'.
2393 : */
2394 : size_t
2395 237192 : pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
2396 : pg_locale_t locale)
2397 : {
2398 237192 : size_t result = 0; /* keep compiler quiet */
2399 :
2400 237192 : if (!locale)
2401 0 : PGLOCALE_SUPPORT_ERROR(COLLPROVIDER_LIBC);
2402 : #ifdef USE_ICU
2403 237192 : else if (locale->provider == COLLPROVIDER_ICU)
2404 237192 : result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
2405 : #endif
2406 : else
2407 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2408 :
2409 237192 : return result;
2410 : }
2411 :
2412 : /*
2413 : * pg_strnxfrm_prefix
2414 : *
2415 : * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
2416 : * memcmp() on the byte sequence is equivalent to pg_strcoll() on
2417 : * untransformed strings. The result is not nul-terminated.
2418 : *
2419 : * The provided 'src' must be nul-terminated.
2420 : *
2421 : * If destsize is not large enough to hold the resulting byte sequence, stores
2422 : * only the first destsize bytes in 'dest'. Returns the number of bytes
2423 : * actually copied to 'dest'.
2424 : *
2425 : * This function may need to nul-terminate the argument for libc functions;
2426 : * so if the caller already has a nul-terminated string, it should call
2427 : * pg_strxfrm_prefix() instead.
2428 : */
2429 : size_t
2430 0 : pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
2431 : size_t srclen, pg_locale_t locale)
2432 : {
2433 0 : size_t result = 0; /* keep compiler quiet */
2434 :
2435 0 : if (!locale)
2436 0 : PGLOCALE_SUPPORT_ERROR(COLLPROVIDER_LIBC);
2437 : #ifdef USE_ICU
2438 0 : else if (locale->provider == COLLPROVIDER_ICU)
2439 0 : result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
2440 : #endif
2441 : else
2442 0 : PGLOCALE_SUPPORT_ERROR(locale->provider);
2443 :
2444 0 : return result;
2445 : }
2446 :
2447 : #ifdef USE_ICU
2448 :
2449 : /*
2450 : * Wrapper around ucol_open() to handle API differences for older ICU
2451 : * versions.
2452 : */
2453 : static UCollator *
2454 50742 : pg_ucol_open(const char *loc_str)
2455 : {
2456 : UCollator *collator;
2457 : UErrorCode status;
2458 50742 : const char *orig_str = loc_str;
2459 50742 : char *fixed_str = NULL;
2460 :
2461 : /*
2462 : * Must never open default collator, because it depends on the environment
2463 : * and may change at any time. Should not happen, but check here to catch
2464 : * bugs that might be hard to catch otherwise.
2465 : *
2466 : * NB: the default collator is not the same as the collator for the root
2467 : * locale. The root locale may be specified as the empty string, "und", or
2468 : * "root". The default collator is opened by passing NULL to ucol_open().
2469 : */
2470 50742 : if (loc_str == NULL)
2471 0 : elog(ERROR, "opening default collator is not supported");
2472 :
2473 : /*
2474 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
2475 : * the root locale. If the first component of the locale is "und", replace
2476 : * with "root" before opening.
2477 : */
2478 : if (U_ICU_VERSION_MAJOR_NUM < 55)
2479 : {
2480 : char lang[ULOC_LANG_CAPACITY];
2481 :
2482 : status = U_ZERO_ERROR;
2483 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2484 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2485 : {
2486 : ereport(ERROR,
2487 : (errmsg("could not get language from locale \"%s\": %s",
2488 : loc_str, u_errorName(status))));
2489 : }
2490 :
2491 : if (strcmp(lang, "und") == 0)
2492 : {
2493 : const char *remainder = loc_str + strlen("und");
2494 :
2495 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
2496 : strcpy(fixed_str, "root");
2497 : strcat(fixed_str, remainder);
2498 :
2499 : loc_str = fixed_str;
2500 : }
2501 : }
2502 :
2503 50742 : status = U_ZERO_ERROR;
2504 50742 : collator = ucol_open(loc_str, &status);
2505 50742 : if (U_FAILURE(status))
2506 8 : ereport(ERROR,
2507 : /* use original string for error report */
2508 : (errmsg("could not open collator for locale \"%s\": %s",
2509 : orig_str, u_errorName(status))));
2510 :
2511 : if (U_ICU_VERSION_MAJOR_NUM < 54)
2512 : {
2513 : status = U_ZERO_ERROR;
2514 : icu_set_collation_attributes(collator, loc_str, &status);
2515 :
2516 : /*
2517 : * Pretend the error came from ucol_open(), for consistent error
2518 : * message across ICU versions.
2519 : */
2520 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2521 : {
2522 : ucol_close(collator);
2523 : ereport(ERROR,
2524 : (errmsg("could not open collator for locale \"%s\": %s",
2525 : orig_str, u_errorName(status))));
2526 : }
2527 : }
2528 :
2529 50734 : if (fixed_str != NULL)
2530 0 : pfree(fixed_str);
2531 :
2532 50734 : return collator;
2533 : }
2534 :
2535 : static void
2536 637792 : init_icu_converter(void)
2537 : {
2538 : const char *icu_encoding_name;
2539 : UErrorCode status;
2540 : UConverter *conv;
2541 :
2542 637792 : if (icu_converter)
2543 637764 : return; /* already done */
2544 :
2545 28 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
2546 28 : if (!icu_encoding_name)
2547 0 : ereport(ERROR,
2548 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2549 : errmsg("encoding \"%s\" not supported by ICU",
2550 : pg_encoding_to_char(GetDatabaseEncoding()))));
2551 :
2552 28 : status = U_ZERO_ERROR;
2553 28 : conv = ucnv_open(icu_encoding_name, &status);
2554 28 : if (U_FAILURE(status))
2555 0 : ereport(ERROR,
2556 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
2557 : icu_encoding_name, u_errorName(status))));
2558 :
2559 28 : icu_converter = conv;
2560 : }
2561 :
2562 : /*
2563 : * Find length, in UChars, of given string if converted to UChar string.
2564 : */
2565 : static size_t
2566 319064 : uchar_length(UConverter *converter, const char *str, int32_t len)
2567 : {
2568 319064 : UErrorCode status = U_ZERO_ERROR;
2569 : int32_t ulen;
2570 :
2571 319064 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
2572 319064 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
2573 0 : ereport(ERROR,
2574 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
2575 319064 : return ulen;
2576 : }
2577 :
2578 : /*
2579 : * Convert the given source string into a UChar string, stored in dest, and
2580 : * return the length (in UChars).
2581 : */
2582 : static int32_t
2583 319064 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
2584 : const char *src, int32_t srclen)
2585 : {
2586 319064 : UErrorCode status = U_ZERO_ERROR;
2587 : int32_t ulen;
2588 :
2589 319064 : status = U_ZERO_ERROR;
2590 319064 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
2591 319064 : if (U_FAILURE(status))
2592 0 : ereport(ERROR,
2593 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
2594 319064 : return ulen;
2595 : }
2596 :
2597 : /*
2598 : * Convert a string in the database encoding into a string of UChars.
2599 : *
2600 : * The source string at buff is of length nbytes
2601 : * (it needn't be nul-terminated)
2602 : *
2603 : * *buff_uchar receives a pointer to the palloc'd result string, and
2604 : * the function's result is the number of UChars generated.
2605 : *
2606 : * The result string is nul-terminated, though most callers rely on the
2607 : * result length instead.
2608 : */
2609 : int32_t
2610 318732 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
2611 : {
2612 : int32_t len_uchar;
2613 :
2614 318732 : init_icu_converter();
2615 :
2616 318732 : len_uchar = uchar_length(icu_converter, buff, nbytes);
2617 :
2618 318732 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
2619 318732 : len_uchar = uchar_convert(icu_converter,
2620 : *buff_uchar, len_uchar + 1, buff, nbytes);
2621 :
2622 318732 : return len_uchar;
2623 : }
2624 :
2625 : /*
2626 : * Convert a string of UChars into the database encoding.
2627 : *
2628 : * The source string at buff_uchar is of length len_uchar
2629 : * (it needn't be nul-terminated)
2630 : *
2631 : * *result receives a pointer to the palloc'd result string, and the
2632 : * function's result is the number of bytes generated (not counting nul).
2633 : *
2634 : * The result string is nul-terminated.
2635 : */
2636 : int32_t
2637 318728 : icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
2638 : {
2639 : UErrorCode status;
2640 : int32_t len_result;
2641 :
2642 318728 : init_icu_converter();
2643 :
2644 318728 : status = U_ZERO_ERROR;
2645 318728 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
2646 : buff_uchar, len_uchar, &status);
2647 318728 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
2648 0 : ereport(ERROR,
2649 : (errmsg("%s failed: %s", "ucnv_fromUChars",
2650 : u_errorName(status))));
2651 :
2652 318728 : *result = palloc(len_result + 1);
2653 :
2654 318728 : status = U_ZERO_ERROR;
2655 318728 : len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
2656 : buff_uchar, len_uchar, &status);
2657 318728 : if (U_FAILURE(status) ||
2658 318728 : status == U_STRING_NOT_TERMINATED_WARNING)
2659 0 : ereport(ERROR,
2660 : (errmsg("%s failed: %s", "ucnv_fromUChars",
2661 : u_errorName(status))));
2662 :
2663 318728 : return len_result;
2664 : }
2665 :
2666 : /*
2667 : * Parse collation attributes from the given locale string and apply them to
2668 : * the open collator.
2669 : *
2670 : * First, the locale string is canonicalized to an ICU format locale ID such
2671 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
2672 : * the key-value arguments.
2673 : *
2674 : * Starting with ICU version 54, the attributes are processed automatically by
2675 : * ucol_open(), so this is only necessary for emulating this behavior on older
2676 : * versions.
2677 : */
2678 : pg_attribute_unused()
2679 : static void
2680 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
2681 : UErrorCode *status)
2682 : {
2683 : int32_t len;
2684 : char *icu_locale_id;
2685 : char *lower_str;
2686 : char *str;
2687 :
2688 : /*
2689 : * The input locale may be a BCP 47 language tag, e.g.
2690 : * "und-u-kc-ks-level1", which expresses the same attributes in a
2691 : * different form. It will be converted to the equivalent ICU format
2692 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
2693 : * uloc_canonicalize().
2694 : */
2695 0 : *status = U_ZERO_ERROR;
2696 0 : len = uloc_canonicalize(loc, NULL, 0, status);
2697 0 : icu_locale_id = palloc(len + 1);
2698 0 : *status = U_ZERO_ERROR;
2699 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
2700 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
2701 0 : return;
2702 :
2703 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
2704 :
2705 0 : pfree(icu_locale_id);
2706 :
2707 0 : str = strchr(lower_str, '@');
2708 0 : if (!str)
2709 0 : return;
2710 0 : str++;
2711 :
2712 0 : for (char *token = strtok(str, ";"); token; token = strtok(NULL, ";"))
2713 : {
2714 0 : char *e = strchr(token, '=');
2715 :
2716 0 : if (e)
2717 : {
2718 : char *name;
2719 : char *value;
2720 : UColAttribute uattr;
2721 : UColAttributeValue uvalue;
2722 :
2723 0 : *status = U_ZERO_ERROR;
2724 :
2725 0 : *e = '\0';
2726 0 : name = token;
2727 0 : value = e + 1;
2728 :
2729 : /*
2730 : * See attribute name and value lists in ICU i18n/coll.cpp
2731 : */
2732 0 : if (strcmp(name, "colstrength") == 0)
2733 0 : uattr = UCOL_STRENGTH;
2734 0 : else if (strcmp(name, "colbackwards") == 0)
2735 0 : uattr = UCOL_FRENCH_COLLATION;
2736 0 : else if (strcmp(name, "colcaselevel") == 0)
2737 0 : uattr = UCOL_CASE_LEVEL;
2738 0 : else if (strcmp(name, "colcasefirst") == 0)
2739 0 : uattr = UCOL_CASE_FIRST;
2740 0 : else if (strcmp(name, "colalternate") == 0)
2741 0 : uattr = UCOL_ALTERNATE_HANDLING;
2742 0 : else if (strcmp(name, "colnormalization") == 0)
2743 0 : uattr = UCOL_NORMALIZATION_MODE;
2744 0 : else if (strcmp(name, "colnumeric") == 0)
2745 0 : uattr = UCOL_NUMERIC_COLLATION;
2746 : else
2747 : /* ignore if unknown */
2748 0 : continue;
2749 :
2750 0 : if (strcmp(value, "primary") == 0)
2751 0 : uvalue = UCOL_PRIMARY;
2752 0 : else if (strcmp(value, "secondary") == 0)
2753 0 : uvalue = UCOL_SECONDARY;
2754 0 : else if (strcmp(value, "tertiary") == 0)
2755 0 : uvalue = UCOL_TERTIARY;
2756 0 : else if (strcmp(value, "quaternary") == 0)
2757 0 : uvalue = UCOL_QUATERNARY;
2758 0 : else if (strcmp(value, "identical") == 0)
2759 0 : uvalue = UCOL_IDENTICAL;
2760 0 : else if (strcmp(value, "no") == 0)
2761 0 : uvalue = UCOL_OFF;
2762 0 : else if (strcmp(value, "yes") == 0)
2763 0 : uvalue = UCOL_ON;
2764 0 : else if (strcmp(value, "shifted") == 0)
2765 0 : uvalue = UCOL_SHIFTED;
2766 0 : else if (strcmp(value, "non-ignorable") == 0)
2767 0 : uvalue = UCOL_NON_IGNORABLE;
2768 0 : else if (strcmp(value, "lower") == 0)
2769 0 : uvalue = UCOL_LOWER_FIRST;
2770 0 : else if (strcmp(value, "upper") == 0)
2771 0 : uvalue = UCOL_UPPER_FIRST;
2772 : else
2773 : {
2774 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
2775 0 : break;
2776 : }
2777 :
2778 0 : ucol_setAttribute(collator, uattr, uvalue, status);
2779 : }
2780 : }
2781 :
2782 0 : pfree(lower_str);
2783 : }
2784 : #endif
2785 :
2786 : /*
2787 : * Return the BCP47 language tag representation of the requested locale.
2788 : *
2789 : * This function should be called before passing the string to ucol_open(),
2790 : * because conversion to a language tag also performs "level 2
2791 : * canonicalization". In addition to producing a consistent format, level 2
2792 : * canonicalization is able to more accurately interpret different input
2793 : * locale string formats, such as POSIX and .NET IDs.
2794 : */
2795 : char *
2796 47044 : icu_language_tag(const char *loc_str, int elevel)
2797 : {
2798 : #ifdef USE_ICU
2799 : UErrorCode status;
2800 : char *langtag;
2801 47044 : size_t buflen = 32; /* arbitrary starting buffer size */
2802 47044 : const bool strict = true;
2803 :
2804 : /*
2805 : * A BCP47 language tag doesn't have a clearly-defined upper limit (cf.
2806 : * RFC5646 section 4.4). Additionally, in older ICU versions,
2807 : * uloc_toLanguageTag() doesn't always return the ultimate length on the
2808 : * first call, necessitating a loop.
2809 : */
2810 47044 : langtag = palloc(buflen);
2811 : while (true)
2812 : {
2813 47044 : status = U_ZERO_ERROR;
2814 47044 : uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
2815 :
2816 : /* try again if the buffer is not large enough */
2817 47044 : if ((status == U_BUFFER_OVERFLOW_ERROR ||
2818 47044 : status == U_STRING_NOT_TERMINATED_WARNING) &&
2819 : buflen < MaxAllocSize)
2820 : {
2821 0 : buflen = Min(buflen * 2, MaxAllocSize);
2822 0 : langtag = repalloc(langtag, buflen);
2823 0 : continue;
2824 : }
2825 :
2826 47044 : break;
2827 : }
2828 :
2829 47044 : if (U_FAILURE(status))
2830 : {
2831 6 : pfree(langtag);
2832 :
2833 6 : if (elevel > 0)
2834 6 : ereport(elevel,
2835 : (errmsg("could not convert locale name \"%s\" to language tag: %s",
2836 : loc_str, u_errorName(status))));
2837 4 : return NULL;
2838 : }
2839 :
2840 47038 : return langtag;
2841 : #else /* not USE_ICU */
2842 : ereport(ERROR,
2843 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2844 : errmsg("ICU is not supported in this build")));
2845 : return NULL; /* keep compiler quiet */
2846 : #endif /* not USE_ICU */
2847 : }
2848 :
2849 : /*
2850 : * Perform best-effort check that the locale is a valid one.
2851 : */
2852 : void
2853 102 : icu_validate_locale(const char *loc_str)
2854 : {
2855 : #ifdef USE_ICU
2856 : UCollator *collator;
2857 : UErrorCode status;
2858 : char lang[ULOC_LANG_CAPACITY];
2859 102 : bool found = false;
2860 102 : int elevel = icu_validation_level;
2861 :
2862 : /* no validation */
2863 102 : if (elevel < 0)
2864 4 : return;
2865 :
2866 : /* downgrade to WARNING during pg_upgrade */
2867 98 : if (IsBinaryUpgrade && elevel > WARNING)
2868 0 : elevel = WARNING;
2869 :
2870 : /* validate that we can extract the language */
2871 98 : status = U_ZERO_ERROR;
2872 98 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2873 98 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2874 : {
2875 0 : ereport(elevel,
2876 : (errmsg("could not get language from ICU locale \"%s\": %s",
2877 : loc_str, u_errorName(status)),
2878 : errhint("To disable ICU locale validation, set the parameter %s to \"%s\".",
2879 : "icu_validation_level", "disabled")));
2880 0 : return;
2881 : }
2882 :
2883 : /* check for special language name */
2884 98 : if (strcmp(lang, "") == 0 ||
2885 54 : strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0)
2886 44 : found = true;
2887 :
2888 : /* search for matching language within ICU */
2889 15732 : for (int32_t i = 0; !found && i < uloc_countAvailable(); i++)
2890 : {
2891 15634 : const char *otherloc = uloc_getAvailable(i);
2892 : char otherlang[ULOC_LANG_CAPACITY];
2893 :
2894 15634 : status = U_ZERO_ERROR;
2895 15634 : uloc_getLanguage(otherloc, otherlang, ULOC_LANG_CAPACITY, &status);
2896 15634 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
2897 0 : continue;
2898 :
2899 15634 : if (strcmp(lang, otherlang) == 0)
2900 48 : found = true;
2901 : }
2902 :
2903 98 : if (!found)
2904 6 : ereport(elevel,
2905 : (errmsg("ICU locale \"%s\" has unknown language \"%s\"",
2906 : loc_str, lang),
2907 : errhint("To disable ICU locale validation, set the parameter %s to \"%s\".",
2908 : "icu_validation_level", "disabled")));
2909 :
2910 : /* check that it can be opened */
2911 96 : collator = pg_ucol_open(loc_str);
2912 92 : ucol_close(collator);
2913 : #else /* not USE_ICU */
2914 : /* could get here if a collation was created by a build with ICU */
2915 : ereport(ERROR,
2916 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2917 : errmsg("ICU is not supported in this build")));
2918 : #endif /* not USE_ICU */
2919 : }
2920 :
2921 : /*
2922 : * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
2923 : * Therefore we keep them here rather than with the mbutils code.
2924 : */
2925 :
2926 : /*
2927 : * wchar2char --- convert wide characters to multibyte format
2928 : *
2929 : * This has the same API as the standard wcstombs_l() function; in particular,
2930 : * tolen is the maximum number of bytes to store at *to, and *from must be
2931 : * zero-terminated. The output will be zero-terminated iff there is room.
2932 : */
2933 : size_t
2934 0 : wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
2935 : {
2936 : size_t result;
2937 :
2938 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2939 :
2940 0 : if (tolen == 0)
2941 0 : return 0;
2942 :
2943 : #ifdef WIN32
2944 :
2945 : /*
2946 : * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
2947 : * for some reason mbstowcs and wcstombs won't do this for us, so we use
2948 : * MultiByteToWideChar().
2949 : */
2950 : if (GetDatabaseEncoding() == PG_UTF8)
2951 : {
2952 : result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
2953 : NULL, NULL);
2954 : /* A zero return is failure */
2955 : if (result <= 0)
2956 : result = -1;
2957 : else
2958 : {
2959 : Assert(result <= tolen);
2960 : /* Microsoft counts the zero terminator in the result */
2961 : result--;
2962 : }
2963 : }
2964 : else
2965 : #endif /* WIN32 */
2966 0 : if (locale == (pg_locale_t) 0)
2967 : {
2968 : /* Use wcstombs directly for the default locale */
2969 0 : result = wcstombs(to, from, tolen);
2970 : }
2971 : else
2972 : {
2973 : /* Use wcstombs_l for nondefault locales */
2974 0 : result = wcstombs_l(to, from, tolen, locale->info.lt);
2975 : }
2976 :
2977 0 : return result;
2978 : }
2979 :
2980 : /*
2981 : * char2wchar --- convert multibyte characters to wide characters
2982 : *
2983 : * This has almost the API of mbstowcs_l(), except that *from need not be
2984 : * null-terminated; instead, the number of input bytes is specified as
2985 : * fromlen. Also, we ereport() rather than returning -1 for invalid
2986 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
2987 : * The output will be zero-terminated iff there is room.
2988 : */
2989 : size_t
2990 0 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
2991 : pg_locale_t locale)
2992 : {
2993 : size_t result;
2994 :
2995 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2996 :
2997 0 : if (tolen == 0)
2998 0 : return 0;
2999 :
3000 : #ifdef WIN32
3001 : /* See WIN32 "Unicode" comment above */
3002 : if (GetDatabaseEncoding() == PG_UTF8)
3003 : {
3004 : /* Win32 API does not work for zero-length input */
3005 : if (fromlen == 0)
3006 : result = 0;
3007 : else
3008 : {
3009 : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
3010 : /* A zero return is failure */
3011 : if (result == 0)
3012 : result = -1;
3013 : }
3014 :
3015 : if (result != -1)
3016 : {
3017 : Assert(result < tolen);
3018 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
3019 : to[result] = 0;
3020 : }
3021 : }
3022 : else
3023 : #endif /* WIN32 */
3024 : {
3025 : /* mbstowcs requires ending '\0' */
3026 0 : char *str = pnstrdup(from, fromlen);
3027 :
3028 0 : if (locale == (pg_locale_t) 0)
3029 : {
3030 : /* Use mbstowcs directly for the default locale */
3031 0 : result = mbstowcs(to, str, tolen);
3032 : }
3033 : else
3034 : {
3035 : /* Use mbstowcs_l for nondefault locales */
3036 0 : result = mbstowcs_l(to, str, tolen, locale->info.lt);
3037 : }
3038 :
3039 0 : pfree(str);
3040 : }
3041 :
3042 0 : if (result == -1)
3043 : {
3044 : /*
3045 : * Invalid multibyte character encountered. We try to give a useful
3046 : * error message by letting pg_verifymbstr check the string. But it's
3047 : * possible that the string is OK to us, and not OK to mbstowcs ---
3048 : * this suggests that the LC_CTYPE locale is different from the
3049 : * database encoding. Give a generic error message if pg_verifymbstr
3050 : * can't find anything wrong.
3051 : */
3052 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
3053 : /* but if it does ... */
3054 0 : ereport(ERROR,
3055 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
3056 : errmsg("invalid multibyte character for locale"),
3057 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
3058 : }
3059 :
3060 0 : return result;
3061 : }
|