Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for ICU
4 : *
5 : * Portions Copyright (c) 2002-2026, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_icu.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #ifdef USE_ICU
15 : #include <unicode/ucasemap.h>
16 : #include <unicode/ucnv.h>
17 : #include <unicode/ucol.h>
18 : #include <unicode/ustring.h>
19 :
20 : /*
21 : * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
22 : * (see
23 : * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
24 : */
25 : #if U_ICU_VERSION_MAJOR_NUM >= 53
26 : #define HAVE_UCOL_STRCOLLUTF8 1
27 : #else
28 : #undef HAVE_UCOL_STRCOLLUTF8
29 : #endif
30 :
31 : #endif
32 :
33 : #include "access/htup_details.h"
34 : #include "catalog/pg_database.h"
35 : #include "catalog/pg_collation.h"
36 : #include "mb/pg_wchar.h"
37 : #include "miscadmin.h"
38 : #include "utils/builtins.h"
39 : #include "utils/formatting.h"
40 : #include "utils/memutils.h"
41 : #include "utils/pg_locale.h"
42 : #include "utils/syscache.h"
43 :
44 : /*
45 : * Size of stack buffer to use for string transformations, used to avoid heap
46 : * allocations in typical cases. This should be large enough that most strings
47 : * will fit, but small enough that we feel comfortable putting it on the
48 : * stack.
49 : */
50 : #define TEXTBUFLEN 1024
51 :
52 : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
53 :
54 : #ifdef USE_ICU
55 :
56 : extern UCollator *pg_ucol_open(const char *loc_str);
57 : static UCaseMap *pg_ucasemap_open(const char *loc_str);
58 :
59 : static size_t strlower_icu(char *dest, size_t destsize, const char *src,
60 : size_t srclen, pg_locale_t locale);
61 : static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
62 : size_t srclen, pg_locale_t locale);
63 : static size_t strupper_icu(char *dest, size_t destsize, const char *src,
64 : size_t srclen, pg_locale_t locale);
65 : static size_t strfold_icu(char *dest, size_t destsize, const char *src,
66 : size_t srclen, pg_locale_t locale);
67 : static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src,
68 : size_t srclen, pg_locale_t locale);
69 : static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src,
70 : size_t srclen, pg_locale_t locale);
71 : static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src,
72 : size_t srclen, pg_locale_t locale);
73 : static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src,
74 : size_t srclen, pg_locale_t locale);
75 : static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
76 : size_t srclen, pg_locale_t locale);
77 : static int strncoll_icu(const char *arg1, size_t len1,
78 : const char *arg2, size_t len2,
79 : pg_locale_t locale);
80 : static int strcoll_icu(const char *arg1, const char *arg2,
81 : pg_locale_t locale);
82 : static size_t strnxfrm_icu(char *dest, size_t destsize,
83 : const char *src, size_t srclen,
84 : pg_locale_t locale);
85 : static size_t strxfrm_icu(char *dest, size_t destsize, const char *src,
86 : pg_locale_t locale);
87 : extern char *get_collation_actual_version_icu(const char *collcollate);
88 :
89 : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
90 : const UChar *src, int32_t srcLength,
91 : const char *locale,
92 : UErrorCode *pErrorCode);
93 :
94 : /*
95 : * Converter object for converting between ICU's UChar strings and C strings
96 : * in database encoding. Since the database encoding doesn't change, we only
97 : * need one of these per session.
98 : */
99 : static UConverter *icu_converter = NULL;
100 :
101 : static UCollator *make_icu_collator(const char *iculocstr,
102 : const char *icurules);
103 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
104 : const char *src, size_t srclen,
105 : pg_locale_t locale);
106 : static size_t strxfrm_prefix_icu(char *dest, size_t destsize, const char *src,
107 : pg_locale_t locale);
108 : #ifdef HAVE_UCOL_STRCOLLUTF8
109 : static int strncoll_icu_utf8(const char *arg1, size_t len1,
110 : const char *arg2, size_t len2,
111 : pg_locale_t locale);
112 : static int strcoll_icu_utf8(const char *arg1,
113 : const char *arg2,
114 : pg_locale_t locale);
115 : #endif
116 : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
117 : const char *src, size_t srclen,
118 : pg_locale_t locale);
119 : static size_t strxfrm_prefix_icu_utf8(char *dest, size_t destsize, const char *src,
120 : pg_locale_t locale);
121 : static void init_icu_converter(void);
122 : static int32_t uchar_length(UConverter *converter,
123 : const char *str, int32_t len);
124 : static int32_t uchar_convert(UConverter *converter,
125 : UChar *dest, int32_t destlen,
126 : const char *src, int32_t srclen);
127 : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
128 : size_t nbytes);
129 : static size_t icu_from_uchar(char *dest, size_t destsize,
130 : const UChar *buff_uchar, int32_t len_uchar);
131 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
132 : UErrorCode *status);
133 : static int32_t icu_convert_case(ICU_Convert_Func func, char *dest,
134 : size_t destsize, const char *src,
135 : size_t srclen, pg_locale_t locale);
136 : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
137 : const UChar *src, int32_t srcLength,
138 : const char *locale,
139 : UErrorCode *pErrorCode);
140 : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
141 : const UChar *src, int32_t srcLength,
142 : const char *locale,
143 : UErrorCode *pErrorCode);
144 : static int32_t foldcase_options(const char *locale);
145 :
146 : /*
147 : * XXX: many of the functions below rely on casts directly from pg_wchar to
148 : * UChar32, which is correct for UTF-8 and LATIN1, but not in general.
149 : */
150 :
151 : static pg_wchar
152 72 : toupper_icu(pg_wchar wc, pg_locale_t locale)
153 : {
154 72 : return u_toupper(wc);
155 : }
156 :
157 : static pg_wchar
158 72 : tolower_icu(pg_wchar wc, pg_locale_t locale)
159 : {
160 72 : return u_tolower(wc);
161 : }
162 :
163 : static const struct collate_methods collate_methods_icu = {
164 : .strncoll = strncoll_icu,
165 : .strcoll = strcoll_icu,
166 : .strnxfrm = strnxfrm_icu,
167 : .strxfrm = strxfrm_icu,
168 : .strnxfrm_prefix = strnxfrm_prefix_icu,
169 : .strxfrm_prefix = strxfrm_prefix_icu,
170 : .strxfrm_is_safe = true,
171 : };
172 :
173 : static const struct collate_methods collate_methods_icu_utf8 = {
174 : #ifdef HAVE_UCOL_STRCOLLUTF8
175 : .strncoll = strncoll_icu_utf8,
176 : .strcoll = strcoll_icu_utf8,
177 : #else
178 : .strncoll = strncoll_icu,
179 : .strcoll = strcoll_icu,
180 : #endif
181 : .strnxfrm = strnxfrm_icu,
182 : .strxfrm = strxfrm_icu,
183 : .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
184 : .strxfrm_prefix = strxfrm_prefix_icu_utf8,
185 : .strxfrm_is_safe = true,
186 : };
187 :
188 : static bool
189 8192 : wc_isdigit_icu(pg_wchar wc, pg_locale_t locale)
190 : {
191 8192 : return u_isdigit(wc);
192 : }
193 :
194 : static bool
195 8192 : wc_isalpha_icu(pg_wchar wc, pg_locale_t locale)
196 : {
197 8192 : return u_isalpha(wc);
198 : }
199 :
200 : static bool
201 8192 : wc_isalnum_icu(pg_wchar wc, pg_locale_t locale)
202 : {
203 8192 : return u_isalnum(wc);
204 : }
205 :
206 : static bool
207 8192 : wc_isupper_icu(pg_wchar wc, pg_locale_t locale)
208 : {
209 8192 : return u_isupper(wc);
210 : }
211 :
212 : static bool
213 8192 : wc_islower_icu(pg_wchar wc, pg_locale_t locale)
214 : {
215 8192 : return u_islower(wc);
216 : }
217 :
218 : static bool
219 8192 : wc_isgraph_icu(pg_wchar wc, pg_locale_t locale)
220 : {
221 8192 : return u_isgraph(wc);
222 : }
223 :
224 : static bool
225 8192 : wc_isprint_icu(pg_wchar wc, pg_locale_t locale)
226 : {
227 8192 : return u_isprint(wc);
228 : }
229 :
230 : static bool
231 8192 : wc_ispunct_icu(pg_wchar wc, pg_locale_t locale)
232 : {
233 8192 : return u_ispunct(wc);
234 : }
235 :
236 : static bool
237 8192 : wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
238 : {
239 8192 : return u_isspace(wc);
240 : }
241 :
242 : static bool
243 0 : wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
244 : {
245 0 : return u_isxdigit(wc);
246 : }
247 :
248 : static bool
249 105 : wc_iscased_icu(pg_wchar wc, pg_locale_t locale)
250 : {
251 105 : return u_hasBinaryProperty(wc, UCHAR_CASED);
252 : }
253 :
254 : static const struct ctype_methods ctype_methods_icu = {
255 : .strlower = strlower_icu,
256 : .strtitle = strtitle_icu,
257 : .strupper = strupper_icu,
258 : .strfold = strfold_icu,
259 : .downcase_ident = downcase_ident_icu,
260 : .wc_isdigit = wc_isdigit_icu,
261 : .wc_isalpha = wc_isalpha_icu,
262 : .wc_isalnum = wc_isalnum_icu,
263 : .wc_isupper = wc_isupper_icu,
264 : .wc_islower = wc_islower_icu,
265 : .wc_isgraph = wc_isgraph_icu,
266 : .wc_isprint = wc_isprint_icu,
267 : .wc_ispunct = wc_ispunct_icu,
268 : .wc_isspace = wc_isspace_icu,
269 : .wc_isxdigit = wc_isxdigit_icu,
270 : .wc_iscased = wc_iscased_icu,
271 : .wc_toupper = toupper_icu,
272 : .wc_tolower = tolower_icu,
273 : };
274 :
275 : static const struct ctype_methods ctype_methods_icu_utf8 = {
276 : .strlower = strlower_icu_utf8,
277 : .strtitle = strtitle_icu_utf8,
278 : .strupper = strupper_icu_utf8,
279 : .strfold = strfold_icu_utf8,
280 : /* uses plain ASCII semantics for historical reasons */
281 : .downcase_ident = NULL,
282 : .wc_isdigit = wc_isdigit_icu,
283 : .wc_isalpha = wc_isalpha_icu,
284 : .wc_isalnum = wc_isalnum_icu,
285 : .wc_isupper = wc_isupper_icu,
286 : .wc_islower = wc_islower_icu,
287 : .wc_isgraph = wc_isgraph_icu,
288 : .wc_isprint = wc_isprint_icu,
289 : .wc_ispunct = wc_ispunct_icu,
290 : .wc_isspace = wc_isspace_icu,
291 : .wc_isxdigit = wc_isxdigit_icu,
292 : .wc_iscased = wc_iscased_icu,
293 : .wc_toupper = toupper_icu,
294 : .wc_tolower = tolower_icu,
295 : };
296 :
297 : /*
298 : * ICU still depends on libc for compatibility with certain historical
299 : * behavior for single-byte encodings. See downcase_ident_icu().
300 : *
301 : * XXX: consider fixing by decoding the single byte into a code point, and
302 : * using u_tolower().
303 : */
304 : static locale_t
305 0 : make_libc_ctype_locale(const char *ctype)
306 : {
307 : locale_t loc;
308 :
309 : #ifndef WIN32
310 0 : loc = newlocale(LC_CTYPE_MASK, ctype, NULL);
311 : #else
312 : loc = _create_locale(LC_ALL, ctype);
313 : #endif
314 0 : if (!loc)
315 0 : report_newlocale_failure(ctype);
316 :
317 0 : return loc;
318 : }
319 : #endif
320 :
321 : pg_locale_t
322 137 : create_pg_locale_icu(Oid collid, MemoryContext context)
323 : {
324 : #ifdef USE_ICU
325 : bool deterministic;
326 : const char *iculocstr;
327 137 : const char *icurules = NULL;
328 : UCollator *collator;
329 137 : locale_t loc = (locale_t) 0;
330 : pg_locale_t result;
331 :
332 137 : if (collid == DEFAULT_COLLATION_OID)
333 : {
334 : HeapTuple tp;
335 : Datum datum;
336 : bool isnull;
337 :
338 13 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
339 13 : if (!HeapTupleIsValid(tp))
340 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
341 :
342 : /* default database collation is always deterministic */
343 13 : deterministic = true;
344 13 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
345 : Anum_pg_database_datlocale);
346 13 : iculocstr = TextDatumGetCString(datum);
347 13 : datum = SysCacheGetAttr(DATABASEOID, tp,
348 : Anum_pg_database_daticurules, &isnull);
349 13 : if (!isnull)
350 0 : icurules = TextDatumGetCString(datum);
351 :
352 : /* libc only needed for default locale and single-byte encoding */
353 13 : if (pg_database_encoding_max_length() == 1)
354 : {
355 : const char *ctype;
356 :
357 0 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
358 : Anum_pg_database_datctype);
359 0 : ctype = TextDatumGetCString(datum);
360 :
361 0 : loc = make_libc_ctype_locale(ctype);
362 : }
363 :
364 13 : ReleaseSysCache(tp);
365 : }
366 : else
367 : {
368 : Form_pg_collation collform;
369 : HeapTuple tp;
370 : Datum datum;
371 : bool isnull;
372 :
373 124 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
374 124 : if (!HeapTupleIsValid(tp))
375 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
376 124 : collform = (Form_pg_collation) GETSTRUCT(tp);
377 124 : deterministic = collform->collisdeterministic;
378 124 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
379 : Anum_pg_collation_colllocale);
380 124 : iculocstr = TextDatumGetCString(datum);
381 124 : datum = SysCacheGetAttr(COLLOID, tp,
382 : Anum_pg_collation_collicurules, &isnull);
383 124 : if (!isnull)
384 12 : icurules = TextDatumGetCString(datum);
385 :
386 124 : ReleaseSysCache(tp);
387 : }
388 :
389 137 : collator = make_icu_collator(iculocstr, icurules);
390 :
391 131 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
392 131 : result->icu.locale = MemoryContextStrdup(context, iculocstr);
393 131 : result->icu.ucol = collator;
394 131 : result->icu.lt = loc;
395 131 : result->deterministic = deterministic;
396 131 : result->collate_is_c = false;
397 131 : result->ctype_is_c = false;
398 131 : if (GetDatabaseEncoding() == PG_UTF8)
399 : {
400 131 : result->icu.ucasemap = pg_ucasemap_open(iculocstr);
401 131 : result->collate = &collate_methods_icu_utf8;
402 131 : result->ctype = &ctype_methods_icu_utf8;
403 : }
404 : else
405 : {
406 0 : result->collate = &collate_methods_icu;
407 0 : result->ctype = &ctype_methods_icu;
408 : }
409 :
410 131 : return result;
411 : #else
412 : /* could get here if a collation was created by a build with ICU */
413 : ereport(ERROR,
414 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
415 : errmsg("ICU is not supported in this build")));
416 :
417 : return NULL;
418 : #endif
419 : }
420 :
421 : #ifdef USE_ICU
422 :
423 : /*
424 : * Check locale string and fix it if necessary. Returns a new palloc'd string.
425 : *
426 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of the
427 : * root locale. If the first component of the locale is "und", replace with
428 : * "root" before opening.
429 : */
430 : static char *
431 48512 : fix_icu_locale_str(const char *loc_str)
432 : {
433 : /*
434 : * Must never open default collator, because it depends on the environment
435 : * and may change at any time. Should not happen, but check here to catch
436 : * bugs that might be hard to catch otherwise.
437 : *
438 : * NB: the default collator is not the same as the collator for the root
439 : * locale. The root locale may be specified as the empty string, "und", or
440 : * "root". The default collator is opened by passing NULL to ucol_open().
441 : */
442 48512 : if (loc_str == NULL)
443 0 : elog(ERROR, "opening default collator is not supported");
444 :
445 : if (U_ICU_VERSION_MAJOR_NUM < 55)
446 : {
447 : char lang[ULOC_LANG_CAPACITY];
448 : UErrorCode status = U_ZERO_ERROR;
449 :
450 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
451 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
452 : {
453 : ereport(ERROR,
454 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
455 : errmsg("could not get language from locale \"%s\": %s",
456 : loc_str, u_errorName(status))));
457 : }
458 :
459 : if (strcmp(lang, "und") == 0)
460 : {
461 : const char *remainder = loc_str + strlen("und");
462 : char *fixed_str;
463 :
464 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
465 : strcpy(fixed_str, "root");
466 : strcat(fixed_str, remainder);
467 :
468 : return fixed_str;
469 : }
470 : }
471 :
472 48512 : return pstrdup(loc_str);
473 : }
474 :
475 : /*
476 : * Wrapper around ucol_open() to handle API differences for older ICU
477 : * versions.
478 : *
479 : * Ensure that no path leaks a UCollator.
480 : */
481 : UCollator *
482 48381 : pg_ucol_open(const char *loc_str)
483 : {
484 : UCollator *collator;
485 : UErrorCode status;
486 : char *fixed_str;
487 :
488 48381 : fixed_str = fix_icu_locale_str(loc_str);
489 :
490 48381 : status = U_ZERO_ERROR;
491 48381 : collator = ucol_open(fixed_str, &status);
492 48381 : if (U_FAILURE(status))
493 7 : ereport(ERROR,
494 : /* use original string for error report */
495 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
496 : errmsg("could not open collator for locale \"%s\": %s",
497 : loc_str, u_errorName(status))));
498 :
499 : if (U_ICU_VERSION_MAJOR_NUM < 54)
500 : {
501 : status = U_ZERO_ERROR;
502 : icu_set_collation_attributes(collator, fixed_str, &status);
503 :
504 : /*
505 : * Pretend the error came from ucol_open(), for consistent error
506 : * message across ICU versions.
507 : */
508 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
509 : {
510 : ucol_close(collator);
511 : ereport(ERROR,
512 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
513 : errmsg("could not open collator for locale \"%s\": %s",
514 : loc_str, u_errorName(status))));
515 : }
516 : }
517 :
518 48374 : pfree(fixed_str);
519 :
520 48374 : return collator;
521 : }
522 :
523 : /*
524 : * Wrapper around ucasemap_open() to handle API differences for older ICU
525 : * versions.
526 : *
527 : * Additionally makes sure we get the right options for case folding.
528 : */
529 : static UCaseMap *
530 131 : pg_ucasemap_open(const char *loc_str)
531 : {
532 131 : UErrorCode status = U_ZERO_ERROR;
533 : UCaseMap *casemap;
534 : char *fixed_str;
535 :
536 131 : fixed_str = fix_icu_locale_str(loc_str);
537 :
538 131 : casemap = ucasemap_open(fixed_str, foldcase_options(fixed_str), &status);
539 131 : if (U_FAILURE(status))
540 : /* use original string for error report */
541 0 : ereport(ERROR,
542 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
543 : errmsg("could not open casemap for locale \"%s\": %s",
544 : loc_str, u_errorName(status)));
545 :
546 131 : pfree(fixed_str);
547 :
548 131 : return casemap;
549 : }
550 :
551 : /*
552 : * Create a UCollator with the given locale string and rules.
553 : *
554 : * Ensure that no path leaks a UCollator.
555 : */
556 : static UCollator *
557 137 : make_icu_collator(const char *iculocstr, const char *icurules)
558 : {
559 137 : if (!icurules)
560 : {
561 : /* simple case without rules */
562 125 : return pg_ucol_open(iculocstr);
563 : }
564 : else
565 : {
566 : UCollator *collator_std_rules;
567 : UCollator *collator_all_rules;
568 : const UChar *std_rules;
569 : UChar *my_rules;
570 : UChar *all_rules;
571 : int32_t length;
572 : int32_t total;
573 : UErrorCode status;
574 :
575 : /*
576 : * If rules are specified, we extract the rules of the standard
577 : * collation, add our own rules, and make a new collator with the
578 : * combined rules.
579 : */
580 12 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
581 :
582 12 : collator_std_rules = pg_ucol_open(iculocstr);
583 :
584 12 : std_rules = ucol_getRules(collator_std_rules, &length);
585 :
586 12 : total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
587 :
588 : /* avoid leaking collator on OOM */
589 12 : all_rules = palloc_array_extended(UChar, total, MCXT_ALLOC_NO_OOM);
590 12 : if (!all_rules)
591 : {
592 0 : ucol_close(collator_std_rules);
593 0 : ereport(ERROR,
594 : (errcode(ERRCODE_OUT_OF_MEMORY),
595 : errmsg("out of memory")));
596 : }
597 :
598 12 : u_strcpy(all_rules, std_rules);
599 12 : u_strcat(all_rules, my_rules);
600 :
601 12 : ucol_close(collator_std_rules);
602 :
603 12 : status = U_ZERO_ERROR;
604 12 : collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
605 : UCOL_DEFAULT, UCOL_DEFAULT,
606 : NULL, &status);
607 12 : if (U_FAILURE(status))
608 : {
609 4 : ereport(ERROR,
610 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
611 : errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
612 : iculocstr, icurules, u_errorName(status))));
613 : }
614 :
615 8 : pfree(my_rules);
616 8 : pfree(all_rules);
617 8 : return collator_all_rules;
618 : }
619 : }
620 :
621 : static size_t
622 0 : strlower_icu(char *dest, size_t destsize, const char *src, size_t srclen,
623 : pg_locale_t locale)
624 : {
625 0 : return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale);
626 : }
627 :
628 : static size_t
629 0 : strtitle_icu(char *dest, size_t destsize, const char *src, size_t srclen,
630 : pg_locale_t locale)
631 : {
632 0 : return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale);
633 : }
634 :
635 : static size_t
636 0 : strupper_icu(char *dest, size_t destsize, const char *src, size_t srclen,
637 : pg_locale_t locale)
638 : {
639 0 : return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale);
640 : }
641 :
642 : static size_t
643 0 : strfold_icu(char *dest, size_t destsize, const char *src, size_t srclen,
644 : pg_locale_t locale)
645 : {
646 0 : return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale);
647 : }
648 :
649 : static size_t
650 364 : strlower_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen,
651 : pg_locale_t locale)
652 : {
653 364 : UErrorCode status = U_ZERO_ERROR;
654 : int32_t needed;
655 :
656 364 : needed = ucasemap_utf8ToLower(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
657 364 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
658 0 : ereport(ERROR,
659 : errmsg("case conversion failed: %s", u_errorName(status)));
660 364 : return needed;
661 : }
662 :
663 : static size_t
664 20 : strtitle_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen,
665 : pg_locale_t locale)
666 : {
667 20 : UErrorCode status = U_ZERO_ERROR;
668 : int32_t needed;
669 :
670 20 : needed = ucasemap_utf8ToTitle(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
671 20 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
672 0 : ereport(ERROR,
673 : errmsg("case conversion failed: %s", u_errorName(status)));
674 20 : return needed;
675 : }
676 :
677 : static size_t
678 76 : strupper_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen,
679 : pg_locale_t locale)
680 : {
681 76 : UErrorCode status = U_ZERO_ERROR;
682 : int32_t needed;
683 :
684 76 : needed = ucasemap_utf8ToUpper(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
685 76 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
686 0 : ereport(ERROR,
687 : errmsg("case conversion failed: %s", u_errorName(status)));
688 76 : return needed;
689 : }
690 :
691 : static size_t
692 10 : strfold_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen,
693 : pg_locale_t locale)
694 : {
695 10 : UErrorCode status = U_ZERO_ERROR;
696 : int32_t needed;
697 :
698 10 : needed = ucasemap_utf8FoldCase(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
699 10 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
700 0 : ereport(ERROR,
701 : errmsg("case conversion failed: %s", u_errorName(status)));
702 10 : return needed;
703 : }
704 :
705 : /*
706 : * For historical compatibility, behavior is not multibyte-aware.
707 : *
708 : * NB: uses libc tolower() for single-byte encodings (also for historical
709 : * compatibility), and therefore relies on the global LC_CTYPE setting.
710 : */
711 : static size_t
712 0 : downcase_ident_icu(char *dst, size_t dstsize, const char *src,
713 : size_t srclen, pg_locale_t locale)
714 : {
715 : int i;
716 : bool libc_lower;
717 0 : locale_t lt = locale->icu.lt;
718 :
719 0 : libc_lower = lt && (pg_database_encoding_max_length() == 1);
720 :
721 0 : for (i = 0; i < srclen && i < dstsize; i++)
722 : {
723 0 : unsigned char ch = (unsigned char) src[i];
724 :
725 0 : if (ch >= 'A' && ch <= 'Z')
726 0 : ch = pg_ascii_tolower(ch);
727 0 : else if (libc_lower && IS_HIGHBIT_SET(ch) && isupper_l(ch, lt))
728 0 : ch = tolower_l(ch, lt);
729 0 : dst[i] = (char) ch;
730 : }
731 :
732 0 : if (i < dstsize)
733 0 : dst[i] = '\0';
734 :
735 0 : return srclen;
736 : }
737 :
/*
 * strncoll_icu_utf8 / strcoll_icu_utf8
 *
 * Compare two UTF-8 strings directly with ucol_strcollUTF8(), with no
 * conversion to UChar.  These are compiled only when HAVE_UCOL_STRCOLLUTF8
 * is defined, and both assert that the database encoding is UTF-8.
 *
 * Both are declared static to match their forward declarations.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
/*
 * Compare arg1 (len1 bytes) with arg2 (len2 bytes) using the locale's
 * collator.  Returns the value of ucol_strcollUTF8(); raises an error if ICU
 * reports a failure.
 */
static int
strncoll_icu_utf8(const char *arg1, size_t len1, const char *arg2, size_t len2,
				  pg_locale_t locale)
{
	int			result;
	UErrorCode	status;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	status = U_ZERO_ERROR;
	result = ucol_strcollUTF8(locale->icu.ucol,
							  arg1, len1,
							  arg2, len2,
							  &status);
	if (U_FAILURE(status))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(status))));

	return result;
}

/*
 * Same as strncoll_icu_utf8(), but for NUL-terminated strings: a length of
 * -1 is passed to ucol_strcollUTF8() for both arguments.
 */
static int
strcoll_icu_utf8(const char *arg1, const char *arg2, pg_locale_t locale)
{
	int			result;
	UErrorCode	status;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	status = U_ZERO_ERROR;
	result = ucol_strcollUTF8(locale->icu.ucol,
							  arg1, -1,
							  arg2, -1,
							  &status);
	if (U_FAILURE(status))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(status))));

	return result;
}
#endif
786 :
787 : static size_t
788 7928 : strnxfrm_icu_internal(char *dest, size_t destsize, const char *src, ssize_t srclen,
789 : pg_locale_t locale)
790 : {
791 : UChar sbuf[TEXTBUFLEN / sizeof(UChar)];
792 7928 : UChar *uchar = sbuf;
793 : int32_t ulen;
794 : Size result_bsize;
795 :
796 7928 : init_icu_converter();
797 :
798 7928 : ulen = uchar_length(icu_converter, src, srclen);
799 :
800 7928 : if (ulen >= lengthof(sbuf))
801 0 : uchar = palloc_array(UChar, ulen + 1);
802 :
803 7928 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
804 :
805 7928 : result_bsize = ucol_getSortKey(locale->icu.ucol,
806 : uchar, ulen,
807 : (uint8_t *) dest, destsize);
808 :
809 : /*
810 : * ucol_getSortKey() counts the nul-terminator in the result length, but
811 : * this function should not.
812 : */
813 : Assert(result_bsize > 0);
814 7928 : result_bsize--;
815 :
816 7928 : if (uchar != sbuf)
817 0 : pfree(uchar);
818 :
819 : /* if dest is defined, it should be nul-terminated */
820 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
821 :
822 7928 : return result_bsize;
823 : }
824 :
825 : static size_t
826 7928 : strnxfrm_icu(char *dest, size_t destsize, const char *src, size_t srclen,
827 : pg_locale_t locale)
828 : {
829 7928 : return strnxfrm_icu_internal(dest, destsize, src, srclen, locale);
830 : }
831 :
832 : static size_t
833 0 : strxfrm_icu(char *dest, size_t destsize, const char *src,
834 : pg_locale_t locale)
835 : {
836 0 : return strnxfrm_icu_internal(dest, destsize, src, -1, locale);
837 : }
838 :
839 : static size_t
840 1306 : strnxfrm_prefix_icu_utf8_internal(char *dest, size_t destsize,
841 : const char *src, ssize_t srclen,
842 : pg_locale_t locale)
843 : {
844 : size_t result;
845 : UCharIterator iter;
846 : uint32_t state[2];
847 : UErrorCode status;
848 :
849 : Assert(GetDatabaseEncoding() == PG_UTF8);
850 :
851 1306 : uiter_setUTF8(&iter, src, srclen);
852 1306 : state[0] = state[1] = 0; /* won't need that again */
853 1306 : status = U_ZERO_ERROR;
854 1306 : result = ucol_nextSortKeyPart(locale->icu.ucol,
855 : &iter,
856 : state,
857 : (uint8_t *) dest,
858 : destsize,
859 : &status);
860 1306 : if (U_FAILURE(status))
861 0 : ereport(ERROR,
862 : (errmsg("sort key generation failed: %s",
863 : u_errorName(status))));
864 :
865 1306 : return result;
866 : }
867 :
868 : static size_t
869 0 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
870 : const char *src, size_t srclen,
871 : pg_locale_t locale)
872 : {
873 0 : return strnxfrm_prefix_icu_utf8_internal(dest, destsize, src, srclen, locale);
874 : }
875 :
876 : static size_t
877 1306 : strxfrm_prefix_icu_utf8(char *dest, size_t destsize, const char *src,
878 : pg_locale_t locale)
879 : {
880 1306 : return strnxfrm_prefix_icu_utf8_internal(dest, destsize, src, -1, locale);
881 : }
882 :
883 : char *
884 48147 : get_collation_actual_version_icu(const char *collcollate)
885 : {
886 : UCollator *collator;
887 : UVersionInfo versioninfo;
888 : char buf[U_MAX_VERSION_STRING_LENGTH];
889 :
890 48147 : collator = pg_ucol_open(collcollate);
891 :
892 48147 : ucol_getVersion(collator, versioninfo);
893 48147 : ucol_close(collator);
894 :
895 48147 : u_versionToString(versioninfo, buf);
896 48147 : return pstrdup(buf);
897 : }
898 :
899 : /*
900 : * Convert a string in the database encoding into a string of UChars.
901 : *
902 : * The source string at buff is of length nbytes
903 : * (it needn't be nul-terminated)
904 : *
905 : * *buff_uchar receives a pointer to the palloc'd result string, and
906 : * the function's result is the number of UChars generated.
907 : *
908 : * The result string is nul-terminated, though most callers rely on the
909 : * result length instead.
910 : */
911 : static int32_t
912 12 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
913 : {
914 : int32_t len_uchar;
915 :
916 12 : init_icu_converter();
917 :
918 12 : len_uchar = uchar_length(icu_converter, buff, nbytes);
919 :
920 12 : *buff_uchar = palloc_array(UChar, len_uchar + 1);
921 12 : len_uchar = uchar_convert(icu_converter,
922 : *buff_uchar, len_uchar + 1, buff, nbytes);
923 :
924 12 : return len_uchar;
925 : }
926 :
927 : /*
928 : * Convert a string of UChars into the database encoding.
929 : *
930 : * The source string at buff_uchar is of length len_uchar
931 : * (it needn't be nul-terminated)
932 : *
933 : * *result receives a pointer to the palloc'd result string, and the
934 : * function's result is the number of bytes generated (not counting nul).
935 : *
936 : * The result string is nul-terminated.
937 : */
938 : static size_t
939 0 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
940 : {
941 : UErrorCode status;
942 : int32_t len_result;
943 :
944 0 : init_icu_converter();
945 :
946 0 : status = U_ZERO_ERROR;
947 0 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
948 : buff_uchar, len_uchar, &status);
949 0 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
950 0 : ereport(ERROR,
951 : (errmsg("%s failed: %s", "ucnv_fromUChars",
952 : u_errorName(status))));
953 :
954 0 : if (len_result + 1 > destsize)
955 0 : return len_result;
956 :
957 0 : status = U_ZERO_ERROR;
958 0 : len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
959 : buff_uchar, len_uchar, &status);
960 0 : if (U_FAILURE(status) ||
961 0 : status == U_STRING_NOT_TERMINATED_WARNING)
962 0 : ereport(ERROR,
963 : (errmsg("%s failed: %s", "ucnv_fromUChars",
964 : u_errorName(status))));
965 :
966 0 : return len_result;
967 : }
968 :
969 : static int32_t
970 0 : convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
971 : UChar **buff_dest, UChar *buff_source, int32_t len_source)
972 : {
973 : UErrorCode status;
974 : int32_t len_dest;
975 :
976 0 : len_dest = len_source; /* try first with same length */
977 0 : *buff_dest = palloc_array(UChar, len_dest);
978 0 : status = U_ZERO_ERROR;
979 0 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
980 : mylocale->icu.locale, &status);
981 0 : if (status == U_BUFFER_OVERFLOW_ERROR)
982 : {
983 : /* try again with adjusted length */
984 0 : pfree(*buff_dest);
985 0 : *buff_dest = palloc_array(UChar, len_dest);
986 0 : status = U_ZERO_ERROR;
987 0 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
988 : mylocale->icu.locale, &status);
989 : }
990 0 : if (U_FAILURE(status))
991 0 : ereport(ERROR,
992 : (errmsg("case conversion failed: %s", u_errorName(status))));
993 0 : return len_dest;
994 : }
995 :
996 : static int32_t
997 0 : icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize,
998 : const char *src, size_t srclen, pg_locale_t locale)
999 : {
1000 : int32_t len_uchar;
1001 : int32_t len_conv;
1002 : UChar *buff_uchar;
1003 : UChar *buff_conv;
1004 : size_t result_len;
1005 :
1006 0 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
1007 0 : len_conv = convert_case_uchar(func, locale, &buff_conv,
1008 : buff_uchar, len_uchar);
1009 0 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
1010 0 : pfree(buff_uchar);
1011 0 : pfree(buff_conv);
1012 :
1013 0 : return result_len;
1014 : }
1015 :
1016 : static int32_t
1017 0 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
1018 : const UChar *src, int32_t srcLength,
1019 : const char *locale,
1020 : UErrorCode *pErrorCode)
1021 : {
1022 0 : return u_strToTitle(dest, destCapacity, src, srcLength,
1023 : NULL, locale, pErrorCode);
1024 : }
1025 :
1026 : static int32_t
1027 0 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
1028 : const UChar *src, int32_t srcLength,
1029 : const char *locale,
1030 : UErrorCode *pErrorCode)
1031 : {
1032 0 : return u_strFoldCase(dest, destCapacity, src, srcLength,
1033 0 : foldcase_options(locale), pErrorCode);
1034 : }
1035 :
1036 : /*
1037 : * Return the correct u_strFoldCase() options for the given locale.
1038 : *
1039 : * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
1040 : * folding does not accept a locale. Instead it just supports a single option
1041 : * relevant to Turkic languages 'az' and 'tr'; check for those languages.
1042 : */
1043 : static int32_t
1044 131 : foldcase_options(const char *locale)
1045 : {
1046 131 : uint32 options = U_FOLD_CASE_DEFAULT;
1047 : char lang[ULOC_LANG_CAPACITY];
1048 131 : UErrorCode status = U_ZERO_ERROR;
1049 :
1050 131 : uloc_getLanguage(locale, lang, ULOC_LANG_CAPACITY, &status);
1051 131 : if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING)
1052 : {
1053 : /*
1054 : * The option name is confusing, but it causes u_strFoldCase to use
1055 : * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
1056 : */
1057 131 : if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
1058 4 : options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
1059 : }
1060 :
1061 131 : return options;
1062 : }
1063 :
1064 : /*
1065 : * strncoll_icu
1066 : *
1067 : * Convert the arguments from the database encoding to UChar strings, then
1068 : * call ucol_strcoll().
1069 : *
1070 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1071 : * caller should call that instead.
1072 : */
1073 : static int
1074 0 : strncoll_icu_internal(const char *arg1, ssize_t len1,
1075 : const char *arg2, ssize_t len2,
1076 : pg_locale_t locale)
1077 : {
1078 : UChar sbuf[TEXTBUFLEN / sizeof(UChar)];
1079 0 : UChar *buf = sbuf;
1080 : int32_t ulen1;
1081 : int32_t ulen2;
1082 : size_t bufsize;
1083 : UChar *uchar1,
1084 : *uchar2;
1085 : int result;
1086 :
1087 : /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
1088 : #ifdef HAVE_UCOL_STRCOLLUTF8
1089 : Assert(GetDatabaseEncoding() != PG_UTF8);
1090 : #endif
1091 :
1092 0 : init_icu_converter();
1093 :
1094 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
1095 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
1096 :
1097 : /* ulen1+1 or ulen2+1 doesn't risk overflow, but summing them might */
1098 0 : bufsize = add_size(ulen1 + 1, ulen2 + 1);
1099 0 : if (bufsize > lengthof(sbuf))
1100 0 : buf = palloc_array(UChar, bufsize);
1101 :
1102 0 : uchar1 = buf;
1103 0 : uchar2 = buf + ulen1 + 1;
1104 :
1105 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
1106 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
1107 :
1108 0 : result = ucol_strcoll(locale->icu.ucol,
1109 : uchar1, ulen1,
1110 : uchar2, ulen2);
1111 :
1112 0 : if (buf != sbuf)
1113 0 : pfree(buf);
1114 :
1115 0 : return result;
1116 : }
1117 :
1118 : static int
1119 0 : strncoll_icu(const char *arg1, size_t len1, const char *arg2, size_t len2,
1120 : pg_locale_t locale)
1121 : {
1122 0 : return strncoll_icu_internal(arg1, len1, arg2, len2, locale);
1123 : }
1124 :
1125 : static int
1126 0 : strcoll_icu(const char *arg1, const char *arg2, pg_locale_t locale)
1127 : {
1128 0 : return strncoll_icu_internal(arg1, -1, arg2, -1, locale);
1129 : }
1130 :
1131 : static size_t
1132 0 : strnxfrm_prefix_icu_internal(char *dest, size_t destsize,
1133 : const char *src, ssize_t srclen,
1134 : pg_locale_t locale)
1135 : {
1136 : UChar sbuf[TEXTBUFLEN / sizeof(UChar)];
1137 0 : UChar *uchar = sbuf;
1138 : UCharIterator iter;
1139 : uint32_t state[2];
1140 : UErrorCode status;
1141 : int32_t ulen;
1142 : Size result_bsize;
1143 :
1144 : /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
1145 : Assert(GetDatabaseEncoding() != PG_UTF8);
1146 :
1147 0 : init_icu_converter();
1148 :
1149 0 : ulen = uchar_length(icu_converter, src, srclen);
1150 :
1151 0 : if (ulen >= lengthof(sbuf))
1152 0 : uchar = palloc_array(UChar, ulen + 1);
1153 :
1154 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
1155 :
1156 0 : uiter_setString(&iter, uchar, ulen);
1157 0 : state[0] = state[1] = 0; /* won't need that again */
1158 0 : status = U_ZERO_ERROR;
1159 0 : result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
1160 : &iter,
1161 : state,
1162 : (uint8_t *) dest,
1163 : destsize,
1164 : &status);
1165 0 : if (U_FAILURE(status))
1166 0 : ereport(ERROR,
1167 : (errmsg("sort key generation failed: %s",
1168 : u_errorName(status))));
1169 :
1170 0 : if (uchar != sbuf)
1171 0 : pfree(uchar);
1172 :
1173 0 : return result_bsize;
1174 : }
1175 :
1176 : static size_t
1177 0 : strnxfrm_prefix_icu(char *dest, size_t destsize, const char *src, size_t srclen,
1178 : pg_locale_t locale)
1179 : {
1180 0 : return strnxfrm_prefix_icu_internal(dest, destsize, src, srclen, locale);
1181 : }
1182 :
1183 : static size_t
1184 0 : strxfrm_prefix_icu(char *dest, size_t destsize, const char *src,
1185 : pg_locale_t locale)
1186 : {
1187 0 : return strnxfrm_prefix_icu_internal(dest, destsize, src, -1, locale);
1188 : }
1189 :
1190 : static void
1191 7940 : init_icu_converter(void)
1192 : {
1193 : const char *icu_encoding_name;
1194 : UErrorCode status;
1195 : UConverter *conv;
1196 :
1197 7940 : if (icu_converter)
1198 7936 : return; /* already done */
1199 :
1200 4 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
1201 4 : if (!icu_encoding_name)
1202 0 : ereport(ERROR,
1203 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1204 : errmsg("encoding \"%s\" not supported by ICU",
1205 : pg_encoding_to_char(GetDatabaseEncoding()))));
1206 :
1207 4 : status = U_ZERO_ERROR;
1208 4 : conv = ucnv_open(icu_encoding_name, &status);
1209 4 : if (U_FAILURE(status))
1210 0 : ereport(ERROR,
1211 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
1212 : icu_encoding_name, u_errorName(status))));
1213 :
1214 4 : icu_converter = conv;
1215 : }
1216 :
1217 : /*
1218 : * Find length, in UChars, of given string if converted to UChar string.
1219 : *
1220 : * A length of -1 indicates that the input string is NUL-terminated.
1221 : *
1222 : * Note: given the assumption that the input string fits in MaxAllocSize,
1223 : * the result cannot overflow int32_t. But callers must be careful about
1224 : * multiplying the result by sizeof(UChar).
1225 : */
1226 : static int32_t
1227 7940 : uchar_length(UConverter *converter, const char *str, int32_t len)
1228 : {
1229 7940 : UErrorCode status = U_ZERO_ERROR;
1230 : int32_t ulen;
1231 :
1232 7940 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
1233 7940 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
1234 0 : ereport(ERROR,
1235 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1236 7940 : return ulen;
1237 : }
1238 :
1239 : /*
1240 : * Convert the given source string into a UChar string, stored in dest, and
1241 : * return the length (in UChars).
1242 : *
1243 : * A srclen of -1 indicates that the input string is NUL-terminated.
1244 : */
1245 : static int32_t
1246 7940 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
1247 : const char *src, int32_t srclen)
1248 : {
1249 7940 : UErrorCode status = U_ZERO_ERROR;
1250 : int32_t ulen;
1251 :
1252 7940 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
1253 7940 : if (U_FAILURE(status))
1254 0 : ereport(ERROR,
1255 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1256 7940 : return ulen;
1257 : }
1258 :
1259 : /*
1260 : * Parse collation attributes from the given locale string and apply them to
1261 : * the open collator.
1262 : *
1263 : * First, the locale string is canonicalized to an ICU format locale ID such
1264 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1265 : * the key-value arguments.
1266 : *
1267 : * Starting with ICU version 54, the attributes are processed automatically by
1268 : * ucol_open(), so this is only necessary for emulating this behavior on older
1269 : * versions.
1270 : */
1271 : pg_attribute_unused()
1272 : static void
1273 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
1274 : UErrorCode *status)
1275 : {
1276 : int32_t len;
1277 : char *icu_locale_id;
1278 : char *lower_str;
1279 : char *str;
1280 : char *token;
1281 :
1282 : /*
1283 : * The input locale may be a BCP 47 language tag, e.g.
1284 : * "und-u-kc-ks-level1", which expresses the same attributes in a
1285 : * different form. It will be converted to the equivalent ICU format
1286 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1287 : * uloc_canonicalize().
1288 : */
1289 0 : *status = U_ZERO_ERROR;
1290 0 : len = uloc_canonicalize(loc, NULL, 0, status);
1291 0 : icu_locale_id = palloc(len + 1);
1292 0 : *status = U_ZERO_ERROR;
1293 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1294 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1295 0 : return;
1296 :
1297 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
1298 :
1299 0 : pfree(icu_locale_id);
1300 :
1301 0 : str = strchr(lower_str, '@');
1302 0 : if (!str)
1303 0 : return;
1304 0 : str++;
1305 :
1306 0 : while ((token = strsep(&str, ";")))
1307 : {
1308 0 : char *e = strchr(token, '=');
1309 :
1310 0 : if (e)
1311 : {
1312 : char *name;
1313 : char *value;
1314 : UColAttribute uattr;
1315 : UColAttributeValue uvalue;
1316 :
1317 0 : *status = U_ZERO_ERROR;
1318 :
1319 0 : *e = '\0';
1320 0 : name = token;
1321 0 : value = e + 1;
1322 :
1323 : /*
1324 : * See attribute name and value lists in ICU i18n/coll.cpp
1325 : */
1326 0 : if (strcmp(name, "colstrength") == 0)
1327 0 : uattr = UCOL_STRENGTH;
1328 0 : else if (strcmp(name, "colbackwards") == 0)
1329 0 : uattr = UCOL_FRENCH_COLLATION;
1330 0 : else if (strcmp(name, "colcaselevel") == 0)
1331 0 : uattr = UCOL_CASE_LEVEL;
1332 0 : else if (strcmp(name, "colcasefirst") == 0)
1333 0 : uattr = UCOL_CASE_FIRST;
1334 0 : else if (strcmp(name, "colalternate") == 0)
1335 0 : uattr = UCOL_ALTERNATE_HANDLING;
1336 0 : else if (strcmp(name, "colnormalization") == 0)
1337 0 : uattr = UCOL_NORMALIZATION_MODE;
1338 0 : else if (strcmp(name, "colnumeric") == 0)
1339 0 : uattr = UCOL_NUMERIC_COLLATION;
1340 : else
1341 : /* ignore if unknown */
1342 0 : continue;
1343 :
1344 0 : if (strcmp(value, "primary") == 0)
1345 0 : uvalue = UCOL_PRIMARY;
1346 0 : else if (strcmp(value, "secondary") == 0)
1347 0 : uvalue = UCOL_SECONDARY;
1348 0 : else if (strcmp(value, "tertiary") == 0)
1349 0 : uvalue = UCOL_TERTIARY;
1350 0 : else if (strcmp(value, "quaternary") == 0)
1351 0 : uvalue = UCOL_QUATERNARY;
1352 0 : else if (strcmp(value, "identical") == 0)
1353 0 : uvalue = UCOL_IDENTICAL;
1354 0 : else if (strcmp(value, "no") == 0)
1355 0 : uvalue = UCOL_OFF;
1356 0 : else if (strcmp(value, "yes") == 0)
1357 0 : uvalue = UCOL_ON;
1358 0 : else if (strcmp(value, "shifted") == 0)
1359 0 : uvalue = UCOL_SHIFTED;
1360 0 : else if (strcmp(value, "non-ignorable") == 0)
1361 0 : uvalue = UCOL_NON_IGNORABLE;
1362 0 : else if (strcmp(value, "lower") == 0)
1363 0 : uvalue = UCOL_LOWER_FIRST;
1364 0 : else if (strcmp(value, "upper") == 0)
1365 0 : uvalue = UCOL_UPPER_FIRST;
1366 : else
1367 : {
1368 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
1369 0 : break;
1370 : }
1371 :
1372 0 : ucol_setAttribute(collator, uattr, uvalue, status);
1373 : }
1374 : }
1375 :
1376 0 : pfree(lower_str);
1377 : }
1378 :
1379 : #endif /* USE_ICU */
|