Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for ICU
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_icu.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #ifdef USE_ICU
15 : #include <unicode/ucnv.h>
16 : #include <unicode/ustring.h>
17 :
18 : /*
19 : * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 : * (see
21 : * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 : */
23 : #if U_ICU_VERSION_MAJOR_NUM >= 53
24 : #define HAVE_UCOL_STRCOLLUTF8 1
25 : #else
26 : #undef HAVE_UCOL_STRCOLLUTF8
27 : #endif
28 :
29 : #endif
30 :
31 : #include "access/htup_details.h"
32 : #include "catalog/pg_database.h"
33 : #include "catalog/pg_collation.h"
34 : #include "mb/pg_wchar.h"
35 : #include "miscadmin.h"
36 : #include "utils/builtins.h"
37 : #include "utils/formatting.h"
38 : #include "utils/memutils.h"
39 : #include "utils/pg_locale.h"
40 : #include "utils/syscache.h"
41 :
42 : /*
43 : * Size of stack buffer to use for string transformations, used to avoid heap
44 : * allocations in typical cases. This should be large enough that most strings
45 : * will fit, but small enough that we feel comfortable putting it on the
46 : * stack.
47 : */
48 : #define TEXTBUFLEN 1024
49 :
50 : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
51 : extern size_t strlower_icu(char *dest, size_t destsize, const char *src,
52 : ssize_t srclen, pg_locale_t locale);
53 : extern size_t strtitle_icu(char *dest, size_t destsize, const char *src,
54 : ssize_t srclen, pg_locale_t locale);
55 : extern size_t strupper_icu(char *dest, size_t destsize, const char *src,
56 : ssize_t srclen, pg_locale_t locale);
57 : extern size_t strfold_icu(char *dest, size_t destsize, const char *src,
58 : ssize_t srclen, pg_locale_t locale);
59 :
60 : #ifdef USE_ICU
61 :
62 : extern UCollator *pg_ucol_open(const char *loc_str);
63 :
64 : static size_t strnxfrm_icu(char *dest, size_t destsize,
65 : const char *src, ssize_t srclen,
66 : pg_locale_t locale);
67 : extern char *get_collation_actual_version_icu(const char *collcollate);
68 :
69 : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
70 : const UChar *src, int32_t srcLength,
71 : const char *locale,
72 : UErrorCode *pErrorCode);
73 :
74 : /*
75 : * Converter object for converting between ICU's UChar strings and C strings
76 : * in database encoding. Since the database encoding doesn't change, we only
77 : * need one of these per session.
78 : */
79 : static UConverter *icu_converter = NULL;
80 :
81 : static UCollator *make_icu_collator(const char *iculocstr,
82 : const char *icurules);
83 : static int strncoll_icu(const char *arg1, ssize_t len1,
84 : const char *arg2, ssize_t len2,
85 : pg_locale_t locale);
86 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
87 : const char *src, ssize_t srclen,
88 : pg_locale_t locale);
89 : #ifdef HAVE_UCOL_STRCOLLUTF8
90 : static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
91 : const char *arg2, ssize_t len2,
92 : pg_locale_t locale);
93 : #endif
94 : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
95 : const char *src, ssize_t srclen,
96 : pg_locale_t locale);
97 : static void init_icu_converter(void);
98 : static size_t uchar_length(UConverter *converter,
99 : const char *str, int32_t len);
100 : static int32_t uchar_convert(UConverter *converter,
101 : UChar *dest, int32_t destlen,
102 : const char *src, int32_t srclen);
103 : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
104 : size_t nbytes);
105 : static size_t icu_from_uchar(char *dest, size_t destsize,
106 : const UChar *buff_uchar, int32_t len_uchar);
107 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
108 : UErrorCode *status);
109 : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
110 : UChar **buff_dest, UChar *buff_source,
111 : int32_t len_source);
112 : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
113 : const UChar *src, int32_t srcLength,
114 : const char *locale,
115 : UErrorCode *pErrorCode);
116 : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
117 : const UChar *src, int32_t srcLength,
118 : const char *locale,
119 : UErrorCode *pErrorCode);
120 :
121 : static const struct collate_methods collate_methods_icu = {
122 : .strncoll = strncoll_icu,
123 : .strnxfrm = strnxfrm_icu,
124 : .strnxfrm_prefix = strnxfrm_prefix_icu,
125 : .strxfrm_is_safe = true,
126 : };
127 :
128 : static const struct collate_methods collate_methods_icu_utf8 = {
129 : #ifdef HAVE_UCOL_STRCOLLUTF8
130 : .strncoll = strncoll_icu_utf8,
131 : #else
132 : .strncoll = strncoll_icu,
133 : #endif
134 : .strnxfrm = strnxfrm_icu,
135 : .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
136 : .strxfrm_is_safe = true,
137 : };
138 :
139 : #endif
140 :
141 : pg_locale_t
142 212 : create_pg_locale_icu(Oid collid, MemoryContext context)
143 : {
144 : #ifdef USE_ICU
145 : bool deterministic;
146 : const char *iculocstr;
147 212 : const char *icurules = NULL;
148 : UCollator *collator;
149 : pg_locale_t result;
150 :
151 212 : if (collid == DEFAULT_COLLATION_OID)
152 : {
153 : HeapTuple tp;
154 : Datum datum;
155 : bool isnull;
156 :
157 26 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
158 26 : if (!HeapTupleIsValid(tp))
159 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
160 :
161 : /* default database collation is always deterministic */
162 26 : deterministic = true;
163 26 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
164 : Anum_pg_database_datlocale);
165 26 : iculocstr = TextDatumGetCString(datum);
166 26 : datum = SysCacheGetAttr(DATABASEOID, tp,
167 : Anum_pg_database_daticurules, &isnull);
168 26 : if (!isnull)
169 0 : icurules = TextDatumGetCString(datum);
170 :
171 26 : ReleaseSysCache(tp);
172 : }
173 : else
174 : {
175 : Form_pg_collation collform;
176 : HeapTuple tp;
177 : Datum datum;
178 : bool isnull;
179 :
180 186 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
181 186 : if (!HeapTupleIsValid(tp))
182 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
183 186 : collform = (Form_pg_collation) GETSTRUCT(tp);
184 186 : deterministic = collform->collisdeterministic;
185 186 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
186 : Anum_pg_collation_colllocale);
187 186 : iculocstr = TextDatumGetCString(datum);
188 186 : datum = SysCacheGetAttr(COLLOID, tp,
189 : Anum_pg_collation_collicurules, &isnull);
190 186 : if (!isnull)
191 12 : icurules = TextDatumGetCString(datum);
192 :
193 186 : ReleaseSysCache(tp);
194 : }
195 :
196 212 : collator = make_icu_collator(iculocstr, icurules);
197 :
198 202 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
199 202 : result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
200 202 : result->info.icu.ucol = collator;
201 202 : result->provider = COLLPROVIDER_ICU;
202 202 : result->deterministic = deterministic;
203 202 : result->collate_is_c = false;
204 202 : result->ctype_is_c = false;
205 202 : if (GetDatabaseEncoding() == PG_UTF8)
206 202 : result->collate = &collate_methods_icu_utf8;
207 : else
208 0 : result->collate = &collate_methods_icu;
209 :
210 202 : return result;
211 : #else
212 : /* could get here if a collation was created by a build with ICU */
213 : ereport(ERROR,
214 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
215 : errmsg("ICU is not supported in this build")));
216 :
217 : return NULL;
218 : #endif
219 : }
220 :
221 : #ifdef USE_ICU
222 :
223 : /*
224 : * Wrapper around ucol_open() to handle API differences for older ICU
225 : * versions.
226 : *
227 : * Ensure that no path leaks a UCollator.
228 : */
229 : UCollator *
230 74430 : pg_ucol_open(const char *loc_str)
231 : {
232 : UCollator *collator;
233 : UErrorCode status;
234 74430 : const char *orig_str = loc_str;
235 74430 : char *fixed_str = NULL;
236 :
237 : /*
238 : * Must never open default collator, because it depends on the environment
239 : * and may change at any time. Should not happen, but check here to catch
240 : * bugs that might be hard to catch otherwise.
241 : *
242 : * NB: the default collator is not the same as the collator for the root
243 : * locale. The root locale may be specified as the empty string, "und", or
244 : * "root". The default collator is opened by passing NULL to ucol_open().
245 : */
246 74430 : if (loc_str == NULL)
247 0 : elog(ERROR, "opening default collator is not supported");
248 :
249 : /*
250 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
251 : * the root locale. If the first component of the locale is "und", replace
252 : * with "root" before opening.
253 : */
254 : if (U_ICU_VERSION_MAJOR_NUM < 55)
255 : {
256 : char lang[ULOC_LANG_CAPACITY];
257 :
258 : status = U_ZERO_ERROR;
259 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
260 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
261 : {
262 : ereport(ERROR,
263 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
264 : errmsg("could not get language from locale \"%s\": %s",
265 : loc_str, u_errorName(status))));
266 : }
267 :
268 : if (strcmp(lang, "und") == 0)
269 : {
270 : const char *remainder = loc_str + strlen("und");
271 :
272 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
273 : strcpy(fixed_str, "root");
274 : strcat(fixed_str, remainder);
275 :
276 : loc_str = fixed_str;
277 : }
278 : }
279 :
280 74430 : status = U_ZERO_ERROR;
281 74430 : collator = ucol_open(loc_str, &status);
282 74430 : if (U_FAILURE(status))
283 12 : ereport(ERROR,
284 : /* use original string for error report */
285 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
286 : errmsg("could not open collator for locale \"%s\": %s",
287 : orig_str, u_errorName(status))));
288 :
289 : if (U_ICU_VERSION_MAJOR_NUM < 54)
290 : {
291 : status = U_ZERO_ERROR;
292 : icu_set_collation_attributes(collator, loc_str, &status);
293 :
294 : /*
295 : * Pretend the error came from ucol_open(), for consistent error
296 : * message across ICU versions.
297 : */
298 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
299 : {
300 : ucol_close(collator);
301 : ereport(ERROR,
302 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
303 : errmsg("could not open collator for locale \"%s\": %s",
304 : orig_str, u_errorName(status))));
305 : }
306 : }
307 :
308 74418 : if (fixed_str != NULL)
309 0 : pfree(fixed_str);
310 :
311 74418 : return collator;
312 : }
313 :
314 : /*
315 : * Create a UCollator with the given locale string and rules.
316 : *
317 : * Ensure that no path leaks a UCollator.
318 : */
319 : static UCollator *
320 212 : make_icu_collator(const char *iculocstr, const char *icurules)
321 : {
322 212 : if (!icurules)
323 : {
324 : /* simple case without rules */
325 200 : return pg_ucol_open(iculocstr);
326 : }
327 : else
328 : {
329 : UCollator *collator_std_rules;
330 : UCollator *collator_all_rules;
331 : const UChar *std_rules;
332 : UChar *my_rules;
333 : UChar *all_rules;
334 : int32_t length;
335 : int32_t total;
336 : UErrorCode status;
337 :
338 : /*
339 : * If rules are specified, we extract the rules of the standard
340 : * collation, add our own rules, and make a new collator with the
341 : * combined rules.
342 : */
343 12 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
344 :
345 12 : collator_std_rules = pg_ucol_open(iculocstr);
346 :
347 12 : std_rules = ucol_getRules(collator_std_rules, &length);
348 :
349 12 : total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
350 :
351 : /* avoid leaking collator on OOM */
352 12 : all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
353 12 : if (!all_rules)
354 : {
355 0 : ucol_close(collator_std_rules);
356 0 : ereport(ERROR,
357 : (errcode(ERRCODE_OUT_OF_MEMORY),
358 : errmsg("out of memory")));
359 : }
360 :
361 12 : u_strcpy(all_rules, std_rules);
362 12 : u_strcat(all_rules, my_rules);
363 :
364 12 : ucol_close(collator_std_rules);
365 :
366 12 : status = U_ZERO_ERROR;
367 12 : collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
368 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
369 : NULL, &status);
370 12 : if (U_FAILURE(status))
371 : {
372 6 : ereport(ERROR,
373 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
374 : errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
375 : iculocstr, icurules, u_errorName(status))));
376 : }
377 :
378 6 : return collator_all_rules;
379 : }
380 : }
381 :
382 : size_t
383 528 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
384 : pg_locale_t locale)
385 : {
386 : int32_t len_uchar;
387 : int32_t len_conv;
388 : UChar *buff_uchar;
389 : UChar *buff_conv;
390 : size_t result_len;
391 :
392 528 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
393 528 : len_conv = icu_convert_case(u_strToLower, locale,
394 : &buff_conv, buff_uchar, len_uchar);
395 528 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
396 528 : pfree(buff_uchar);
397 528 : pfree(buff_conv);
398 :
399 528 : return result_len;
400 : }
401 :
402 : size_t
403 30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
404 : pg_locale_t locale)
405 : {
406 : int32_t len_uchar;
407 : int32_t len_conv;
408 : UChar *buff_uchar;
409 : UChar *buff_conv;
410 : size_t result_len;
411 :
412 30 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
413 30 : len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
414 : &buff_conv, buff_uchar, len_uchar);
415 30 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
416 30 : pfree(buff_uchar);
417 30 : pfree(buff_conv);
418 :
419 30 : return result_len;
420 : }
421 :
422 : size_t
423 54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
424 : pg_locale_t locale)
425 : {
426 : int32_t len_uchar;
427 : int32_t len_conv;
428 : UChar *buff_uchar;
429 : UChar *buff_conv;
430 : size_t result_len;
431 :
432 54 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
433 54 : len_conv = icu_convert_case(u_strToUpper, locale,
434 : &buff_conv, buff_uchar, len_uchar);
435 54 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
436 54 : pfree(buff_uchar);
437 54 : pfree(buff_conv);
438 :
439 54 : return result_len;
440 : }
441 :
442 : size_t
443 12 : strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
444 : pg_locale_t locale)
445 : {
446 : int32_t len_uchar;
447 : int32_t len_conv;
448 : UChar *buff_uchar;
449 : UChar *buff_conv;
450 : size_t result_len;
451 :
452 12 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
453 12 : len_conv = icu_convert_case(u_strFoldCase_default, locale,
454 : &buff_conv, buff_uchar, len_uchar);
455 12 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
456 12 : pfree(buff_uchar);
457 12 : pfree(buff_conv);
458 :
459 12 : return result_len;
460 : }
461 :
/*
 * strncoll_icu_utf8
 *
 * Compare two UTF-8 strings directly with ucol_strcollUTF8(), without first
 * converting them to UChar.  Only compiled when HAVE_UCOL_STRCOLLUTF8 is
 * defined, and only valid when the database encoding is UTF-8.  An argument
 * length of -1 means the string is NUL-terminated.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
                  pg_locale_t locale)
{
    UErrorCode  status = U_ZERO_ERROR;
    int         cmp;

    Assert(locale->provider == COLLPROVIDER_ICU);

    Assert(GetDatabaseEncoding() == PG_UTF8);

    cmp = ucol_strcollUTF8(locale->info.icu.ucol,
                           arg1, len1,
                           arg2, len2,
                           &status);
    if (U_FAILURE(status))
        ereport(ERROR,
                (errmsg("collation failed: %s", u_errorName(status))));

    return cmp;
}
#endif
493 :
494 : /* 'srclen' of -1 means the strings are NUL-terminated */
495 : size_t
496 10020 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
497 : pg_locale_t locale)
498 : {
499 : char sbuf[TEXTBUFLEN];
500 10020 : char *buf = sbuf;
501 : UChar *uchar;
502 : int32_t ulen;
503 : size_t uchar_bsize;
504 : Size result_bsize;
505 :
506 : Assert(locale->provider == COLLPROVIDER_ICU);
507 :
508 10020 : init_icu_converter();
509 :
510 10020 : ulen = uchar_length(icu_converter, src, srclen);
511 :
512 10020 : uchar_bsize = (ulen + 1) * sizeof(UChar);
513 :
514 10020 : if (uchar_bsize > TEXTBUFLEN)
515 0 : buf = palloc(uchar_bsize);
516 :
517 10020 : uchar = (UChar *) buf;
518 :
519 10020 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
520 :
521 10020 : result_bsize = ucol_getSortKey(locale->info.icu.ucol,
522 : uchar, ulen,
523 : (uint8_t *) dest, destsize);
524 :
525 : /*
526 : * ucol_getSortKey() counts the nul-terminator in the result length, but
527 : * this function should not.
528 : */
529 : Assert(result_bsize > 0);
530 10020 : result_bsize--;
531 :
532 10020 : if (buf != sbuf)
533 0 : pfree(buf);
534 :
535 : /* if dest is defined, it should be nul-terminated */
536 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
537 :
538 10020 : return result_bsize;
539 : }
540 :
541 : /* 'srclen' of -1 means the strings are NUL-terminated */
542 : size_t
543 1668 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
544 : const char *src, ssize_t srclen,
545 : pg_locale_t locale)
546 : {
547 : size_t result;
548 : UCharIterator iter;
549 : uint32_t state[2];
550 : UErrorCode status;
551 :
552 : Assert(locale->provider == COLLPROVIDER_ICU);
553 :
554 : Assert(GetDatabaseEncoding() == PG_UTF8);
555 :
556 1668 : uiter_setUTF8(&iter, src, srclen);
557 1668 : state[0] = state[1] = 0; /* won't need that again */
558 1668 : status = U_ZERO_ERROR;
559 1668 : result = ucol_nextSortKeyPart(locale->info.icu.ucol,
560 : &iter,
561 : state,
562 : (uint8_t *) dest,
563 : destsize,
564 : &status);
565 1668 : if (U_FAILURE(status))
566 0 : ereport(ERROR,
567 : (errmsg("sort key generation failed: %s",
568 : u_errorName(status))));
569 :
570 1668 : return result;
571 : }
572 :
573 : char *
574 74068 : get_collation_actual_version_icu(const char *collcollate)
575 : {
576 : UCollator *collator;
577 : UVersionInfo versioninfo;
578 : char buf[U_MAX_VERSION_STRING_LENGTH];
579 :
580 74068 : collator = pg_ucol_open(collcollate);
581 :
582 74068 : ucol_getVersion(collator, versioninfo);
583 74068 : ucol_close(collator);
584 :
585 74068 : u_versionToString(versioninfo, buf);
586 74068 : return pstrdup(buf);
587 : }
588 :
589 : /*
590 : * Convert a string in the database encoding into a string of UChars.
591 : *
592 : * The source string at buff is of length nbytes
593 : * (it needn't be nul-terminated)
594 : *
595 : * *buff_uchar receives a pointer to the palloc'd result string, and
596 : * the function's result is the number of UChars generated.
597 : *
598 : * The result string is nul-terminated, though most callers rely on the
599 : * result length instead.
600 : */
601 : static int32_t
602 636 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
603 : {
604 : int32_t len_uchar;
605 :
606 636 : init_icu_converter();
607 :
608 636 : len_uchar = uchar_length(icu_converter, buff, nbytes);
609 :
610 636 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
611 636 : len_uchar = uchar_convert(icu_converter,
612 : *buff_uchar, len_uchar + 1, buff, nbytes);
613 :
614 636 : return len_uchar;
615 : }
616 :
617 : /*
618 : * Convert a string of UChars into the database encoding.
619 : *
620 : * The source string at buff_uchar is of length len_uchar
621 : * (it needn't be nul-terminated)
622 : *
623 : * *result receives a pointer to the palloc'd result string, and the
624 : * function's result is the number of bytes generated (not counting nul).
625 : *
626 : * The result string is nul-terminated.
627 : */
628 : static size_t
629 624 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
630 : {
631 : UErrorCode status;
632 : int32_t len_result;
633 :
634 624 : init_icu_converter();
635 :
636 624 : status = U_ZERO_ERROR;
637 624 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
638 : buff_uchar, len_uchar, &status);
639 624 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
640 0 : ereport(ERROR,
641 : (errmsg("%s failed: %s", "ucnv_fromUChars",
642 : u_errorName(status))));
643 :
644 624 : if (len_result + 1 > destsize)
645 60 : return len_result;
646 :
647 564 : status = U_ZERO_ERROR;
648 564 : len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
649 : buff_uchar, len_uchar, &status);
650 564 : if (U_FAILURE(status) ||
651 564 : status == U_STRING_NOT_TERMINATED_WARNING)
652 0 : ereport(ERROR,
653 : (errmsg("%s failed: %s", "ucnv_fromUChars",
654 : u_errorName(status))));
655 :
656 564 : return len_result;
657 : }
658 :
659 : static int32_t
660 624 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
661 : UChar **buff_dest, UChar *buff_source, int32_t len_source)
662 : {
663 : UErrorCode status;
664 : int32_t len_dest;
665 :
666 624 : len_dest = len_source; /* try first with same length */
667 624 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
668 624 : status = U_ZERO_ERROR;
669 624 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
670 : mylocale->info.icu.locale, &status);
671 624 : if (status == U_BUFFER_OVERFLOW_ERROR)
672 : {
673 : /* try again with adjusted length */
674 18 : pfree(*buff_dest);
675 18 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
676 18 : status = U_ZERO_ERROR;
677 18 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
678 : mylocale->info.icu.locale, &status);
679 : }
680 624 : if (U_FAILURE(status))
681 0 : ereport(ERROR,
682 : (errmsg("case conversion failed: %s", u_errorName(status))));
683 624 : return len_dest;
684 : }
685 :
686 : static int32_t
687 30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
688 : const UChar *src, int32_t srcLength,
689 : const char *locale,
690 : UErrorCode *pErrorCode)
691 : {
692 30 : return u_strToTitle(dest, destCapacity, src, srcLength,
693 : NULL, locale, pErrorCode);
694 : }
695 :
696 : static int32_t
697 24 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
698 : const UChar *src, int32_t srcLength,
699 : const char *locale,
700 : UErrorCode *pErrorCode)
701 : {
702 24 : uint32 options = U_FOLD_CASE_DEFAULT;
703 : char lang[3];
704 : UErrorCode status;
705 :
706 : /*
707 : * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
708 : * folding does not accept a locale. Instead it just supports a single
709 : * option relevant to Turkic languages 'az' and 'tr'; check for those
710 : * languages to enable the option.
711 : */
712 24 : status = U_ZERO_ERROR;
713 24 : uloc_getLanguage(locale, lang, 3, &status);
714 24 : if (U_SUCCESS(status))
715 : {
716 : /*
717 : * The option name is confusing, but it causes u_strFoldCase to use
718 : * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
719 : */
720 24 : if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
721 12 : options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
722 : }
723 :
724 24 : return u_strFoldCase(dest, destCapacity, src, srcLength,
725 : options, pErrorCode);
726 : }
727 :
728 : /*
729 : * strncoll_icu
730 : *
731 : * Convert the arguments from the database encoding to UChar strings, then
732 : * call ucol_strcoll(). An argument length of -1 means that the string is
733 : * NUL-terminated.
734 : *
735 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
736 : * caller should call that instead.
737 : */
738 : static int
739 0 : strncoll_icu(const char *arg1, ssize_t len1,
740 : const char *arg2, ssize_t len2, pg_locale_t locale)
741 : {
742 : char sbuf[TEXTBUFLEN];
743 0 : char *buf = sbuf;
744 : int32_t ulen1;
745 : int32_t ulen2;
746 : size_t bufsize1;
747 : size_t bufsize2;
748 : UChar *uchar1,
749 : *uchar2;
750 : int result;
751 :
752 : Assert(locale->provider == COLLPROVIDER_ICU);
753 :
754 : /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
755 : #ifdef HAVE_UCOL_STRCOLLUTF8
756 : Assert(GetDatabaseEncoding() != PG_UTF8);
757 : #endif
758 :
759 0 : init_icu_converter();
760 :
761 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
762 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
763 :
764 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
765 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
766 :
767 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
768 0 : buf = palloc(bufsize1 + bufsize2);
769 :
770 0 : uchar1 = (UChar *) buf;
771 0 : uchar2 = (UChar *) (buf + bufsize1);
772 :
773 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
774 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
775 :
776 0 : result = ucol_strcoll(locale->info.icu.ucol,
777 : uchar1, ulen1,
778 : uchar2, ulen2);
779 :
780 0 : if (buf != sbuf)
781 0 : pfree(buf);
782 :
783 0 : return result;
784 : }
785 :
786 : /* 'srclen' of -1 means the strings are NUL-terminated */
787 : static size_t
788 0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
789 : const char *src, ssize_t srclen,
790 : pg_locale_t locale)
791 : {
792 : char sbuf[TEXTBUFLEN];
793 0 : char *buf = sbuf;
794 : UCharIterator iter;
795 : uint32_t state[2];
796 : UErrorCode status;
797 0 : int32_t ulen = -1;
798 0 : UChar *uchar = NULL;
799 : size_t uchar_bsize;
800 : Size result_bsize;
801 :
802 : Assert(locale->provider == COLLPROVIDER_ICU);
803 :
804 : /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
805 : Assert(GetDatabaseEncoding() != PG_UTF8);
806 :
807 0 : init_icu_converter();
808 :
809 0 : ulen = uchar_length(icu_converter, src, srclen);
810 :
811 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
812 :
813 0 : if (uchar_bsize > TEXTBUFLEN)
814 0 : buf = palloc(uchar_bsize);
815 :
816 0 : uchar = (UChar *) buf;
817 :
818 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
819 :
820 0 : uiter_setString(&iter, uchar, ulen);
821 0 : state[0] = state[1] = 0; /* won't need that again */
822 0 : status = U_ZERO_ERROR;
823 0 : result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
824 : &iter,
825 : state,
826 : (uint8_t *) dest,
827 : destsize,
828 : &status);
829 0 : if (U_FAILURE(status))
830 0 : ereport(ERROR,
831 : (errmsg("sort key generation failed: %s",
832 : u_errorName(status))));
833 :
834 0 : return result_bsize;
835 : }
836 :
837 : static void
838 11280 : init_icu_converter(void)
839 : {
840 : const char *icu_encoding_name;
841 : UErrorCode status;
842 : UConverter *conv;
843 :
844 11280 : if (icu_converter)
845 11274 : return; /* already done */
846 :
847 6 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
848 6 : if (!icu_encoding_name)
849 0 : ereport(ERROR,
850 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
851 : errmsg("encoding \"%s\" not supported by ICU",
852 : pg_encoding_to_char(GetDatabaseEncoding()))));
853 :
854 6 : status = U_ZERO_ERROR;
855 6 : conv = ucnv_open(icu_encoding_name, &status);
856 6 : if (U_FAILURE(status))
857 0 : ereport(ERROR,
858 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
859 : icu_encoding_name, u_errorName(status))));
860 :
861 6 : icu_converter = conv;
862 : }
863 :
864 : /*
865 : * Find length, in UChars, of given string if converted to UChar string.
866 : *
867 : * A length of -1 indicates that the input string is NUL-terminated.
868 : */
869 : static size_t
870 10656 : uchar_length(UConverter *converter, const char *str, int32_t len)
871 : {
872 10656 : UErrorCode status = U_ZERO_ERROR;
873 : int32_t ulen;
874 :
875 10656 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
876 10656 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
877 0 : ereport(ERROR,
878 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
879 10656 : return ulen;
880 : }
881 :
882 : /*
883 : * Convert the given source string into a UChar string, stored in dest, and
884 : * return the length (in UChars).
885 : *
886 : * A srclen of -1 indicates that the input string is NUL-terminated.
887 : */
888 : static int32_t
889 10656 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
890 : const char *src, int32_t srclen)
891 : {
892 10656 : UErrorCode status = U_ZERO_ERROR;
893 : int32_t ulen;
894 :
895 10656 : status = U_ZERO_ERROR;
896 10656 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
897 10656 : if (U_FAILURE(status))
898 0 : ereport(ERROR,
899 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
900 10656 : return ulen;
901 : }
902 :
903 : /*
904 : * Parse collation attributes from the given locale string and apply them to
905 : * the open collator.
906 : *
907 : * First, the locale string is canonicalized to an ICU format locale ID such
908 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
909 : * the key-value arguments.
910 : *
911 : * Starting with ICU version 54, the attributes are processed automatically by
912 : * ucol_open(), so this is only necessary for emulating this behavior on older
913 : * versions.
914 : */
915 : pg_attribute_unused()
916 : static void
917 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
918 : UErrorCode *status)
919 : {
920 : int32_t len;
921 : char *icu_locale_id;
922 : char *lower_str;
923 : char *str;
924 : char *token;
925 :
926 : /*
927 : * The input locale may be a BCP 47 language tag, e.g.
928 : * "und-u-kc-ks-level1", which expresses the same attributes in a
929 : * different form. It will be converted to the equivalent ICU format
930 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
931 : * uloc_canonicalize().
932 : */
933 0 : *status = U_ZERO_ERROR;
934 0 : len = uloc_canonicalize(loc, NULL, 0, status);
935 0 : icu_locale_id = palloc(len + 1);
936 0 : *status = U_ZERO_ERROR;
937 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
938 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
939 0 : return;
940 :
941 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
942 :
943 0 : pfree(icu_locale_id);
944 :
945 0 : str = strchr(lower_str, '@');
946 0 : if (!str)
947 0 : return;
948 0 : str++;
949 :
950 0 : while ((token = strsep(&str, ";")))
951 : {
952 0 : char *e = strchr(token, '=');
953 :
954 0 : if (e)
955 : {
956 : char *name;
957 : char *value;
958 : UColAttribute uattr;
959 : UColAttributeValue uvalue;
960 :
961 0 : *status = U_ZERO_ERROR;
962 :
963 0 : *e = '\0';
964 0 : name = token;
965 0 : value = e + 1;
966 :
967 : /*
968 : * See attribute name and value lists in ICU i18n/coll.cpp
969 : */
970 0 : if (strcmp(name, "colstrength") == 0)
971 0 : uattr = UCOL_STRENGTH;
972 0 : else if (strcmp(name, "colbackwards") == 0)
973 0 : uattr = UCOL_FRENCH_COLLATION;
974 0 : else if (strcmp(name, "colcaselevel") == 0)
975 0 : uattr = UCOL_CASE_LEVEL;
976 0 : else if (strcmp(name, "colcasefirst") == 0)
977 0 : uattr = UCOL_CASE_FIRST;
978 0 : else if (strcmp(name, "colalternate") == 0)
979 0 : uattr = UCOL_ALTERNATE_HANDLING;
980 0 : else if (strcmp(name, "colnormalization") == 0)
981 0 : uattr = UCOL_NORMALIZATION_MODE;
982 0 : else if (strcmp(name, "colnumeric") == 0)
983 0 : uattr = UCOL_NUMERIC_COLLATION;
984 : else
985 : /* ignore if unknown */
986 0 : continue;
987 :
988 0 : if (strcmp(value, "primary") == 0)
989 0 : uvalue = UCOL_PRIMARY;
990 0 : else if (strcmp(value, "secondary") == 0)
991 0 : uvalue = UCOL_SECONDARY;
992 0 : else if (strcmp(value, "tertiary") == 0)
993 0 : uvalue = UCOL_TERTIARY;
994 0 : else if (strcmp(value, "quaternary") == 0)
995 0 : uvalue = UCOL_QUATERNARY;
996 0 : else if (strcmp(value, "identical") == 0)
997 0 : uvalue = UCOL_IDENTICAL;
998 0 : else if (strcmp(value, "no") == 0)
999 0 : uvalue = UCOL_OFF;
1000 0 : else if (strcmp(value, "yes") == 0)
1001 0 : uvalue = UCOL_ON;
1002 0 : else if (strcmp(value, "shifted") == 0)
1003 0 : uvalue = UCOL_SHIFTED;
1004 0 : else if (strcmp(value, "non-ignorable") == 0)
1005 0 : uvalue = UCOL_NON_IGNORABLE;
1006 0 : else if (strcmp(value, "lower") == 0)
1007 0 : uvalue = UCOL_LOWER_FIRST;
1008 0 : else if (strcmp(value, "upper") == 0)
1009 0 : uvalue = UCOL_UPPER_FIRST;
1010 : else
1011 : {
1012 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
1013 0 : break;
1014 : }
1015 :
1016 0 : ucol_setAttribute(collator, uattr, uvalue, status);
1017 : }
1018 : }
1019 :
1020 0 : pfree(lower_str);
1021 : }
1022 :
1023 : #endif /* USE_ICU */
|