Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for ICU
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_icu.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #ifdef USE_ICU
15 : #include <unicode/ucnv.h>
16 : #include <unicode/ustring.h>
17 :
18 : /*
19 : * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 : * (see
21 : * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 : */
23 : #if U_ICU_VERSION_MAJOR_NUM >= 53
24 : #define HAVE_UCOL_STRCOLLUTF8 1
25 : #else
26 : #undef HAVE_UCOL_STRCOLLUTF8
27 : #endif
28 :
29 : #endif
30 :
31 : #include "access/htup_details.h"
32 : #include "catalog/pg_database.h"
33 : #include "catalog/pg_collation.h"
34 : #include "mb/pg_wchar.h"
35 : #include "miscadmin.h"
36 : #include "utils/builtins.h"
37 : #include "utils/formatting.h"
38 : #include "utils/memutils.h"
39 : #include "utils/pg_locale.h"
40 : #include "utils/syscache.h"
41 :
42 : /*
43 : * Size of stack buffer to use for string transformations, used to avoid heap
44 : * allocations in typical cases. This should be large enough that most strings
45 : * will fit, but small enough that we feel comfortable putting it on the
46 : * stack.
47 : */
48 : #define TEXTBUFLEN 1024
49 :
50 : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
51 : extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
52 : ssize_t srclen, pg_locale_t locale);
53 : extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
54 : ssize_t srclen, pg_locale_t locale);
55 : extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
56 : ssize_t srclen, pg_locale_t locale);
57 : extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
58 : ssize_t srclen, pg_locale_t locale);
59 :
60 : #ifdef USE_ICU
61 :
62 : extern UCollator *pg_ucol_open(const char *loc_str);
63 :
64 : static int strncoll_icu(const char *arg1, ssize_t len1,
65 : const char *arg2, ssize_t len2,
66 : pg_locale_t locale);
67 : static size_t strnxfrm_icu(char *dest, size_t destsize,
68 : const char *src, ssize_t srclen,
69 : pg_locale_t locale);
70 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
71 : const char *src, ssize_t srclen,
72 : pg_locale_t locale);
73 : extern char *get_collation_actual_version_icu(const char *collcollate);
74 :
75 : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
76 : const UChar *src, int32_t srcLength,
77 : const char *locale,
78 : UErrorCode *pErrorCode);
79 :
80 : /*
81 : * Converter object for converting between ICU's UChar strings and C strings
82 : * in database encoding. Since the database encoding doesn't change, we only
83 : * need one of these per session.
84 : */
85 : static UConverter *icu_converter = NULL;
86 :
87 : static UCollator *make_icu_collator(const char *iculocstr,
88 : const char *icurules);
89 : static int strncoll_icu(const char *arg1, ssize_t len1,
90 : const char *arg2, ssize_t len2,
91 : pg_locale_t locale);
92 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
93 : const char *src, ssize_t srclen,
94 : pg_locale_t locale);
95 : #ifdef HAVE_UCOL_STRCOLLUTF8
96 : static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
97 : const char *arg2, ssize_t len2,
98 : pg_locale_t locale);
99 : #endif
100 : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
101 : const char *src, ssize_t srclen,
102 : pg_locale_t locale);
103 : static void init_icu_converter(void);
104 : static size_t uchar_length(UConverter *converter,
105 : const char *str, int32_t len);
106 : static int32_t uchar_convert(UConverter *converter,
107 : UChar *dest, int32_t destlen,
108 : const char *src, int32_t srclen);
109 : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
110 : size_t nbytes);
111 : static size_t icu_from_uchar(char *dest, size_t destsize,
112 : const UChar *buff_uchar, int32_t len_uchar);
113 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
114 : UErrorCode *status);
115 : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
116 : UChar **buff_dest, UChar *buff_source,
117 : int32_t len_source);
118 : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
119 : const UChar *src, int32_t srcLength,
120 : const char *locale,
121 : UErrorCode *pErrorCode);
122 : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
123 : const UChar *src, int32_t srcLength,
124 : const char *locale,
125 : UErrorCode *pErrorCode);
126 :
127 : static const struct collate_methods collate_methods_icu = {
128 : .strncoll = strncoll_icu,
129 : .strnxfrm = strnxfrm_icu,
130 : .strnxfrm_prefix = strnxfrm_prefix_icu,
131 : .strxfrm_is_safe = true,
132 : };
133 :
134 : static const struct collate_methods collate_methods_icu_utf8 = {
135 : #ifdef HAVE_UCOL_STRCOLLUTF8
136 : .strncoll = strncoll_icu_utf8,
137 : #else
138 : .strncoll = strncoll_icu,
139 : #endif
140 : .strnxfrm = strnxfrm_icu,
141 : .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
142 : .strxfrm_is_safe = true,
143 : };
144 :
145 : #endif
146 :
147 : pg_locale_t
148 210 : create_pg_locale_icu(Oid collid, MemoryContext context)
149 : {
150 : #ifdef USE_ICU
151 : bool deterministic;
152 : const char *iculocstr;
153 210 : const char *icurules = NULL;
154 : UCollator *collator;
155 : pg_locale_t result;
156 :
157 210 : if (collid == DEFAULT_COLLATION_OID)
158 : {
159 : HeapTuple tp;
160 : Datum datum;
161 : bool isnull;
162 :
163 26 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
164 26 : if (!HeapTupleIsValid(tp))
165 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
166 :
167 : /* default database collation is always deterministic */
168 26 : deterministic = true;
169 26 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
170 : Anum_pg_database_datlocale);
171 26 : iculocstr = TextDatumGetCString(datum);
172 26 : datum = SysCacheGetAttr(DATABASEOID, tp,
173 : Anum_pg_database_daticurules, &isnull);
174 26 : if (!isnull)
175 0 : icurules = TextDatumGetCString(datum);
176 :
177 26 : ReleaseSysCache(tp);
178 : }
179 : else
180 : {
181 : Form_pg_collation collform;
182 : HeapTuple tp;
183 : Datum datum;
184 : bool isnull;
185 :
186 184 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
187 184 : if (!HeapTupleIsValid(tp))
188 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
189 184 : collform = (Form_pg_collation) GETSTRUCT(tp);
190 184 : deterministic = collform->collisdeterministic;
191 184 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
192 : Anum_pg_collation_colllocale);
193 184 : iculocstr = TextDatumGetCString(datum);
194 184 : datum = SysCacheGetAttr(COLLOID, tp,
195 : Anum_pg_collation_collicurules, &isnull);
196 184 : if (!isnull)
197 12 : icurules = TextDatumGetCString(datum);
198 :
199 184 : ReleaseSysCache(tp);
200 : }
201 :
202 210 : collator = make_icu_collator(iculocstr, icurules);
203 :
204 200 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
205 200 : result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
206 200 : result->info.icu.ucol = collator;
207 200 : result->provider = COLLPROVIDER_ICU;
208 200 : result->deterministic = deterministic;
209 200 : result->collate_is_c = false;
210 200 : result->ctype_is_c = false;
211 200 : if (GetDatabaseEncoding() == PG_UTF8)
212 200 : result->collate = &collate_methods_icu_utf8;
213 : else
214 0 : result->collate = &collate_methods_icu;
215 :
216 200 : return result;
217 : #else
218 : /* could get here if a collation was created by a build with ICU */
219 : ereport(ERROR,
220 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
221 : errmsg("ICU is not supported in this build")));
222 :
223 : return NULL;
224 : #endif
225 : }
226 :
227 : #ifdef USE_ICU
228 :
229 : /*
230 : * Wrapper around ucol_open() to handle API differences for older ICU
231 : * versions.
232 : *
233 : * Ensure that no path leaks a UCollator.
234 : */
235 : UCollator *
236 68150 : pg_ucol_open(const char *loc_str)
237 : {
238 : UCollator *collator;
239 : UErrorCode status;
240 68150 : const char *orig_str = loc_str;
241 68150 : char *fixed_str = NULL;
242 :
243 : /*
244 : * Must never open default collator, because it depends on the environment
245 : * and may change at any time. Should not happen, but check here to catch
246 : * bugs that might be hard to catch otherwise.
247 : *
248 : * NB: the default collator is not the same as the collator for the root
249 : * locale. The root locale may be specified as the empty string, "und", or
250 : * "root". The default collator is opened by passing NULL to ucol_open().
251 : */
252 68150 : if (loc_str == NULL)
253 0 : elog(ERROR, "opening default collator is not supported");
254 :
255 : /*
256 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
257 : * the root locale. If the first component of the locale is "und", replace
258 : * with "root" before opening.
259 : */
260 : if (U_ICU_VERSION_MAJOR_NUM < 55)
261 : {
262 : char lang[ULOC_LANG_CAPACITY];
263 :
264 : status = U_ZERO_ERROR;
265 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
266 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
267 : {
268 : ereport(ERROR,
269 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
270 : errmsg("could not get language from locale \"%s\": %s",
271 : loc_str, u_errorName(status))));
272 : }
273 :
274 : if (strcmp(lang, "und") == 0)
275 : {
276 : const char *remainder = loc_str + strlen("und");
277 :
278 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
279 : strcpy(fixed_str, "root");
280 : strcat(fixed_str, remainder);
281 :
282 : loc_str = fixed_str;
283 : }
284 : }
285 :
286 68150 : status = U_ZERO_ERROR;
287 68150 : collator = ucol_open(loc_str, &status);
288 68150 : if (U_FAILURE(status))
289 12 : ereport(ERROR,
290 : /* use original string for error report */
291 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
292 : errmsg("could not open collator for locale \"%s\": %s",
293 : orig_str, u_errorName(status))));
294 :
295 : if (U_ICU_VERSION_MAJOR_NUM < 54)
296 : {
297 : status = U_ZERO_ERROR;
298 : icu_set_collation_attributes(collator, loc_str, &status);
299 :
300 : /*
301 : * Pretend the error came from ucol_open(), for consistent error
302 : * message across ICU versions.
303 : */
304 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
305 : {
306 : ucol_close(collator);
307 : ereport(ERROR,
308 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
309 : errmsg("could not open collator for locale \"%s\": %s",
310 : orig_str, u_errorName(status))));
311 : }
312 : }
313 :
314 68138 : if (fixed_str != NULL)
315 0 : pfree(fixed_str);
316 :
317 68138 : return collator;
318 : }
319 :
320 : /*
321 : * Create a UCollator with the given locale string and rules.
322 : *
323 : * Ensure that no path leaks a UCollator.
324 : */
325 : static UCollator *
326 210 : make_icu_collator(const char *iculocstr, const char *icurules)
327 : {
328 210 : if (!icurules)
329 : {
330 : /* simple case without rules */
331 198 : return pg_ucol_open(iculocstr);
332 : }
333 : else
334 : {
335 : UCollator *collator_std_rules;
336 : UCollator *collator_all_rules;
337 : const UChar *std_rules;
338 : UChar *my_rules;
339 : UChar *all_rules;
340 : int32_t length;
341 : int32_t total;
342 : UErrorCode status;
343 :
344 : /*
345 : * If rules are specified, we extract the rules of the standard
346 : * collation, add our own rules, and make a new collator with the
347 : * combined rules.
348 : */
349 12 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
350 :
351 12 : collator_std_rules = pg_ucol_open(iculocstr);
352 :
353 12 : std_rules = ucol_getRules(collator_std_rules, &length);
354 :
355 12 : total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
356 :
357 : /* avoid leaking collator on OOM */
358 12 : all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
359 12 : if (!all_rules)
360 : {
361 0 : ucol_close(collator_std_rules);
362 0 : ereport(ERROR,
363 : (errcode(ERRCODE_OUT_OF_MEMORY),
364 : errmsg("out of memory")));
365 : }
366 :
367 12 : u_strcpy(all_rules, std_rules);
368 12 : u_strcat(all_rules, my_rules);
369 :
370 12 : ucol_close(collator_std_rules);
371 :
372 12 : status = U_ZERO_ERROR;
373 12 : collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
374 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
375 : NULL, &status);
376 12 : if (U_FAILURE(status))
377 : {
378 6 : ereport(ERROR,
379 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
380 : errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
381 : iculocstr, icurules, u_errorName(status))));
382 : }
383 :
384 6 : return collator_all_rules;
385 : }
386 : }
387 :
388 : size_t
389 528 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
390 : pg_locale_t locale)
391 : {
392 : int32_t len_uchar;
393 : int32_t len_conv;
394 : UChar *buff_uchar;
395 : UChar *buff_conv;
396 : size_t result_len;
397 :
398 528 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
399 528 : len_conv = icu_convert_case(u_strToLower, locale,
400 : &buff_conv, buff_uchar, len_uchar);
401 528 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
402 528 : pfree(buff_uchar);
403 528 : pfree(buff_conv);
404 :
405 528 : return result_len;
406 : }
407 :
408 : size_t
409 30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
410 : pg_locale_t locale)
411 : {
412 : int32_t len_uchar;
413 : int32_t len_conv;
414 : UChar *buff_uchar;
415 : UChar *buff_conv;
416 : size_t result_len;
417 :
418 30 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
419 30 : len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
420 : &buff_conv, buff_uchar, len_uchar);
421 30 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
422 30 : pfree(buff_uchar);
423 30 : pfree(buff_conv);
424 :
425 30 : return result_len;
426 : }
427 :
428 : size_t
429 54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
430 : pg_locale_t locale)
431 : {
432 : int32_t len_uchar;
433 : int32_t len_conv;
434 : UChar *buff_uchar;
435 : UChar *buff_conv;
436 : size_t result_len;
437 :
438 54 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
439 54 : len_conv = icu_convert_case(u_strToUpper, locale,
440 : &buff_conv, buff_uchar, len_uchar);
441 54 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
442 54 : pfree(buff_uchar);
443 54 : pfree(buff_conv);
444 :
445 54 : return result_len;
446 : }
447 :
448 : size_t
449 12 : strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
450 : pg_locale_t locale)
451 : {
452 : int32_t len_uchar;
453 : int32_t len_conv;
454 : UChar *buff_uchar;
455 : UChar *buff_conv;
456 : size_t result_len;
457 :
458 12 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
459 12 : len_conv = icu_convert_case(u_strFoldCase_default, locale,
460 : &buff_conv, buff_uchar, len_uchar);
461 12 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
462 12 : pfree(buff_uchar);
463 12 : pfree(buff_conv);
464 :
465 12 : return result_len;
466 : }
467 :
/*
 * strncoll_icu_utf8
 *
 * Compare two strings with ucol_strcollUTF8(), passing the UTF-8 bytes to
 * ICU directly.  Only compiled when HAVE_UCOL_STRCOLLUTF8 is defined, and
 * only valid when the database encoding is UTF-8.  An argument length of -1
 * means the string is NUL-terminated.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
				  pg_locale_t locale)
{
	UErrorCode	rc = U_ZERO_ERROR;
	int			cmp;

	Assert(locale->provider == COLLPROVIDER_ICU);
	Assert(GetDatabaseEncoding() == PG_UTF8);

	cmp = ucol_strcollUTF8(locale->info.icu.ucol,
						   arg1, len1,
						   arg2, len2,
						   &rc);
	if (U_FAILURE(rc))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(rc))));

	return cmp;
}
#endif
499 :
500 : /* 'srclen' of -1 means the strings are NUL-terminated */
501 : size_t
502 10020 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
503 : pg_locale_t locale)
504 : {
505 : char sbuf[TEXTBUFLEN];
506 10020 : char *buf = sbuf;
507 : UChar *uchar;
508 : int32_t ulen;
509 : size_t uchar_bsize;
510 : Size result_bsize;
511 :
512 : Assert(locale->provider == COLLPROVIDER_ICU);
513 :
514 10020 : init_icu_converter();
515 :
516 10020 : ulen = uchar_length(icu_converter, src, srclen);
517 :
518 10020 : uchar_bsize = (ulen + 1) * sizeof(UChar);
519 :
520 10020 : if (uchar_bsize > TEXTBUFLEN)
521 0 : buf = palloc(uchar_bsize);
522 :
523 10020 : uchar = (UChar *) buf;
524 :
525 10020 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
526 :
527 10020 : result_bsize = ucol_getSortKey(locale->info.icu.ucol,
528 : uchar, ulen,
529 : (uint8_t *) dest, destsize);
530 :
531 : /*
532 : * ucol_getSortKey() counts the nul-terminator in the result length, but
533 : * this function should not.
534 : */
535 : Assert(result_bsize > 0);
536 10020 : result_bsize--;
537 :
538 10020 : if (buf != sbuf)
539 0 : pfree(buf);
540 :
541 : /* if dest is defined, it should be nul-terminated */
542 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
543 :
544 10020 : return result_bsize;
545 : }
546 :
547 : /* 'srclen' of -1 means the strings are NUL-terminated */
548 : size_t
549 1656 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
550 : const char *src, ssize_t srclen,
551 : pg_locale_t locale)
552 : {
553 : size_t result;
554 : UCharIterator iter;
555 : uint32_t state[2];
556 : UErrorCode status;
557 :
558 : Assert(locale->provider == COLLPROVIDER_ICU);
559 :
560 : Assert(GetDatabaseEncoding() == PG_UTF8);
561 :
562 1656 : uiter_setUTF8(&iter, src, srclen);
563 1656 : state[0] = state[1] = 0; /* won't need that again */
564 1656 : status = U_ZERO_ERROR;
565 1656 : result = ucol_nextSortKeyPart(locale->info.icu.ucol,
566 : &iter,
567 : state,
568 : (uint8_t *) dest,
569 : destsize,
570 : &status);
571 1656 : if (U_FAILURE(status))
572 0 : ereport(ERROR,
573 : (errmsg("sort key generation failed: %s",
574 : u_errorName(status))));
575 :
576 1656 : return result;
577 : }
578 :
579 : char *
580 67792 : get_collation_actual_version_icu(const char *collcollate)
581 : {
582 : UCollator *collator;
583 : UVersionInfo versioninfo;
584 : char buf[U_MAX_VERSION_STRING_LENGTH];
585 :
586 67792 : collator = pg_ucol_open(collcollate);
587 :
588 67792 : ucol_getVersion(collator, versioninfo);
589 67792 : ucol_close(collator);
590 :
591 67792 : u_versionToString(versioninfo, buf);
592 67792 : return pstrdup(buf);
593 : }
594 :
595 : /*
596 : * Convert a string in the database encoding into a string of UChars.
597 : *
598 : * The source string at buff is of length nbytes
599 : * (it needn't be nul-terminated)
600 : *
601 : * *buff_uchar receives a pointer to the palloc'd result string, and
602 : * the function's result is the number of UChars generated.
603 : *
604 : * The result string is nul-terminated, though most callers rely on the
605 : * result length instead.
606 : */
607 : static int32_t
608 636 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
609 : {
610 : int32_t len_uchar;
611 :
612 636 : init_icu_converter();
613 :
614 636 : len_uchar = uchar_length(icu_converter, buff, nbytes);
615 :
616 636 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
617 636 : len_uchar = uchar_convert(icu_converter,
618 : *buff_uchar, len_uchar + 1, buff, nbytes);
619 :
620 636 : return len_uchar;
621 : }
622 :
623 : /*
624 : * Convert a string of UChars into the database encoding.
625 : *
626 : * The source string at buff_uchar is of length len_uchar
627 : * (it needn't be nul-terminated)
628 : *
629 : * *result receives a pointer to the palloc'd result string, and the
630 : * function's result is the number of bytes generated (not counting nul).
631 : *
632 : * The result string is nul-terminated.
633 : */
634 : static size_t
635 624 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
636 : {
637 : UErrorCode status;
638 : int32_t len_result;
639 :
640 624 : init_icu_converter();
641 :
642 624 : status = U_ZERO_ERROR;
643 624 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
644 : buff_uchar, len_uchar, &status);
645 624 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
646 0 : ereport(ERROR,
647 : (errmsg("%s failed: %s", "ucnv_fromUChars",
648 : u_errorName(status))));
649 :
650 624 : if (len_result + 1 > destsize)
651 60 : return len_result;
652 :
653 564 : status = U_ZERO_ERROR;
654 564 : len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
655 : buff_uchar, len_uchar, &status);
656 564 : if (U_FAILURE(status) ||
657 564 : status == U_STRING_NOT_TERMINATED_WARNING)
658 0 : ereport(ERROR,
659 : (errmsg("%s failed: %s", "ucnv_fromUChars",
660 : u_errorName(status))));
661 :
662 564 : return len_result;
663 : }
664 :
665 : static int32_t
666 624 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
667 : UChar **buff_dest, UChar *buff_source, int32_t len_source)
668 : {
669 : UErrorCode status;
670 : int32_t len_dest;
671 :
672 624 : len_dest = len_source; /* try first with same length */
673 624 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
674 624 : status = U_ZERO_ERROR;
675 624 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
676 : mylocale->info.icu.locale, &status);
677 624 : if (status == U_BUFFER_OVERFLOW_ERROR)
678 : {
679 : /* try again with adjusted length */
680 18 : pfree(*buff_dest);
681 18 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
682 18 : status = U_ZERO_ERROR;
683 18 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
684 : mylocale->info.icu.locale, &status);
685 : }
686 624 : if (U_FAILURE(status))
687 0 : ereport(ERROR,
688 : (errmsg("case conversion failed: %s", u_errorName(status))));
689 624 : return len_dest;
690 : }
691 :
692 : static int32_t
693 30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
694 : const UChar *src, int32_t srcLength,
695 : const char *locale,
696 : UErrorCode *pErrorCode)
697 : {
698 30 : return u_strToTitle(dest, destCapacity, src, srcLength,
699 : NULL, locale, pErrorCode);
700 : }
701 :
702 : static int32_t
703 24 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
704 : const UChar *src, int32_t srcLength,
705 : const char *locale,
706 : UErrorCode *pErrorCode)
707 : {
708 24 : uint32 options = U_FOLD_CASE_DEFAULT;
709 : char lang[3];
710 : UErrorCode status;
711 :
712 : /*
713 : * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
714 : * folding does not accept a locale. Instead it just supports a single
715 : * option relevant to Turkic languages 'az' and 'tr'; check for those
716 : * languages to enable the option.
717 : */
718 24 : status = U_ZERO_ERROR;
719 24 : uloc_getLanguage(locale, lang, 3, &status);
720 24 : if (U_SUCCESS(status))
721 : {
722 : /*
723 : * The option name is confusing, but it causes u_strFoldCase to use
724 : * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
725 : */
726 24 : if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
727 12 : options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
728 : }
729 :
730 24 : return u_strFoldCase(dest, destCapacity, src, srcLength,
731 : options, pErrorCode);
732 : }
733 :
734 : /*
735 : * strncoll_icu
736 : *
737 : * Convert the arguments from the database encoding to UChar strings, then
738 : * call ucol_strcoll(). An argument length of -1 means that the string is
739 : * NUL-terminated.
740 : *
741 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
742 : * caller should call that instead.
743 : */
744 : static int
745 0 : strncoll_icu(const char *arg1, ssize_t len1,
746 : const char *arg2, ssize_t len2, pg_locale_t locale)
747 : {
748 : char sbuf[TEXTBUFLEN];
749 0 : char *buf = sbuf;
750 : int32_t ulen1;
751 : int32_t ulen2;
752 : size_t bufsize1;
753 : size_t bufsize2;
754 : UChar *uchar1,
755 : *uchar2;
756 : int result;
757 :
758 : Assert(locale->provider == COLLPROVIDER_ICU);
759 :
760 : /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
761 : #ifdef HAVE_UCOL_STRCOLLUTF8
762 : Assert(GetDatabaseEncoding() != PG_UTF8);
763 : #endif
764 :
765 0 : init_icu_converter();
766 :
767 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
768 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
769 :
770 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
771 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
772 :
773 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
774 0 : buf = palloc(bufsize1 + bufsize2);
775 :
776 0 : uchar1 = (UChar *) buf;
777 0 : uchar2 = (UChar *) (buf + bufsize1);
778 :
779 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
780 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
781 :
782 0 : result = ucol_strcoll(locale->info.icu.ucol,
783 : uchar1, ulen1,
784 : uchar2, ulen2);
785 :
786 0 : if (buf != sbuf)
787 0 : pfree(buf);
788 :
789 0 : return result;
790 : }
791 :
792 : /* 'srclen' of -1 means the strings are NUL-terminated */
793 : static size_t
794 0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
795 : const char *src, ssize_t srclen,
796 : pg_locale_t locale)
797 : {
798 : char sbuf[TEXTBUFLEN];
799 0 : char *buf = sbuf;
800 : UCharIterator iter;
801 : uint32_t state[2];
802 : UErrorCode status;
803 0 : int32_t ulen = -1;
804 0 : UChar *uchar = NULL;
805 : size_t uchar_bsize;
806 : Size result_bsize;
807 :
808 : Assert(locale->provider == COLLPROVIDER_ICU);
809 :
810 : /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
811 : Assert(GetDatabaseEncoding() != PG_UTF8);
812 :
813 0 : init_icu_converter();
814 :
815 0 : ulen = uchar_length(icu_converter, src, srclen);
816 :
817 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
818 :
819 0 : if (uchar_bsize > TEXTBUFLEN)
820 0 : buf = palloc(uchar_bsize);
821 :
822 0 : uchar = (UChar *) buf;
823 :
824 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
825 :
826 0 : uiter_setString(&iter, uchar, ulen);
827 0 : state[0] = state[1] = 0; /* won't need that again */
828 0 : status = U_ZERO_ERROR;
829 0 : result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
830 : &iter,
831 : state,
832 : (uint8_t *) dest,
833 : destsize,
834 : &status);
835 0 : if (U_FAILURE(status))
836 0 : ereport(ERROR,
837 : (errmsg("sort key generation failed: %s",
838 : u_errorName(status))));
839 :
840 0 : return result_bsize;
841 : }
842 :
843 : static void
844 11280 : init_icu_converter(void)
845 : {
846 : const char *icu_encoding_name;
847 : UErrorCode status;
848 : UConverter *conv;
849 :
850 11280 : if (icu_converter)
851 11274 : return; /* already done */
852 :
853 6 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
854 6 : if (!icu_encoding_name)
855 0 : ereport(ERROR,
856 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
857 : errmsg("encoding \"%s\" not supported by ICU",
858 : pg_encoding_to_char(GetDatabaseEncoding()))));
859 :
860 6 : status = U_ZERO_ERROR;
861 6 : conv = ucnv_open(icu_encoding_name, &status);
862 6 : if (U_FAILURE(status))
863 0 : ereport(ERROR,
864 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
865 : icu_encoding_name, u_errorName(status))));
866 :
867 6 : icu_converter = conv;
868 : }
869 :
870 : /*
871 : * Find length, in UChars, of given string if converted to UChar string.
872 : *
873 : * A length of -1 indicates that the input string is NUL-terminated.
874 : */
875 : static size_t
876 10656 : uchar_length(UConverter *converter, const char *str, int32_t len)
877 : {
878 10656 : UErrorCode status = U_ZERO_ERROR;
879 : int32_t ulen;
880 :
881 10656 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
882 10656 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
883 0 : ereport(ERROR,
884 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
885 10656 : return ulen;
886 : }
887 :
888 : /*
889 : * Convert the given source string into a UChar string, stored in dest, and
890 : * return the length (in UChars).
891 : *
892 : * A srclen of -1 indicates that the input string is NUL-terminated.
893 : */
894 : static int32_t
895 10656 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
896 : const char *src, int32_t srclen)
897 : {
898 10656 : UErrorCode status = U_ZERO_ERROR;
899 : int32_t ulen;
900 :
901 10656 : status = U_ZERO_ERROR;
902 10656 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
903 10656 : if (U_FAILURE(status))
904 0 : ereport(ERROR,
905 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
906 10656 : return ulen;
907 : }
908 :
909 : /*
910 : * Parse collation attributes from the given locale string and apply them to
911 : * the open collator.
912 : *
913 : * First, the locale string is canonicalized to an ICU format locale ID such
914 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
915 : * the key-value arguments.
916 : *
917 : * Starting with ICU version 54, the attributes are processed automatically by
918 : * ucol_open(), so this is only necessary for emulating this behavior on older
919 : * versions.
920 : */
921 : pg_attribute_unused()
922 : static void
923 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
924 : UErrorCode *status)
925 : {
926 : int32_t len;
927 : char *icu_locale_id;
928 : char *lower_str;
929 : char *str;
930 : char *token;
931 :
932 : /*
933 : * The input locale may be a BCP 47 language tag, e.g.
934 : * "und-u-kc-ks-level1", which expresses the same attributes in a
935 : * different form. It will be converted to the equivalent ICU format
936 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
937 : * uloc_canonicalize().
938 : */
939 0 : *status = U_ZERO_ERROR;
940 0 : len = uloc_canonicalize(loc, NULL, 0, status);
941 0 : icu_locale_id = palloc(len + 1);
942 0 : *status = U_ZERO_ERROR;
943 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
944 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
945 0 : return;
946 :
947 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
948 :
949 0 : pfree(icu_locale_id);
950 :
951 0 : str = strchr(lower_str, '@');
952 0 : if (!str)
953 0 : return;
954 0 : str++;
955 :
956 0 : while ((token = strsep(&str, ";")))
957 : {
958 0 : char *e = strchr(token, '=');
959 :
960 0 : if (e)
961 : {
962 : char *name;
963 : char *value;
964 : UColAttribute uattr;
965 : UColAttributeValue uvalue;
966 :
967 0 : *status = U_ZERO_ERROR;
968 :
969 0 : *e = '\0';
970 0 : name = token;
971 0 : value = e + 1;
972 :
973 : /*
974 : * See attribute name and value lists in ICU i18n/coll.cpp
975 : */
976 0 : if (strcmp(name, "colstrength") == 0)
977 0 : uattr = UCOL_STRENGTH;
978 0 : else if (strcmp(name, "colbackwards") == 0)
979 0 : uattr = UCOL_FRENCH_COLLATION;
980 0 : else if (strcmp(name, "colcaselevel") == 0)
981 0 : uattr = UCOL_CASE_LEVEL;
982 0 : else if (strcmp(name, "colcasefirst") == 0)
983 0 : uattr = UCOL_CASE_FIRST;
984 0 : else if (strcmp(name, "colalternate") == 0)
985 0 : uattr = UCOL_ALTERNATE_HANDLING;
986 0 : else if (strcmp(name, "colnormalization") == 0)
987 0 : uattr = UCOL_NORMALIZATION_MODE;
988 0 : else if (strcmp(name, "colnumeric") == 0)
989 0 : uattr = UCOL_NUMERIC_COLLATION;
990 : else
991 : /* ignore if unknown */
992 0 : continue;
993 :
994 0 : if (strcmp(value, "primary") == 0)
995 0 : uvalue = UCOL_PRIMARY;
996 0 : else if (strcmp(value, "secondary") == 0)
997 0 : uvalue = UCOL_SECONDARY;
998 0 : else if (strcmp(value, "tertiary") == 0)
999 0 : uvalue = UCOL_TERTIARY;
1000 0 : else if (strcmp(value, "quaternary") == 0)
1001 0 : uvalue = UCOL_QUATERNARY;
1002 0 : else if (strcmp(value, "identical") == 0)
1003 0 : uvalue = UCOL_IDENTICAL;
1004 0 : else if (strcmp(value, "no") == 0)
1005 0 : uvalue = UCOL_OFF;
1006 0 : else if (strcmp(value, "yes") == 0)
1007 0 : uvalue = UCOL_ON;
1008 0 : else if (strcmp(value, "shifted") == 0)
1009 0 : uvalue = UCOL_SHIFTED;
1010 0 : else if (strcmp(value, "non-ignorable") == 0)
1011 0 : uvalue = UCOL_NON_IGNORABLE;
1012 0 : else if (strcmp(value, "lower") == 0)
1013 0 : uvalue = UCOL_LOWER_FIRST;
1014 0 : else if (strcmp(value, "upper") == 0)
1015 0 : uvalue = UCOL_UPPER_FIRST;
1016 : else
1017 : {
1018 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
1019 0 : break;
1020 : }
1021 :
1022 0 : ucol_setAttribute(collator, uattr, uvalue, status);
1023 : }
1024 : }
1025 :
1026 0 : pfree(lower_str);
1027 : }
1028 :
1029 : #endif /* USE_ICU */
|