Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for ICU
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_icu.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #ifdef USE_ICU
15 : #include <unicode/ucnv.h>
16 : #include <unicode/ustring.h>
17 :
18 : /*
19 : * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 : * (see
21 : * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 : */
23 : #if U_ICU_VERSION_MAJOR_NUM >= 53
24 : #define HAVE_UCOL_STRCOLLUTF8 1
25 : #else
26 : #undef HAVE_UCOL_STRCOLLUTF8
27 : #endif
28 :
29 : #endif
30 :
31 : #include "access/htup_details.h"
32 : #include "catalog/pg_database.h"
33 : #include "catalog/pg_collation.h"
34 : #include "mb/pg_wchar.h"
35 : #include "miscadmin.h"
36 : #include "utils/builtins.h"
37 : #include "utils/formatting.h"
38 : #include "utils/memutils.h"
39 : #include "utils/pg_locale.h"
40 : #include "utils/syscache.h"
41 :
42 : /*
43 : * Size of stack buffer to use for string transformations, used to avoid heap
44 : * allocations in typical cases. This should be large enough that most strings
45 : * will fit, but small enough that we feel comfortable putting it on the
46 : * stack.
47 : */
48 : #define TEXTBUFLEN 1024
49 :
50 : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
51 : extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
52 : ssize_t srclen, pg_locale_t locale);
53 : extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
54 : ssize_t srclen, pg_locale_t locale);
55 : extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
56 : ssize_t srclen, pg_locale_t locale);
57 :
58 : #ifdef USE_ICU
59 :
60 : extern UCollator *pg_ucol_open(const char *loc_str);
61 :
62 : static int strncoll_icu(const char *arg1, ssize_t len1,
63 : const char *arg2, ssize_t len2,
64 : pg_locale_t locale);
65 : static size_t strnxfrm_icu(char *dest, size_t destsize,
66 : const char *src, ssize_t srclen,
67 : pg_locale_t locale);
68 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
69 : const char *src, ssize_t srclen,
70 : pg_locale_t locale);
71 : extern char *get_collation_actual_version_icu(const char *collcollate);
72 :
73 : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
74 : const UChar *src, int32_t srcLength,
75 : const char *locale,
76 : UErrorCode *pErrorCode);
77 :
78 : /*
79 : * Converter object for converting between ICU's UChar strings and C strings
80 : * in database encoding. Since the database encoding doesn't change, we only
81 : * need one of these per session.
82 : */
83 : static UConverter *icu_converter = NULL;
84 :
85 : static UCollator *make_icu_collator(const char *iculocstr,
86 : const char *icurules);
87 : static int strncoll_icu(const char *arg1, ssize_t len1,
88 : const char *arg2, ssize_t len2,
89 : pg_locale_t locale);
90 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
91 : const char *src, ssize_t srclen,
92 : pg_locale_t locale);
93 : #ifdef HAVE_UCOL_STRCOLLUTF8
94 : static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
95 : const char *arg2, ssize_t len2,
96 : pg_locale_t locale);
97 : #endif
98 : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
99 : const char *src, ssize_t srclen,
100 : pg_locale_t locale);
101 : static void init_icu_converter(void);
102 : static size_t uchar_length(UConverter *converter,
103 : const char *str, int32_t len);
104 : static int32_t uchar_convert(UConverter *converter,
105 : UChar *dest, int32_t destlen,
106 : const char *src, int32_t srclen);
107 : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
108 : size_t nbytes);
109 : static size_t icu_from_uchar(char *dest, size_t destsize,
110 : const UChar *buff_uchar, int32_t len_uchar);
111 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
112 : UErrorCode *status);
113 : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
114 : UChar **buff_dest, UChar *buff_source,
115 : int32_t len_source);
116 : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
117 : const UChar *src, int32_t srcLength,
118 : const char *locale,
119 : UErrorCode *pErrorCode);
120 :
121 : static const struct collate_methods collate_methods_icu = {
122 : .strncoll = strncoll_icu,
123 : .strnxfrm = strnxfrm_icu,
124 : .strnxfrm_prefix = strnxfrm_prefix_icu,
125 : .strxfrm_is_safe = true,
126 : };
127 :
128 : static const struct collate_methods collate_methods_icu_utf8 = {
129 : #ifdef HAVE_UCOL_STRCOLLUTF8
130 : .strncoll = strncoll_icu_utf8,
131 : #else
132 : .strncoll = strncoll_icu,
133 : #endif
134 : .strnxfrm = strnxfrm_icu,
135 : .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
136 : .strxfrm_is_safe = true,
137 : };
138 :
139 : #endif
140 :
141 : pg_locale_t
142 210 : create_pg_locale_icu(Oid collid, MemoryContext context)
143 : {
144 : #ifdef USE_ICU
145 : bool deterministic;
146 : const char *iculocstr;
147 210 : const char *icurules = NULL;
148 : UCollator *collator;
149 : pg_locale_t result;
150 :
151 210 : if (collid == DEFAULT_COLLATION_OID)
152 : {
153 : HeapTuple tp;
154 : Datum datum;
155 : bool isnull;
156 :
157 26 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
158 26 : if (!HeapTupleIsValid(tp))
159 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
160 :
161 : /* default database collation is always deterministic */
162 26 : deterministic = true;
163 26 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
164 : Anum_pg_database_datlocale);
165 26 : iculocstr = TextDatumGetCString(datum);
166 26 : datum = SysCacheGetAttr(DATABASEOID, tp,
167 : Anum_pg_database_daticurules, &isnull);
168 26 : if (!isnull)
169 0 : icurules = TextDatumGetCString(datum);
170 :
171 26 : ReleaseSysCache(tp);
172 : }
173 : else
174 : {
175 : Form_pg_collation collform;
176 : HeapTuple tp;
177 : Datum datum;
178 : bool isnull;
179 :
180 184 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
181 184 : if (!HeapTupleIsValid(tp))
182 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
183 184 : collform = (Form_pg_collation) GETSTRUCT(tp);
184 184 : deterministic = collform->collisdeterministic;
185 184 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
186 : Anum_pg_collation_colllocale);
187 184 : iculocstr = TextDatumGetCString(datum);
188 184 : datum = SysCacheGetAttr(COLLOID, tp,
189 : Anum_pg_collation_collicurules, &isnull);
190 184 : if (!isnull)
191 12 : icurules = TextDatumGetCString(datum);
192 :
193 184 : ReleaseSysCache(tp);
194 : }
195 :
196 210 : collator = make_icu_collator(iculocstr, icurules);
197 :
198 200 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
199 200 : result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
200 200 : result->info.icu.ucol = collator;
201 200 : result->provider = COLLPROVIDER_ICU;
202 200 : result->deterministic = deterministic;
203 200 : result->collate_is_c = false;
204 200 : result->ctype_is_c = false;
205 200 : if (GetDatabaseEncoding() == PG_UTF8)
206 200 : result->collate = &collate_methods_icu_utf8;
207 : else
208 0 : result->collate = &collate_methods_icu;
209 :
210 200 : return result;
211 : #else
212 : /* could get here if a collation was created by a build with ICU */
213 : ereport(ERROR,
214 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
215 : errmsg("ICU is not supported in this build")));
216 :
217 : return NULL;
218 : #endif
219 : }
220 :
221 : #ifdef USE_ICU
222 :
223 : /*
224 : * Wrapper around ucol_open() to handle API differences for older ICU
225 : * versions.
226 : *
227 : * Ensure that no path leaks a UCollator.
228 : */
229 : UCollator *
230 68150 : pg_ucol_open(const char *loc_str)
231 : {
232 : UCollator *collator;
233 : UErrorCode status;
234 68150 : const char *orig_str = loc_str;
235 68150 : char *fixed_str = NULL;
236 :
237 : /*
238 : * Must never open default collator, because it depends on the environment
239 : * and may change at any time. Should not happen, but check here to catch
240 : * bugs that might be hard to catch otherwise.
241 : *
242 : * NB: the default collator is not the same as the collator for the root
243 : * locale. The root locale may be specified as the empty string, "und", or
244 : * "root". The default collator is opened by passing NULL to ucol_open().
245 : */
246 68150 : if (loc_str == NULL)
247 0 : elog(ERROR, "opening default collator is not supported");
248 :
249 : /*
250 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
251 : * the root locale. If the first component of the locale is "und", replace
252 : * with "root" before opening.
253 : */
254 : if (U_ICU_VERSION_MAJOR_NUM < 55)
255 : {
256 : char lang[ULOC_LANG_CAPACITY];
257 :
258 : status = U_ZERO_ERROR;
259 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
260 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
261 : {
262 : ereport(ERROR,
263 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
264 : errmsg("could not get language from locale \"%s\": %s",
265 : loc_str, u_errorName(status))));
266 : }
267 :
268 : if (strcmp(lang, "und") == 0)
269 : {
270 : const char *remainder = loc_str + strlen("und");
271 :
272 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
273 : strcpy(fixed_str, "root");
274 : strcat(fixed_str, remainder);
275 :
276 : loc_str = fixed_str;
277 : }
278 : }
279 :
280 68150 : status = U_ZERO_ERROR;
281 68150 : collator = ucol_open(loc_str, &status);
282 68150 : if (U_FAILURE(status))
283 12 : ereport(ERROR,
284 : /* use original string for error report */
285 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
286 : errmsg("could not open collator for locale \"%s\": %s",
287 : orig_str, u_errorName(status))));
288 :
289 : if (U_ICU_VERSION_MAJOR_NUM < 54)
290 : {
291 : status = U_ZERO_ERROR;
292 : icu_set_collation_attributes(collator, loc_str, &status);
293 :
294 : /*
295 : * Pretend the error came from ucol_open(), for consistent error
296 : * message across ICU versions.
297 : */
298 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
299 : {
300 : ucol_close(collator);
301 : ereport(ERROR,
302 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
303 : errmsg("could not open collator for locale \"%s\": %s",
304 : orig_str, u_errorName(status))));
305 : }
306 : }
307 :
308 68138 : if (fixed_str != NULL)
309 0 : pfree(fixed_str);
310 :
311 68138 : return collator;
312 : }
313 :
314 : /*
315 : * Create a UCollator with the given locale string and rules.
316 : *
317 : * Ensure that no path leaks a UCollator.
318 : */
319 : static UCollator *
320 210 : make_icu_collator(const char *iculocstr, const char *icurules)
321 : {
322 210 : if (!icurules)
323 : {
324 : /* simple case without rules */
325 198 : return pg_ucol_open(iculocstr);
326 : }
327 : else
328 : {
329 : UCollator *collator_std_rules;
330 : UCollator *collator_all_rules;
331 : const UChar *std_rules;
332 : UChar *my_rules;
333 : UChar *all_rules;
334 : int32_t length;
335 : int32_t total;
336 : UErrorCode status;
337 :
338 : /*
339 : * If rules are specified, we extract the rules of the standard
340 : * collation, add our own rules, and make a new collator with the
341 : * combined rules.
342 : */
343 12 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
344 :
345 12 : collator_std_rules = pg_ucol_open(iculocstr);
346 :
347 12 : std_rules = ucol_getRules(collator_std_rules, &length);
348 :
349 12 : total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
350 :
351 : /* avoid leaking collator on OOM */
352 12 : all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
353 12 : if (!all_rules)
354 : {
355 0 : ucol_close(collator_std_rules);
356 0 : ereport(ERROR,
357 : (errcode(ERRCODE_OUT_OF_MEMORY),
358 : errmsg("out of memory")));
359 : }
360 :
361 12 : u_strcpy(all_rules, std_rules);
362 12 : u_strcat(all_rules, my_rules);
363 :
364 12 : ucol_close(collator_std_rules);
365 :
366 12 : status = U_ZERO_ERROR;
367 12 : collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
368 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
369 : NULL, &status);
370 12 : if (U_FAILURE(status))
371 : {
372 6 : ereport(ERROR,
373 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
374 : errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
375 : iculocstr, icurules, u_errorName(status))));
376 : }
377 :
378 6 : return collator_all_rules;
379 : }
380 : }
381 :
382 : size_t
383 516 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
384 : pg_locale_t locale)
385 : {
386 : int32_t len_uchar;
387 : int32_t len_conv;
388 : UChar *buff_uchar;
389 : UChar *buff_conv;
390 : size_t result_len;
391 :
392 516 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
393 516 : len_conv = icu_convert_case(u_strToLower, locale,
394 : &buff_conv, buff_uchar, len_uchar);
395 516 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
396 516 : pfree(buff_uchar);
397 516 : pfree(buff_conv);
398 :
399 516 : return result_len;
400 : }
401 :
402 : size_t
403 30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
404 : pg_locale_t locale)
405 : {
406 : int32_t len_uchar;
407 : int32_t len_conv;
408 : UChar *buff_uchar;
409 : UChar *buff_conv;
410 : size_t result_len;
411 :
412 30 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
413 30 : len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
414 : &buff_conv, buff_uchar, len_uchar);
415 30 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
416 30 : pfree(buff_uchar);
417 30 : pfree(buff_conv);
418 :
419 30 : return result_len;
420 : }
421 :
422 : size_t
423 54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
424 : pg_locale_t locale)
425 : {
426 : int32_t len_uchar;
427 : int32_t len_conv;
428 : UChar *buff_uchar;
429 : UChar *buff_conv;
430 : size_t result_len;
431 :
432 54 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
433 54 : len_conv = icu_convert_case(u_strToUpper, locale,
434 : &buff_conv, buff_uchar, len_uchar);
435 54 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
436 54 : pfree(buff_uchar);
437 54 : pfree(buff_conv);
438 :
439 54 : return result_len;
440 : }
441 :
/*
 * strncoll_icu_utf8
 *
 * Compare two strings with ucol_strcollUTF8(), which works directly on
 * UTF-8 input.  Only usable when the database encoding is UTF-8 and the ICU
 * version provides a working ucol_strcollUTF8().  An argument length of -1
 * means the string is NUL-terminated.
 *
 * Returns the comparison result reported by ICU; raises an error if ICU
 * reports a failure.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
static int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
				  pg_locale_t locale)
{
	int			result;
	UErrorCode	status = U_ZERO_ERROR;

	Assert(locale->provider == COLLPROVIDER_ICU);

	Assert(GetDatabaseEncoding() == PG_UTF8);

	result = ucol_strcollUTF8(locale->info.icu.ucol,
							  arg1, len1,
							  arg2, len2,
							  &status);
	if (U_FAILURE(status))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(status))));

	return result;
}
#endif
473 :
474 : /* 'srclen' of -1 means the strings are NUL-terminated */
475 : size_t
476 10020 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
477 : pg_locale_t locale)
478 : {
479 : char sbuf[TEXTBUFLEN];
480 10020 : char *buf = sbuf;
481 : UChar *uchar;
482 : int32_t ulen;
483 : size_t uchar_bsize;
484 : Size result_bsize;
485 :
486 : Assert(locale->provider == COLLPROVIDER_ICU);
487 :
488 10020 : init_icu_converter();
489 :
490 10020 : ulen = uchar_length(icu_converter, src, srclen);
491 :
492 10020 : uchar_bsize = (ulen + 1) * sizeof(UChar);
493 :
494 10020 : if (uchar_bsize > TEXTBUFLEN)
495 0 : buf = palloc(uchar_bsize);
496 :
497 10020 : uchar = (UChar *) buf;
498 :
499 10020 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
500 :
501 10020 : result_bsize = ucol_getSortKey(locale->info.icu.ucol,
502 : uchar, ulen,
503 : (uint8_t *) dest, destsize);
504 :
505 : /*
506 : * ucol_getSortKey() counts the nul-terminator in the result length, but
507 : * this function should not.
508 : */
509 : Assert(result_bsize > 0);
510 10020 : result_bsize--;
511 :
512 10020 : if (buf != sbuf)
513 0 : pfree(buf);
514 :
515 : /* if dest is defined, it should be nul-terminated */
516 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
517 :
518 10020 : return result_bsize;
519 : }
520 :
521 : /* 'srclen' of -1 means the strings are NUL-terminated */
522 : size_t
523 1656 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
524 : const char *src, ssize_t srclen,
525 : pg_locale_t locale)
526 : {
527 : size_t result;
528 : UCharIterator iter;
529 : uint32_t state[2];
530 : UErrorCode status;
531 :
532 : Assert(locale->provider == COLLPROVIDER_ICU);
533 :
534 : Assert(GetDatabaseEncoding() == PG_UTF8);
535 :
536 1656 : uiter_setUTF8(&iter, src, srclen);
537 1656 : state[0] = state[1] = 0; /* won't need that again */
538 1656 : status = U_ZERO_ERROR;
539 1656 : result = ucol_nextSortKeyPart(locale->info.icu.ucol,
540 : &iter,
541 : state,
542 : (uint8_t *) dest,
543 : destsize,
544 : &status);
545 1656 : if (U_FAILURE(status))
546 0 : ereport(ERROR,
547 : (errmsg("sort key generation failed: %s",
548 : u_errorName(status))));
549 :
550 1656 : return result;
551 : }
552 :
553 : char *
554 67792 : get_collation_actual_version_icu(const char *collcollate)
555 : {
556 : UCollator *collator;
557 : UVersionInfo versioninfo;
558 : char buf[U_MAX_VERSION_STRING_LENGTH];
559 :
560 67792 : collator = pg_ucol_open(collcollate);
561 :
562 67792 : ucol_getVersion(collator, versioninfo);
563 67792 : ucol_close(collator);
564 :
565 67792 : u_versionToString(versioninfo, buf);
566 67792 : return pstrdup(buf);
567 : }
568 :
569 : /*
570 : * Convert a string in the database encoding into a string of UChars.
571 : *
572 : * The source string at buff is of length nbytes
573 : * (it needn't be nul-terminated)
574 : *
575 : * *buff_uchar receives a pointer to the palloc'd result string, and
576 : * the function's result is the number of UChars generated.
577 : *
578 : * The result string is nul-terminated, though most callers rely on the
579 : * result length instead.
580 : */
581 : static int32_t
582 612 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
583 : {
584 : int32_t len_uchar;
585 :
586 612 : init_icu_converter();
587 :
588 612 : len_uchar = uchar_length(icu_converter, buff, nbytes);
589 :
590 612 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
591 612 : len_uchar = uchar_convert(icu_converter,
592 : *buff_uchar, len_uchar + 1, buff, nbytes);
593 :
594 612 : return len_uchar;
595 : }
596 :
597 : /*
598 : * Convert a string of UChars into the database encoding.
599 : *
600 : * The source string at buff_uchar is of length len_uchar
601 : * (it needn't be nul-terminated)
602 : *
603 : * *result receives a pointer to the palloc'd result string, and the
604 : * function's result is the number of bytes generated (not counting nul).
605 : *
606 : * The result string is nul-terminated.
607 : */
608 : static size_t
609 600 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
610 : {
611 : UErrorCode status;
612 : int32_t len_result;
613 :
614 600 : init_icu_converter();
615 :
616 600 : status = U_ZERO_ERROR;
617 600 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
618 : buff_uchar, len_uchar, &status);
619 600 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
620 0 : ereport(ERROR,
621 : (errmsg("%s failed: %s", "ucnv_fromUChars",
622 : u_errorName(status))));
623 :
624 600 : if (len_result + 1 > destsize)
625 60 : return len_result;
626 :
627 540 : status = U_ZERO_ERROR;
628 540 : len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
629 : buff_uchar, len_uchar, &status);
630 540 : if (U_FAILURE(status) ||
631 540 : status == U_STRING_NOT_TERMINATED_WARNING)
632 0 : ereport(ERROR,
633 : (errmsg("%s failed: %s", "ucnv_fromUChars",
634 : u_errorName(status))));
635 :
636 540 : return len_result;
637 : }
638 :
639 : static int32_t
640 600 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
641 : UChar **buff_dest, UChar *buff_source, int32_t len_source)
642 : {
643 : UErrorCode status;
644 : int32_t len_dest;
645 :
646 600 : len_dest = len_source; /* try first with same length */
647 600 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
648 600 : status = U_ZERO_ERROR;
649 600 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
650 : mylocale->info.icu.locale, &status);
651 600 : if (status == U_BUFFER_OVERFLOW_ERROR)
652 : {
653 : /* try again with adjusted length */
654 0 : pfree(*buff_dest);
655 0 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
656 0 : status = U_ZERO_ERROR;
657 0 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
658 : mylocale->info.icu.locale, &status);
659 : }
660 600 : if (U_FAILURE(status))
661 0 : ereport(ERROR,
662 : (errmsg("case conversion failed: %s", u_errorName(status))));
663 600 : return len_dest;
664 : }
665 :
666 : static int32_t
667 30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
668 : const UChar *src, int32_t srcLength,
669 : const char *locale,
670 : UErrorCode *pErrorCode)
671 : {
672 30 : return u_strToTitle(dest, destCapacity, src, srcLength,
673 : NULL, locale, pErrorCode);
674 : }
675 :
676 : /*
677 : * strncoll_icu
678 : *
679 : * Convert the arguments from the database encoding to UChar strings, then
680 : * call ucol_strcoll(). An argument length of -1 means that the string is
681 : * NUL-terminated.
682 : *
683 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
684 : * caller should call that instead.
685 : */
686 : static int
687 0 : strncoll_icu(const char *arg1, ssize_t len1,
688 : const char *arg2, ssize_t len2, pg_locale_t locale)
689 : {
690 : char sbuf[TEXTBUFLEN];
691 0 : char *buf = sbuf;
692 : int32_t ulen1;
693 : int32_t ulen2;
694 : size_t bufsize1;
695 : size_t bufsize2;
696 : UChar *uchar1,
697 : *uchar2;
698 : int result;
699 :
700 : Assert(locale->provider == COLLPROVIDER_ICU);
701 :
702 : /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
703 : #ifdef HAVE_UCOL_STRCOLLUTF8
704 : Assert(GetDatabaseEncoding() != PG_UTF8);
705 : #endif
706 :
707 0 : init_icu_converter();
708 :
709 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
710 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
711 :
712 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
713 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
714 :
715 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
716 0 : buf = palloc(bufsize1 + bufsize2);
717 :
718 0 : uchar1 = (UChar *) buf;
719 0 : uchar2 = (UChar *) (buf + bufsize1);
720 :
721 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
722 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
723 :
724 0 : result = ucol_strcoll(locale->info.icu.ucol,
725 : uchar1, ulen1,
726 : uchar2, ulen2);
727 :
728 0 : if (buf != sbuf)
729 0 : pfree(buf);
730 :
731 0 : return result;
732 : }
733 :
734 : /* 'srclen' of -1 means the strings are NUL-terminated */
735 : static size_t
736 0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
737 : const char *src, ssize_t srclen,
738 : pg_locale_t locale)
739 : {
740 : char sbuf[TEXTBUFLEN];
741 0 : char *buf = sbuf;
742 : UCharIterator iter;
743 : uint32_t state[2];
744 : UErrorCode status;
745 0 : int32_t ulen = -1;
746 0 : UChar *uchar = NULL;
747 : size_t uchar_bsize;
748 : Size result_bsize;
749 :
750 : Assert(locale->provider == COLLPROVIDER_ICU);
751 :
752 : /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
753 : Assert(GetDatabaseEncoding() != PG_UTF8);
754 :
755 0 : init_icu_converter();
756 :
757 0 : ulen = uchar_length(icu_converter, src, srclen);
758 :
759 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
760 :
761 0 : if (uchar_bsize > TEXTBUFLEN)
762 0 : buf = palloc(uchar_bsize);
763 :
764 0 : uchar = (UChar *) buf;
765 :
766 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
767 :
768 0 : uiter_setString(&iter, uchar, ulen);
769 0 : state[0] = state[1] = 0; /* won't need that again */
770 0 : status = U_ZERO_ERROR;
771 0 : result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
772 : &iter,
773 : state,
774 : (uint8_t *) dest,
775 : destsize,
776 : &status);
777 0 : if (U_FAILURE(status))
778 0 : ereport(ERROR,
779 : (errmsg("sort key generation failed: %s",
780 : u_errorName(status))));
781 :
782 0 : return result_bsize;
783 : }
784 :
785 : static void
786 11232 : init_icu_converter(void)
787 : {
788 : const char *icu_encoding_name;
789 : UErrorCode status;
790 : UConverter *conv;
791 :
792 11232 : if (icu_converter)
793 11226 : return; /* already done */
794 :
795 6 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
796 6 : if (!icu_encoding_name)
797 0 : ereport(ERROR,
798 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
799 : errmsg("encoding \"%s\" not supported by ICU",
800 : pg_encoding_to_char(GetDatabaseEncoding()))));
801 :
802 6 : status = U_ZERO_ERROR;
803 6 : conv = ucnv_open(icu_encoding_name, &status);
804 6 : if (U_FAILURE(status))
805 0 : ereport(ERROR,
806 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
807 : icu_encoding_name, u_errorName(status))));
808 :
809 6 : icu_converter = conv;
810 : }
811 :
812 : /*
813 : * Find length, in UChars, of given string if converted to UChar string.
814 : *
815 : * A length of -1 indicates that the input string is NUL-terminated.
816 : */
817 : static size_t
818 10632 : uchar_length(UConverter *converter, const char *str, int32_t len)
819 : {
820 10632 : UErrorCode status = U_ZERO_ERROR;
821 : int32_t ulen;
822 :
823 10632 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
824 10632 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
825 0 : ereport(ERROR,
826 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
827 10632 : return ulen;
828 : }
829 :
830 : /*
831 : * Convert the given source string into a UChar string, stored in dest, and
832 : * return the length (in UChars).
833 : *
834 : * A srclen of -1 indicates that the input string is NUL-terminated.
835 : */
836 : static int32_t
837 10632 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
838 : const char *src, int32_t srclen)
839 : {
840 10632 : UErrorCode status = U_ZERO_ERROR;
841 : int32_t ulen;
842 :
843 10632 : status = U_ZERO_ERROR;
844 10632 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
845 10632 : if (U_FAILURE(status))
846 0 : ereport(ERROR,
847 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
848 10632 : return ulen;
849 : }
850 :
851 : /*
852 : * Parse collation attributes from the given locale string and apply them to
853 : * the open collator.
854 : *
855 : * First, the locale string is canonicalized to an ICU format locale ID such
856 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
857 : * the key-value arguments.
858 : *
859 : * Starting with ICU version 54, the attributes are processed automatically by
860 : * ucol_open(), so this is only necessary for emulating this behavior on older
861 : * versions.
862 : */
863 : pg_attribute_unused()
864 : static void
865 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
866 : UErrorCode *status)
867 : {
868 : int32_t len;
869 : char *icu_locale_id;
870 : char *lower_str;
871 : char *str;
872 : char *token;
873 :
874 : /*
875 : * The input locale may be a BCP 47 language tag, e.g.
876 : * "und-u-kc-ks-level1", which expresses the same attributes in a
877 : * different form. It will be converted to the equivalent ICU format
878 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
879 : * uloc_canonicalize().
880 : */
881 0 : *status = U_ZERO_ERROR;
882 0 : len = uloc_canonicalize(loc, NULL, 0, status);
883 0 : icu_locale_id = palloc(len + 1);
884 0 : *status = U_ZERO_ERROR;
885 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
886 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
887 0 : return;
888 :
889 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
890 :
891 0 : pfree(icu_locale_id);
892 :
893 0 : str = strchr(lower_str, '@');
894 0 : if (!str)
895 0 : return;
896 0 : str++;
897 :
898 0 : while ((token = strsep(&str, ";")))
899 : {
900 0 : char *e = strchr(token, '=');
901 :
902 0 : if (e)
903 : {
904 : char *name;
905 : char *value;
906 : UColAttribute uattr;
907 : UColAttributeValue uvalue;
908 :
909 0 : *status = U_ZERO_ERROR;
910 :
911 0 : *e = '\0';
912 0 : name = token;
913 0 : value = e + 1;
914 :
915 : /*
916 : * See attribute name and value lists in ICU i18n/coll.cpp
917 : */
918 0 : if (strcmp(name, "colstrength") == 0)
919 0 : uattr = UCOL_STRENGTH;
920 0 : else if (strcmp(name, "colbackwards") == 0)
921 0 : uattr = UCOL_FRENCH_COLLATION;
922 0 : else if (strcmp(name, "colcaselevel") == 0)
923 0 : uattr = UCOL_CASE_LEVEL;
924 0 : else if (strcmp(name, "colcasefirst") == 0)
925 0 : uattr = UCOL_CASE_FIRST;
926 0 : else if (strcmp(name, "colalternate") == 0)
927 0 : uattr = UCOL_ALTERNATE_HANDLING;
928 0 : else if (strcmp(name, "colnormalization") == 0)
929 0 : uattr = UCOL_NORMALIZATION_MODE;
930 0 : else if (strcmp(name, "colnumeric") == 0)
931 0 : uattr = UCOL_NUMERIC_COLLATION;
932 : else
933 : /* ignore if unknown */
934 0 : continue;
935 :
936 0 : if (strcmp(value, "primary") == 0)
937 0 : uvalue = UCOL_PRIMARY;
938 0 : else if (strcmp(value, "secondary") == 0)
939 0 : uvalue = UCOL_SECONDARY;
940 0 : else if (strcmp(value, "tertiary") == 0)
941 0 : uvalue = UCOL_TERTIARY;
942 0 : else if (strcmp(value, "quaternary") == 0)
943 0 : uvalue = UCOL_QUATERNARY;
944 0 : else if (strcmp(value, "identical") == 0)
945 0 : uvalue = UCOL_IDENTICAL;
946 0 : else if (strcmp(value, "no") == 0)
947 0 : uvalue = UCOL_OFF;
948 0 : else if (strcmp(value, "yes") == 0)
949 0 : uvalue = UCOL_ON;
950 0 : else if (strcmp(value, "shifted") == 0)
951 0 : uvalue = UCOL_SHIFTED;
952 0 : else if (strcmp(value, "non-ignorable") == 0)
953 0 : uvalue = UCOL_NON_IGNORABLE;
954 0 : else if (strcmp(value, "lower") == 0)
955 0 : uvalue = UCOL_LOWER_FIRST;
956 0 : else if (strcmp(value, "upper") == 0)
957 0 : uvalue = UCOL_UPPER_FIRST;
958 : else
959 : {
960 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
961 0 : break;
962 : }
963 :
964 0 : ucol_setAttribute(collator, uattr, uvalue, status);
965 : }
966 : }
967 :
968 0 : pfree(lower_str);
969 : }
970 :
971 : #endif /* USE_ICU */
|