Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for ICU
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_icu.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #ifdef USE_ICU
15 : #include <unicode/ucnv.h>
16 : #include <unicode/ustring.h>
17 :
18 : /*
19 : * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 : * (see
21 : * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 : */
23 : #if U_ICU_VERSION_MAJOR_NUM >= 53
24 : #define HAVE_UCOL_STRCOLLUTF8 1
25 : #else
26 : #undef HAVE_UCOL_STRCOLLUTF8
27 : #endif
28 :
29 : #endif
30 :
31 : #include "access/htup_details.h"
32 : #include "catalog/pg_database.h"
33 : #include "catalog/pg_collation.h"
34 : #include "mb/pg_wchar.h"
35 : #include "miscadmin.h"
36 : #include "utils/builtins.h"
37 : #include "utils/formatting.h"
38 : #include "utils/memutils.h"
39 : #include "utils/pg_locale.h"
40 : #include "utils/syscache.h"
41 :
42 : /*
43 : * Size of stack buffer to use for string transformations, used to avoid heap
44 : * allocations in typical cases. This should be large enough that most strings
45 : * will fit, but small enough that we feel comfortable putting it on the
46 : * stack.
47 : */
48 : #define TEXTBUFLEN 1024
49 :
50 : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
51 :
52 : #ifdef USE_ICU
53 :
54 : extern UCollator *pg_ucol_open(const char *loc_str);
55 :
56 : static size_t strlower_icu(char *dest, size_t destsize, const char *src,
57 : ssize_t srclen, pg_locale_t locale);
58 : static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
59 : ssize_t srclen, pg_locale_t locale);
60 : static size_t strupper_icu(char *dest, size_t destsize, const char *src,
61 : ssize_t srclen, pg_locale_t locale);
62 : static size_t strfold_icu(char *dest, size_t destsize, const char *src,
63 : ssize_t srclen, pg_locale_t locale);
64 : static int strncoll_icu(const char *arg1, ssize_t len1,
65 : const char *arg2, ssize_t len2,
66 : pg_locale_t locale);
67 : static size_t strnxfrm_icu(char *dest, size_t destsize,
68 : const char *src, ssize_t srclen,
69 : pg_locale_t locale);
70 : extern char *get_collation_actual_version_icu(const char *collcollate);
71 :
72 : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
73 : const UChar *src, int32_t srcLength,
74 : const char *locale,
75 : UErrorCode *pErrorCode);
76 :
77 : /*
78 : * Converter object for converting between ICU's UChar strings and C strings
79 : * in database encoding. Since the database encoding doesn't change, we only
80 : * need one of these per session.
81 : */
82 : static UConverter *icu_converter = NULL;
83 :
84 : static UCollator *make_icu_collator(const char *iculocstr,
85 : const char *icurules);
86 : static int strncoll_icu(const char *arg1, ssize_t len1,
87 : const char *arg2, ssize_t len2,
88 : pg_locale_t locale);
89 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
90 : const char *src, ssize_t srclen,
91 : pg_locale_t locale);
92 : #ifdef HAVE_UCOL_STRCOLLUTF8
93 : static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
94 : const char *arg2, ssize_t len2,
95 : pg_locale_t locale);
96 : #endif
97 : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
98 : const char *src, ssize_t srclen,
99 : pg_locale_t locale);
100 : static void init_icu_converter(void);
101 : static size_t uchar_length(UConverter *converter,
102 : const char *str, int32_t len);
103 : static int32_t uchar_convert(UConverter *converter,
104 : UChar *dest, int32_t destlen,
105 : const char *src, int32_t srclen);
106 : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
107 : size_t nbytes);
108 : static size_t icu_from_uchar(char *dest, size_t destsize,
109 : const UChar *buff_uchar, int32_t len_uchar);
110 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
111 : UErrorCode *status);
112 : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
113 : UChar **buff_dest, UChar *buff_source,
114 : int32_t len_source);
115 : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
116 : const UChar *src, int32_t srcLength,
117 : const char *locale,
118 : UErrorCode *pErrorCode);
119 : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
120 : const UChar *src, int32_t srcLength,
121 : const char *locale,
122 : UErrorCode *pErrorCode);
123 :
124 : static bool
125 126 : char_is_cased_icu(char ch, pg_locale_t locale)
126 : {
127 126 : return IS_HIGHBIT_SET(ch) ||
128 252 : (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
129 : }
130 :
131 : static pg_wchar
132 108 : toupper_icu(pg_wchar wc, pg_locale_t locale)
133 : {
134 108 : return u_toupper(wc);
135 : }
136 :
137 : static pg_wchar
138 108 : tolower_icu(pg_wchar wc, pg_locale_t locale)
139 : {
140 108 : return u_tolower(wc);
141 : }
142 :
143 : static const struct collate_methods collate_methods_icu = {
144 : .strncoll = strncoll_icu,
145 : .strnxfrm = strnxfrm_icu,
146 : .strnxfrm_prefix = strnxfrm_prefix_icu,
147 : .strxfrm_is_safe = true,
148 : };
149 :
150 : static const struct collate_methods collate_methods_icu_utf8 = {
151 : #ifdef HAVE_UCOL_STRCOLLUTF8
152 : .strncoll = strncoll_icu_utf8,
153 : #else
154 : .strncoll = strncoll_icu,
155 : #endif
156 : .strnxfrm = strnxfrm_icu,
157 : .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
158 : .strxfrm_is_safe = true,
159 : };
160 :
161 : static bool
162 12288 : wc_isdigit_icu(pg_wchar wc, pg_locale_t locale)
163 : {
164 12288 : return u_isdigit(wc);
165 : }
166 :
167 : static bool
168 12288 : wc_isalpha_icu(pg_wchar wc, pg_locale_t locale)
169 : {
170 12288 : return u_isalpha(wc);
171 : }
172 :
173 : static bool
174 12288 : wc_isalnum_icu(pg_wchar wc, pg_locale_t locale)
175 : {
176 12288 : return u_isalnum(wc);
177 : }
178 :
179 : static bool
180 12288 : wc_isupper_icu(pg_wchar wc, pg_locale_t locale)
181 : {
182 12288 : return u_isupper(wc);
183 : }
184 :
185 : static bool
186 12288 : wc_islower_icu(pg_wchar wc, pg_locale_t locale)
187 : {
188 12288 : return u_islower(wc);
189 : }
190 :
191 : static bool
192 12288 : wc_isgraph_icu(pg_wchar wc, pg_locale_t locale)
193 : {
194 12288 : return u_isgraph(wc);
195 : }
196 :
197 : static bool
198 12288 : wc_isprint_icu(pg_wchar wc, pg_locale_t locale)
199 : {
200 12288 : return u_isprint(wc);
201 : }
202 :
203 : static bool
204 12288 : wc_ispunct_icu(pg_wchar wc, pg_locale_t locale)
205 : {
206 12288 : return u_ispunct(wc);
207 : }
208 :
209 : static bool
210 12288 : wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
211 : {
212 12288 : return u_isspace(wc);
213 : }
214 :
215 : static const struct ctype_methods ctype_methods_icu = {
216 : .strlower = strlower_icu,
217 : .strtitle = strtitle_icu,
218 : .strupper = strupper_icu,
219 : .strfold = strfold_icu,
220 : .wc_isdigit = wc_isdigit_icu,
221 : .wc_isalpha = wc_isalpha_icu,
222 : .wc_isalnum = wc_isalnum_icu,
223 : .wc_isupper = wc_isupper_icu,
224 : .wc_islower = wc_islower_icu,
225 : .wc_isgraph = wc_isgraph_icu,
226 : .wc_isprint = wc_isprint_icu,
227 : .wc_ispunct = wc_ispunct_icu,
228 : .wc_isspace = wc_isspace_icu,
229 : .char_is_cased = char_is_cased_icu,
230 : .wc_toupper = toupper_icu,
231 : .wc_tolower = tolower_icu,
232 : };
233 : #endif
234 :
235 : pg_locale_t
236 212 : create_pg_locale_icu(Oid collid, MemoryContext context)
237 : {
238 : #ifdef USE_ICU
239 : bool deterministic;
240 : const char *iculocstr;
241 212 : const char *icurules = NULL;
242 : UCollator *collator;
243 : pg_locale_t result;
244 :
245 212 : if (collid == DEFAULT_COLLATION_OID)
246 : {
247 : HeapTuple tp;
248 : Datum datum;
249 : bool isnull;
250 :
251 26 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
252 26 : if (!HeapTupleIsValid(tp))
253 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
254 :
255 : /* default database collation is always deterministic */
256 26 : deterministic = true;
257 26 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
258 : Anum_pg_database_datlocale);
259 26 : iculocstr = TextDatumGetCString(datum);
260 26 : datum = SysCacheGetAttr(DATABASEOID, tp,
261 : Anum_pg_database_daticurules, &isnull);
262 26 : if (!isnull)
263 0 : icurules = TextDatumGetCString(datum);
264 :
265 26 : ReleaseSysCache(tp);
266 : }
267 : else
268 : {
269 : Form_pg_collation collform;
270 : HeapTuple tp;
271 : Datum datum;
272 : bool isnull;
273 :
274 186 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
275 186 : if (!HeapTupleIsValid(tp))
276 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
277 186 : collform = (Form_pg_collation) GETSTRUCT(tp);
278 186 : deterministic = collform->collisdeterministic;
279 186 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
280 : Anum_pg_collation_colllocale);
281 186 : iculocstr = TextDatumGetCString(datum);
282 186 : datum = SysCacheGetAttr(COLLOID, tp,
283 : Anum_pg_collation_collicurules, &isnull);
284 186 : if (!isnull)
285 12 : icurules = TextDatumGetCString(datum);
286 :
287 186 : ReleaseSysCache(tp);
288 : }
289 :
290 212 : collator = make_icu_collator(iculocstr, icurules);
291 :
292 202 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
293 202 : result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
294 202 : result->info.icu.ucol = collator;
295 202 : result->deterministic = deterministic;
296 202 : result->collate_is_c = false;
297 202 : result->ctype_is_c = false;
298 202 : if (GetDatabaseEncoding() == PG_UTF8)
299 202 : result->collate = &collate_methods_icu_utf8;
300 : else
301 0 : result->collate = &collate_methods_icu;
302 202 : result->ctype = &ctype_methods_icu;
303 :
304 202 : return result;
305 : #else
306 : /* could get here if a collation was created by a build with ICU */
307 : ereport(ERROR,
308 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
309 : errmsg("ICU is not supported in this build")));
310 :
311 : return NULL;
312 : #endif
313 : }
314 :
315 : #ifdef USE_ICU
316 :
317 : /*
318 : * Wrapper around ucol_open() to handle API differences for older ICU
319 : * versions.
320 : *
321 : * Ensure that no path leaks a UCollator.
322 : */
323 : UCollator *
324 78206 : pg_ucol_open(const char *loc_str)
325 : {
326 : UCollator *collator;
327 : UErrorCode status;
328 78206 : const char *orig_str = loc_str;
329 78206 : char *fixed_str = NULL;
330 :
331 : /*
332 : * Must never open default collator, because it depends on the environment
333 : * and may change at any time. Should not happen, but check here to catch
334 : * bugs that might be hard to catch otherwise.
335 : *
336 : * NB: the default collator is not the same as the collator for the root
337 : * locale. The root locale may be specified as the empty string, "und", or
338 : * "root". The default collator is opened by passing NULL to ucol_open().
339 : */
340 78206 : if (loc_str == NULL)
341 0 : elog(ERROR, "opening default collator is not supported");
342 :
343 : /*
344 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
345 : * the root locale. If the first component of the locale is "und", replace
346 : * with "root" before opening.
347 : */
348 : if (U_ICU_VERSION_MAJOR_NUM < 55)
349 : {
350 : char lang[ULOC_LANG_CAPACITY];
351 :
352 : status = U_ZERO_ERROR;
353 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
354 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
355 : {
356 : ereport(ERROR,
357 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
358 : errmsg("could not get language from locale \"%s\": %s",
359 : loc_str, u_errorName(status))));
360 : }
361 :
362 : if (strcmp(lang, "und") == 0)
363 : {
364 : const char *remainder = loc_str + strlen("und");
365 :
366 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
367 : strcpy(fixed_str, "root");
368 : strcat(fixed_str, remainder);
369 :
370 : loc_str = fixed_str;
371 : }
372 : }
373 :
374 78206 : status = U_ZERO_ERROR;
375 78206 : collator = ucol_open(loc_str, &status);
376 78206 : if (U_FAILURE(status))
377 12 : ereport(ERROR,
378 : /* use original string for error report */
379 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
380 : errmsg("could not open collator for locale \"%s\": %s",
381 : orig_str, u_errorName(status))));
382 :
383 : if (U_ICU_VERSION_MAJOR_NUM < 54)
384 : {
385 : status = U_ZERO_ERROR;
386 : icu_set_collation_attributes(collator, loc_str, &status);
387 :
388 : /*
389 : * Pretend the error came from ucol_open(), for consistent error
390 : * message across ICU versions.
391 : */
392 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
393 : {
394 : ucol_close(collator);
395 : ereport(ERROR,
396 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
397 : errmsg("could not open collator for locale \"%s\": %s",
398 : orig_str, u_errorName(status))));
399 : }
400 : }
401 :
402 78194 : if (fixed_str != NULL)
403 0 : pfree(fixed_str);
404 :
405 78194 : return collator;
406 : }
407 :
408 : /*
409 : * Create a UCollator with the given locale string and rules.
410 : *
411 : * Ensure that no path leaks a UCollator.
412 : */
413 : static UCollator *
414 212 : make_icu_collator(const char *iculocstr, const char *icurules)
415 : {
416 212 : if (!icurules)
417 : {
418 : /* simple case without rules */
419 200 : return pg_ucol_open(iculocstr);
420 : }
421 : else
422 : {
423 : UCollator *collator_std_rules;
424 : UCollator *collator_all_rules;
425 : const UChar *std_rules;
426 : UChar *my_rules;
427 : UChar *all_rules;
428 : int32_t length;
429 : int32_t total;
430 : UErrorCode status;
431 :
432 : /*
433 : * If rules are specified, we extract the rules of the standard
434 : * collation, add our own rules, and make a new collator with the
435 : * combined rules.
436 : */
437 12 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
438 :
439 12 : collator_std_rules = pg_ucol_open(iculocstr);
440 :
441 12 : std_rules = ucol_getRules(collator_std_rules, &length);
442 :
443 12 : total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
444 :
445 : /* avoid leaking collator on OOM */
446 12 : all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
447 12 : if (!all_rules)
448 : {
449 0 : ucol_close(collator_std_rules);
450 0 : ereport(ERROR,
451 : (errcode(ERRCODE_OUT_OF_MEMORY),
452 : errmsg("out of memory")));
453 : }
454 :
455 12 : u_strcpy(all_rules, std_rules);
456 12 : u_strcat(all_rules, my_rules);
457 :
458 12 : ucol_close(collator_std_rules);
459 :
460 12 : status = U_ZERO_ERROR;
461 12 : collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
462 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
463 : NULL, &status);
464 12 : if (U_FAILURE(status))
465 : {
466 6 : ereport(ERROR,
467 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
468 : errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
469 : iculocstr, icurules, u_errorName(status))));
470 : }
471 :
472 6 : return collator_all_rules;
473 : }
474 : }
475 :
476 : static size_t
477 528 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
478 : pg_locale_t locale)
479 : {
480 : int32_t len_uchar;
481 : int32_t len_conv;
482 : UChar *buff_uchar;
483 : UChar *buff_conv;
484 : size_t result_len;
485 :
486 528 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
487 528 : len_conv = icu_convert_case(u_strToLower, locale,
488 : &buff_conv, buff_uchar, len_uchar);
489 528 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
490 528 : pfree(buff_uchar);
491 528 : pfree(buff_conv);
492 :
493 528 : return result_len;
494 : }
495 :
496 : static size_t
497 30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
498 : pg_locale_t locale)
499 : {
500 : int32_t len_uchar;
501 : int32_t len_conv;
502 : UChar *buff_uchar;
503 : UChar *buff_conv;
504 : size_t result_len;
505 :
506 30 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
507 30 : len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
508 : &buff_conv, buff_uchar, len_uchar);
509 30 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
510 30 : pfree(buff_uchar);
511 30 : pfree(buff_conv);
512 :
513 30 : return result_len;
514 : }
515 :
516 : static size_t
517 54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
518 : pg_locale_t locale)
519 : {
520 : int32_t len_uchar;
521 : int32_t len_conv;
522 : UChar *buff_uchar;
523 : UChar *buff_conv;
524 : size_t result_len;
525 :
526 54 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
527 54 : len_conv = icu_convert_case(u_strToUpper, locale,
528 : &buff_conv, buff_uchar, len_uchar);
529 54 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
530 54 : pfree(buff_uchar);
531 54 : pfree(buff_conv);
532 :
533 54 : return result_len;
534 : }
535 :
536 : static size_t
537 12 : strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
538 : pg_locale_t locale)
539 : {
540 : int32_t len_uchar;
541 : int32_t len_conv;
542 : UChar *buff_uchar;
543 : UChar *buff_conv;
544 : size_t result_len;
545 :
546 12 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
547 12 : len_conv = icu_convert_case(u_strFoldCase_default, locale,
548 : &buff_conv, buff_uchar, len_uchar);
549 12 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
550 12 : pfree(buff_uchar);
551 12 : pfree(buff_conv);
552 :
553 12 : return result_len;
554 : }
555 :
/*
 * strncoll_icu_utf8
 *
 * Compare two UTF-8 strings directly with ucol_strcollUTF8(), without an
 * intermediate UChar conversion.  Only valid when the database encoding is
 * UTF-8.  An argument length of -1 means the string is NUL-terminated.
 *
 * Raises an ERROR if ICU reports a failure.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
				  pg_locale_t locale)
{
	UErrorCode	status = U_ZERO_ERROR;
	int			cmp;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	cmp = ucol_strcollUTF8(locale->info.icu.ucol,
						   arg1, len1,
						   arg2, len2,
						   &status);
	if (U_FAILURE(status))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(status))));

	return cmp;
}
#endif
585 :
586 : /* 'srclen' of -1 means the strings are NUL-terminated */
587 : size_t
588 10020 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
589 : pg_locale_t locale)
590 : {
591 : char sbuf[TEXTBUFLEN];
592 10020 : char *buf = sbuf;
593 : UChar *uchar;
594 : int32_t ulen;
595 : size_t uchar_bsize;
596 : Size result_bsize;
597 :
598 10020 : init_icu_converter();
599 :
600 10020 : ulen = uchar_length(icu_converter, src, srclen);
601 :
602 10020 : uchar_bsize = (ulen + 1) * sizeof(UChar);
603 :
604 10020 : if (uchar_bsize > TEXTBUFLEN)
605 0 : buf = palloc(uchar_bsize);
606 :
607 10020 : uchar = (UChar *) buf;
608 :
609 10020 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
610 :
611 10020 : result_bsize = ucol_getSortKey(locale->info.icu.ucol,
612 : uchar, ulen,
613 : (uint8_t *) dest, destsize);
614 :
615 : /*
616 : * ucol_getSortKey() counts the nul-terminator in the result length, but
617 : * this function should not.
618 : */
619 : Assert(result_bsize > 0);
620 10020 : result_bsize--;
621 :
622 10020 : if (buf != sbuf)
623 0 : pfree(buf);
624 :
625 : /* if dest is defined, it should be nul-terminated */
626 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
627 :
628 10020 : return result_bsize;
629 : }
630 :
631 : /* 'srclen' of -1 means the strings are NUL-terminated */
632 : size_t
633 1668 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
634 : const char *src, ssize_t srclen,
635 : pg_locale_t locale)
636 : {
637 : size_t result;
638 : UCharIterator iter;
639 : uint32_t state[2];
640 : UErrorCode status;
641 :
642 : Assert(GetDatabaseEncoding() == PG_UTF8);
643 :
644 1668 : uiter_setUTF8(&iter, src, srclen);
645 1668 : state[0] = state[1] = 0; /* won't need that again */
646 1668 : status = U_ZERO_ERROR;
647 1668 : result = ucol_nextSortKeyPart(locale->info.icu.ucol,
648 : &iter,
649 : state,
650 : (uint8_t *) dest,
651 : destsize,
652 : &status);
653 1668 : if (U_FAILURE(status))
654 0 : ereport(ERROR,
655 : (errmsg("sort key generation failed: %s",
656 : u_errorName(status))));
657 :
658 1668 : return result;
659 : }
660 :
661 : char *
662 77844 : get_collation_actual_version_icu(const char *collcollate)
663 : {
664 : UCollator *collator;
665 : UVersionInfo versioninfo;
666 : char buf[U_MAX_VERSION_STRING_LENGTH];
667 :
668 77844 : collator = pg_ucol_open(collcollate);
669 :
670 77844 : ucol_getVersion(collator, versioninfo);
671 77844 : ucol_close(collator);
672 :
673 77844 : u_versionToString(versioninfo, buf);
674 77844 : return pstrdup(buf);
675 : }
676 :
677 : /*
678 : * Convert a string in the database encoding into a string of UChars.
679 : *
680 : * The source string at buff is of length nbytes
681 : * (it needn't be nul-terminated)
682 : *
683 : * *buff_uchar receives a pointer to the palloc'd result string, and
684 : * the function's result is the number of UChars generated.
685 : *
686 : * The result string is nul-terminated, though most callers rely on the
687 : * result length instead.
688 : */
689 : static int32_t
690 636 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
691 : {
692 : int32_t len_uchar;
693 :
694 636 : init_icu_converter();
695 :
696 636 : len_uchar = uchar_length(icu_converter, buff, nbytes);
697 :
698 636 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
699 636 : len_uchar = uchar_convert(icu_converter,
700 : *buff_uchar, len_uchar + 1, buff, nbytes);
701 :
702 636 : return len_uchar;
703 : }
704 :
705 : /*
706 : * Convert a string of UChars into the database encoding.
707 : *
708 : * The source string at buff_uchar is of length len_uchar
709 : * (it needn't be nul-terminated)
710 : *
711 : * *result receives a pointer to the palloc'd result string, and the
712 : * function's result is the number of bytes generated (not counting nul).
713 : *
714 : * The result string is nul-terminated.
715 : */
716 : static size_t
717 624 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
718 : {
719 : UErrorCode status;
720 : int32_t len_result;
721 :
722 624 : init_icu_converter();
723 :
724 624 : status = U_ZERO_ERROR;
725 624 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
726 : buff_uchar, len_uchar, &status);
727 624 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
728 0 : ereport(ERROR,
729 : (errmsg("%s failed: %s", "ucnv_fromUChars",
730 : u_errorName(status))));
731 :
732 624 : if (len_result + 1 > destsize)
733 60 : return len_result;
734 :
735 564 : status = U_ZERO_ERROR;
736 564 : len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
737 : buff_uchar, len_uchar, &status);
738 564 : if (U_FAILURE(status) ||
739 564 : status == U_STRING_NOT_TERMINATED_WARNING)
740 0 : ereport(ERROR,
741 : (errmsg("%s failed: %s", "ucnv_fromUChars",
742 : u_errorName(status))));
743 :
744 564 : return len_result;
745 : }
746 :
747 : static int32_t
748 624 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
749 : UChar **buff_dest, UChar *buff_source, int32_t len_source)
750 : {
751 : UErrorCode status;
752 : int32_t len_dest;
753 :
754 624 : len_dest = len_source; /* try first with same length */
755 624 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
756 624 : status = U_ZERO_ERROR;
757 624 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
758 : mylocale->info.icu.locale, &status);
759 624 : if (status == U_BUFFER_OVERFLOW_ERROR)
760 : {
761 : /* try again with adjusted length */
762 18 : pfree(*buff_dest);
763 18 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
764 18 : status = U_ZERO_ERROR;
765 18 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
766 : mylocale->info.icu.locale, &status);
767 : }
768 624 : if (U_FAILURE(status))
769 0 : ereport(ERROR,
770 : (errmsg("case conversion failed: %s", u_errorName(status))));
771 624 : return len_dest;
772 : }
773 :
774 : static int32_t
775 30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
776 : const UChar *src, int32_t srcLength,
777 : const char *locale,
778 : UErrorCode *pErrorCode)
779 : {
780 30 : return u_strToTitle(dest, destCapacity, src, srcLength,
781 : NULL, locale, pErrorCode);
782 : }
783 :
784 : static int32_t
785 24 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
786 : const UChar *src, int32_t srcLength,
787 : const char *locale,
788 : UErrorCode *pErrorCode)
789 : {
790 24 : uint32 options = U_FOLD_CASE_DEFAULT;
791 : char lang[3];
792 : UErrorCode status;
793 :
794 : /*
795 : * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
796 : * folding does not accept a locale. Instead it just supports a single
797 : * option relevant to Turkic languages 'az' and 'tr'; check for those
798 : * languages to enable the option.
799 : */
800 24 : status = U_ZERO_ERROR;
801 24 : uloc_getLanguage(locale, lang, 3, &status);
802 24 : if (U_SUCCESS(status))
803 : {
804 : /*
805 : * The option name is confusing, but it causes u_strFoldCase to use
806 : * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
807 : */
808 24 : if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
809 12 : options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
810 : }
811 :
812 24 : return u_strFoldCase(dest, destCapacity, src, srcLength,
813 : options, pErrorCode);
814 : }
815 :
816 : /*
817 : * strncoll_icu
818 : *
819 : * Convert the arguments from the database encoding to UChar strings, then
820 : * call ucol_strcoll(). An argument length of -1 means that the string is
821 : * NUL-terminated.
822 : *
823 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
824 : * caller should call that instead.
825 : */
826 : static int
827 0 : strncoll_icu(const char *arg1, ssize_t len1,
828 : const char *arg2, ssize_t len2, pg_locale_t locale)
829 : {
830 : char sbuf[TEXTBUFLEN];
831 0 : char *buf = sbuf;
832 : int32_t ulen1;
833 : int32_t ulen2;
834 : size_t bufsize1;
835 : size_t bufsize2;
836 : UChar *uchar1,
837 : *uchar2;
838 : int result;
839 :
840 : /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
841 : #ifdef HAVE_UCOL_STRCOLLUTF8
842 : Assert(GetDatabaseEncoding() != PG_UTF8);
843 : #endif
844 :
845 0 : init_icu_converter();
846 :
847 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
848 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
849 :
850 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
851 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
852 :
853 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
854 0 : buf = palloc(bufsize1 + bufsize2);
855 :
856 0 : uchar1 = (UChar *) buf;
857 0 : uchar2 = (UChar *) (buf + bufsize1);
858 :
859 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
860 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
861 :
862 0 : result = ucol_strcoll(locale->info.icu.ucol,
863 : uchar1, ulen1,
864 : uchar2, ulen2);
865 :
866 0 : if (buf != sbuf)
867 0 : pfree(buf);
868 :
869 0 : return result;
870 : }
871 :
872 : /* 'srclen' of -1 means the strings are NUL-terminated */
873 : static size_t
874 0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
875 : const char *src, ssize_t srclen,
876 : pg_locale_t locale)
877 : {
878 : char sbuf[TEXTBUFLEN];
879 0 : char *buf = sbuf;
880 : UCharIterator iter;
881 : uint32_t state[2];
882 : UErrorCode status;
883 0 : int32_t ulen = -1;
884 0 : UChar *uchar = NULL;
885 : size_t uchar_bsize;
886 : Size result_bsize;
887 :
888 : /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
889 : Assert(GetDatabaseEncoding() != PG_UTF8);
890 :
891 0 : init_icu_converter();
892 :
893 0 : ulen = uchar_length(icu_converter, src, srclen);
894 :
895 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
896 :
897 0 : if (uchar_bsize > TEXTBUFLEN)
898 0 : buf = palloc(uchar_bsize);
899 :
900 0 : uchar = (UChar *) buf;
901 :
902 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
903 :
904 0 : uiter_setString(&iter, uchar, ulen);
905 0 : state[0] = state[1] = 0; /* won't need that again */
906 0 : status = U_ZERO_ERROR;
907 0 : result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
908 : &iter,
909 : state,
910 : (uint8_t *) dest,
911 : destsize,
912 : &status);
913 0 : if (U_FAILURE(status))
914 0 : ereport(ERROR,
915 : (errmsg("sort key generation failed: %s",
916 : u_errorName(status))));
917 :
918 0 : return result_bsize;
919 : }
920 :
921 : static void
922 11280 : init_icu_converter(void)
923 : {
924 : const char *icu_encoding_name;
925 : UErrorCode status;
926 : UConverter *conv;
927 :
928 11280 : if (icu_converter)
929 11274 : return; /* already done */
930 :
931 6 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
932 6 : if (!icu_encoding_name)
933 0 : ereport(ERROR,
934 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
935 : errmsg("encoding \"%s\" not supported by ICU",
936 : pg_encoding_to_char(GetDatabaseEncoding()))));
937 :
938 6 : status = U_ZERO_ERROR;
939 6 : conv = ucnv_open(icu_encoding_name, &status);
940 6 : if (U_FAILURE(status))
941 0 : ereport(ERROR,
942 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
943 : icu_encoding_name, u_errorName(status))));
944 :
945 6 : icu_converter = conv;
946 : }
947 :
948 : /*
949 : * Find length, in UChars, of given string if converted to UChar string.
950 : *
951 : * A length of -1 indicates that the input string is NUL-terminated.
952 : */
953 : static size_t
954 10656 : uchar_length(UConverter *converter, const char *str, int32_t len)
955 : {
956 10656 : UErrorCode status = U_ZERO_ERROR;
957 : int32_t ulen;
958 :
959 10656 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
960 10656 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
961 0 : ereport(ERROR,
962 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
963 10656 : return ulen;
964 : }
965 :
966 : /*
967 : * Convert the given source string into a UChar string, stored in dest, and
968 : * return the length (in UChars).
969 : *
970 : * A srclen of -1 indicates that the input string is NUL-terminated.
971 : */
972 : static int32_t
973 10656 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
974 : const char *src, int32_t srclen)
975 : {
976 10656 : UErrorCode status = U_ZERO_ERROR;
977 : int32_t ulen;
978 :
979 10656 : status = U_ZERO_ERROR;
980 10656 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
981 10656 : if (U_FAILURE(status))
982 0 : ereport(ERROR,
983 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
984 10656 : return ulen;
985 : }
986 :
987 : /*
988 : * Parse collation attributes from the given locale string and apply them to
989 : * the open collator.
990 : *
991 : * First, the locale string is canonicalized to an ICU format locale ID such
992 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
993 : * the key-value arguments.
994 : *
995 : * Starting with ICU version 54, the attributes are processed automatically by
996 : * ucol_open(), so this is only necessary for emulating this behavior on older
997 : * versions.
998 : */
999 : pg_attribute_unused()
1000 : static void
1001 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
1002 : UErrorCode *status)
1003 : {
1004 : int32_t len;
1005 : char *icu_locale_id;
1006 : char *lower_str;
1007 : char *str;
1008 : char *token;
1009 :
1010 : /*
1011 : * The input locale may be a BCP 47 language tag, e.g.
1012 : * "und-u-kc-ks-level1", which expresses the same attributes in a
1013 : * different form. It will be converted to the equivalent ICU format
1014 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1015 : * uloc_canonicalize().
1016 : */
1017 0 : *status = U_ZERO_ERROR;
1018 0 : len = uloc_canonicalize(loc, NULL, 0, status);
1019 0 : icu_locale_id = palloc(len + 1);
1020 0 : *status = U_ZERO_ERROR;
1021 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1022 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1023 0 : return;
1024 :
1025 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
1026 :
1027 0 : pfree(icu_locale_id);
1028 :
1029 0 : str = strchr(lower_str, '@');
1030 0 : if (!str)
1031 0 : return;
1032 0 : str++;
1033 :
1034 0 : while ((token = strsep(&str, ";")))
1035 : {
1036 0 : char *e = strchr(token, '=');
1037 :
1038 0 : if (e)
1039 : {
1040 : char *name;
1041 : char *value;
1042 : UColAttribute uattr;
1043 : UColAttributeValue uvalue;
1044 :
1045 0 : *status = U_ZERO_ERROR;
1046 :
1047 0 : *e = '\0';
1048 0 : name = token;
1049 0 : value = e + 1;
1050 :
1051 : /*
1052 : * See attribute name and value lists in ICU i18n/coll.cpp
1053 : */
1054 0 : if (strcmp(name, "colstrength") == 0)
1055 0 : uattr = UCOL_STRENGTH;
1056 0 : else if (strcmp(name, "colbackwards") == 0)
1057 0 : uattr = UCOL_FRENCH_COLLATION;
1058 0 : else if (strcmp(name, "colcaselevel") == 0)
1059 0 : uattr = UCOL_CASE_LEVEL;
1060 0 : else if (strcmp(name, "colcasefirst") == 0)
1061 0 : uattr = UCOL_CASE_FIRST;
1062 0 : else if (strcmp(name, "colalternate") == 0)
1063 0 : uattr = UCOL_ALTERNATE_HANDLING;
1064 0 : else if (strcmp(name, "colnormalization") == 0)
1065 0 : uattr = UCOL_NORMALIZATION_MODE;
1066 0 : else if (strcmp(name, "colnumeric") == 0)
1067 0 : uattr = UCOL_NUMERIC_COLLATION;
1068 : else
1069 : /* ignore if unknown */
1070 0 : continue;
1071 :
1072 0 : if (strcmp(value, "primary") == 0)
1073 0 : uvalue = UCOL_PRIMARY;
1074 0 : else if (strcmp(value, "secondary") == 0)
1075 0 : uvalue = UCOL_SECONDARY;
1076 0 : else if (strcmp(value, "tertiary") == 0)
1077 0 : uvalue = UCOL_TERTIARY;
1078 0 : else if (strcmp(value, "quaternary") == 0)
1079 0 : uvalue = UCOL_QUATERNARY;
1080 0 : else if (strcmp(value, "identical") == 0)
1081 0 : uvalue = UCOL_IDENTICAL;
1082 0 : else if (strcmp(value, "no") == 0)
1083 0 : uvalue = UCOL_OFF;
1084 0 : else if (strcmp(value, "yes") == 0)
1085 0 : uvalue = UCOL_ON;
1086 0 : else if (strcmp(value, "shifted") == 0)
1087 0 : uvalue = UCOL_SHIFTED;
1088 0 : else if (strcmp(value, "non-ignorable") == 0)
1089 0 : uvalue = UCOL_NON_IGNORABLE;
1090 0 : else if (strcmp(value, "lower") == 0)
1091 0 : uvalue = UCOL_LOWER_FIRST;
1092 0 : else if (strcmp(value, "upper") == 0)
1093 0 : uvalue = UCOL_UPPER_FIRST;
1094 : else
1095 : {
1096 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
1097 0 : break;
1098 : }
1099 :
1100 0 : ucol_setAttribute(collator, uattr, uvalue, status);
1101 : }
1102 : }
1103 :
1104 0 : pfree(lower_str);
1105 : }
1106 :
1107 : #endif /* USE_ICU */
|