Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for ICU
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_icu.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #ifdef USE_ICU
15 : #include <unicode/ucnv.h>
16 : #include <unicode/ustring.h>
17 :
18 : /*
19 : * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 : * (see
21 : * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 : */
23 : #if U_ICU_VERSION_MAJOR_NUM >= 53
24 : #define HAVE_UCOL_STRCOLLUTF8 1
25 : #else
26 : #undef HAVE_UCOL_STRCOLLUTF8
27 : #endif
28 :
29 : #endif
30 :
31 : #include "access/htup_details.h"
32 : #include "catalog/pg_database.h"
33 : #include "catalog/pg_collation.h"
34 : #include "mb/pg_wchar.h"
35 : #include "miscadmin.h"
36 : #include "utils/builtins.h"
37 : #include "utils/formatting.h"
38 : #include "utils/memutils.h"
39 : #include "utils/pg_locale.h"
40 : #include "utils/syscache.h"
41 :
42 : /*
43 : * Size of stack buffer to use for string transformations, used to avoid heap
44 : * allocations in typical cases. This should be large enough that most strings
45 : * will fit, but small enough that we feel comfortable putting it on the
46 : * stack.
47 : */
48 : #define TEXTBUFLEN 1024
49 :
50 : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
51 :
52 : #ifdef USE_ICU
53 :
54 : extern UCollator *pg_ucol_open(const char *loc_str);
55 :
56 : static size_t strlower_icu(char *dest, size_t destsize, const char *src,
57 : ssize_t srclen, pg_locale_t locale);
58 : static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
59 : ssize_t srclen, pg_locale_t locale);
60 : static size_t strupper_icu(char *dest, size_t destsize, const char *src,
61 : ssize_t srclen, pg_locale_t locale);
62 : static size_t strfold_icu(char *dest, size_t destsize, const char *src,
63 : ssize_t srclen, pg_locale_t locale);
64 : static int strncoll_icu(const char *arg1, ssize_t len1,
65 : const char *arg2, ssize_t len2,
66 : pg_locale_t locale);
67 : static size_t strnxfrm_icu(char *dest, size_t destsize,
68 : const char *src, ssize_t srclen,
69 : pg_locale_t locale);
70 : extern char *get_collation_actual_version_icu(const char *collcollate);
71 :
72 : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
73 : const UChar *src, int32_t srcLength,
74 : const char *locale,
75 : UErrorCode *pErrorCode);
76 :
77 : /*
78 : * Converter object for converting between ICU's UChar strings and C strings
79 : * in database encoding. Since the database encoding doesn't change, we only
80 : * need one of these per session.
81 : */
82 : static UConverter *icu_converter = NULL;
83 :
84 : static UCollator *make_icu_collator(const char *iculocstr,
85 : const char *icurules);
86 : static int strncoll_icu(const char *arg1, ssize_t len1,
87 : const char *arg2, ssize_t len2,
88 : pg_locale_t locale);
89 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
90 : const char *src, ssize_t srclen,
91 : pg_locale_t locale);
92 : #ifdef HAVE_UCOL_STRCOLLUTF8
93 : static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
94 : const char *arg2, ssize_t len2,
95 : pg_locale_t locale);
96 : #endif
97 : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
98 : const char *src, ssize_t srclen,
99 : pg_locale_t locale);
100 : static void init_icu_converter(void);
101 : static size_t uchar_length(UConverter *converter,
102 : const char *str, int32_t len);
103 : static int32_t uchar_convert(UConverter *converter,
104 : UChar *dest, int32_t destlen,
105 : const char *src, int32_t srclen);
106 : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
107 : size_t nbytes);
108 : static size_t icu_from_uchar(char *dest, size_t destsize,
109 : const UChar *buff_uchar, int32_t len_uchar);
110 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
111 : UErrorCode *status);
112 : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
113 : UChar **buff_dest, UChar *buff_source,
114 : int32_t len_source);
115 : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
116 : const UChar *src, int32_t srcLength,
117 : const char *locale,
118 : UErrorCode *pErrorCode);
119 : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
120 : const UChar *src, int32_t srcLength,
121 : const char *locale,
122 : UErrorCode *pErrorCode);
123 :
124 : static bool
125 126 : char_is_cased_icu(char ch, pg_locale_t locale)
126 : {
127 126 : return IS_HIGHBIT_SET(ch) ||
128 252 : (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
129 : }
130 :
131 : static pg_wchar
132 108 : toupper_icu(pg_wchar wc, pg_locale_t locale)
133 : {
134 108 : return u_toupper(wc);
135 : }
136 :
137 : static pg_wchar
138 108 : tolower_icu(pg_wchar wc, pg_locale_t locale)
139 : {
140 108 : return u_tolower(wc);
141 : }
142 :
143 : static const struct collate_methods collate_methods_icu = {
144 : .strncoll = strncoll_icu,
145 : .strnxfrm = strnxfrm_icu,
146 : .strnxfrm_prefix = strnxfrm_prefix_icu,
147 : .strxfrm_is_safe = true,
148 : };
149 :
150 : static const struct collate_methods collate_methods_icu_utf8 = {
151 : #ifdef HAVE_UCOL_STRCOLLUTF8
152 : .strncoll = strncoll_icu_utf8,
153 : #else
154 : .strncoll = strncoll_icu,
155 : #endif
156 : .strnxfrm = strnxfrm_icu,
157 : .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
158 : .strxfrm_is_safe = true,
159 : };
160 :
161 : static bool
162 12288 : wc_isdigit_icu(pg_wchar wc, pg_locale_t locale)
163 : {
164 12288 : return u_isdigit(wc);
165 : }
166 :
167 : static bool
168 12288 : wc_isalpha_icu(pg_wchar wc, pg_locale_t locale)
169 : {
170 12288 : return u_isalpha(wc);
171 : }
172 :
173 : static bool
174 12288 : wc_isalnum_icu(pg_wchar wc, pg_locale_t locale)
175 : {
176 12288 : return u_isalnum(wc);
177 : }
178 :
179 : static bool
180 12288 : wc_isupper_icu(pg_wchar wc, pg_locale_t locale)
181 : {
182 12288 : return u_isupper(wc);
183 : }
184 :
185 : static bool
186 12288 : wc_islower_icu(pg_wchar wc, pg_locale_t locale)
187 : {
188 12288 : return u_islower(wc);
189 : }
190 :
191 : static bool
192 12288 : wc_isgraph_icu(pg_wchar wc, pg_locale_t locale)
193 : {
194 12288 : return u_isgraph(wc);
195 : }
196 :
197 : static bool
198 12288 : wc_isprint_icu(pg_wchar wc, pg_locale_t locale)
199 : {
200 12288 : return u_isprint(wc);
201 : }
202 :
203 : static bool
204 12288 : wc_ispunct_icu(pg_wchar wc, pg_locale_t locale)
205 : {
206 12288 : return u_ispunct(wc);
207 : }
208 :
209 : static bool
210 12288 : wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
211 : {
212 12288 : return u_isspace(wc);
213 : }
214 :
215 : static bool
216 0 : wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
217 : {
218 0 : return u_isxdigit(wc);
219 : }
220 :
221 : static const struct ctype_methods ctype_methods_icu = {
222 : .strlower = strlower_icu,
223 : .strtitle = strtitle_icu,
224 : .strupper = strupper_icu,
225 : .strfold = strfold_icu,
226 : .wc_isdigit = wc_isdigit_icu,
227 : .wc_isalpha = wc_isalpha_icu,
228 : .wc_isalnum = wc_isalnum_icu,
229 : .wc_isupper = wc_isupper_icu,
230 : .wc_islower = wc_islower_icu,
231 : .wc_isgraph = wc_isgraph_icu,
232 : .wc_isprint = wc_isprint_icu,
233 : .wc_ispunct = wc_ispunct_icu,
234 : .wc_isspace = wc_isspace_icu,
235 : .wc_isxdigit = wc_isxdigit_icu,
236 : .char_is_cased = char_is_cased_icu,
237 : .wc_toupper = toupper_icu,
238 : .wc_tolower = tolower_icu,
239 : };
240 : #endif
241 :
242 : pg_locale_t
243 210 : create_pg_locale_icu(Oid collid, MemoryContext context)
244 : {
245 : #ifdef USE_ICU
246 : bool deterministic;
247 : const char *iculocstr;
248 210 : const char *icurules = NULL;
249 : UCollator *collator;
250 : pg_locale_t result;
251 :
252 210 : if (collid == DEFAULT_COLLATION_OID)
253 : {
254 : HeapTuple tp;
255 : Datum datum;
256 : bool isnull;
257 :
258 26 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
259 26 : if (!HeapTupleIsValid(tp))
260 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
261 :
262 : /* default database collation is always deterministic */
263 26 : deterministic = true;
264 26 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
265 : Anum_pg_database_datlocale);
266 26 : iculocstr = TextDatumGetCString(datum);
267 26 : datum = SysCacheGetAttr(DATABASEOID, tp,
268 : Anum_pg_database_daticurules, &isnull);
269 26 : if (!isnull)
270 0 : icurules = TextDatumGetCString(datum);
271 :
272 26 : ReleaseSysCache(tp);
273 : }
274 : else
275 : {
276 : Form_pg_collation collform;
277 : HeapTuple tp;
278 : Datum datum;
279 : bool isnull;
280 :
281 184 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
282 184 : if (!HeapTupleIsValid(tp))
283 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
284 184 : collform = (Form_pg_collation) GETSTRUCT(tp);
285 184 : deterministic = collform->collisdeterministic;
286 184 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
287 : Anum_pg_collation_colllocale);
288 184 : iculocstr = TextDatumGetCString(datum);
289 184 : datum = SysCacheGetAttr(COLLOID, tp,
290 : Anum_pg_collation_collicurules, &isnull);
291 184 : if (!isnull)
292 12 : icurules = TextDatumGetCString(datum);
293 :
294 184 : ReleaseSysCache(tp);
295 : }
296 :
297 210 : collator = make_icu_collator(iculocstr, icurules);
298 :
299 200 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
300 200 : result->icu.locale = MemoryContextStrdup(context, iculocstr);
301 200 : result->icu.ucol = collator;
302 200 : result->deterministic = deterministic;
303 200 : result->collate_is_c = false;
304 200 : result->ctype_is_c = false;
305 200 : if (GetDatabaseEncoding() == PG_UTF8)
306 200 : result->collate = &collate_methods_icu_utf8;
307 : else
308 0 : result->collate = &collate_methods_icu;
309 200 : result->ctype = &ctype_methods_icu;
310 :
311 200 : return result;
312 : #else
313 : /* could get here if a collation was created by a build with ICU */
314 : ereport(ERROR,
315 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
316 : errmsg("ICU is not supported in this build")));
317 :
318 : return NULL;
319 : #endif
320 : }
321 :
322 : #ifdef USE_ICU
323 :
324 : /*
325 : * Wrapper around ucol_open() to handle API differences for older ICU
326 : * versions.
327 : *
328 : * Ensure that no path leaks a UCollator.
329 : */
330 : UCollator *
331 78198 : pg_ucol_open(const char *loc_str)
332 : {
333 : UCollator *collator;
334 : UErrorCode status;
335 78198 : const char *orig_str = loc_str;
336 78198 : char *fixed_str = NULL;
337 :
338 : /*
339 : * Must never open default collator, because it depends on the environment
340 : * and may change at any time. Should not happen, but check here to catch
341 : * bugs that might be hard to catch otherwise.
342 : *
343 : * NB: the default collator is not the same as the collator for the root
344 : * locale. The root locale may be specified as the empty string, "und", or
345 : * "root". The default collator is opened by passing NULL to ucol_open().
346 : */
347 78198 : if (loc_str == NULL)
348 0 : elog(ERROR, "opening default collator is not supported");
349 :
350 : /*
351 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
352 : * the root locale. If the first component of the locale is "und", replace
353 : * with "root" before opening.
354 : */
355 : if (U_ICU_VERSION_MAJOR_NUM < 55)
356 : {
357 : char lang[ULOC_LANG_CAPACITY];
358 :
359 : status = U_ZERO_ERROR;
360 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
361 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
362 : {
363 : ereport(ERROR,
364 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
365 : errmsg("could not get language from locale \"%s\": %s",
366 : loc_str, u_errorName(status))));
367 : }
368 :
369 : if (strcmp(lang, "und") == 0)
370 : {
371 : const char *remainder = loc_str + strlen("und");
372 :
373 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
374 : strcpy(fixed_str, "root");
375 : strcat(fixed_str, remainder);
376 :
377 : loc_str = fixed_str;
378 : }
379 : }
380 :
381 78198 : status = U_ZERO_ERROR;
382 78198 : collator = ucol_open(loc_str, &status);
383 78198 : if (U_FAILURE(status))
384 12 : ereport(ERROR,
385 : /* use original string for error report */
386 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
387 : errmsg("could not open collator for locale \"%s\": %s",
388 : orig_str, u_errorName(status))));
389 :
390 : if (U_ICU_VERSION_MAJOR_NUM < 54)
391 : {
392 : status = U_ZERO_ERROR;
393 : icu_set_collation_attributes(collator, loc_str, &status);
394 :
395 : /*
396 : * Pretend the error came from ucol_open(), for consistent error
397 : * message across ICU versions.
398 : */
399 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
400 : {
401 : ucol_close(collator);
402 : ereport(ERROR,
403 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
404 : errmsg("could not open collator for locale \"%s\": %s",
405 : orig_str, u_errorName(status))));
406 : }
407 : }
408 :
409 78186 : if (fixed_str != NULL)
410 0 : pfree(fixed_str);
411 :
412 78186 : return collator;
413 : }
414 :
415 : /*
416 : * Create a UCollator with the given locale string and rules.
417 : *
418 : * Ensure that no path leaks a UCollator.
419 : */
420 : static UCollator *
421 210 : make_icu_collator(const char *iculocstr, const char *icurules)
422 : {
423 210 : if (!icurules)
424 : {
425 : /* simple case without rules */
426 198 : return pg_ucol_open(iculocstr);
427 : }
428 : else
429 : {
430 : UCollator *collator_std_rules;
431 : UCollator *collator_all_rules;
432 : const UChar *std_rules;
433 : UChar *my_rules;
434 : UChar *all_rules;
435 : int32_t length;
436 : int32_t total;
437 : UErrorCode status;
438 :
439 : /*
440 : * If rules are specified, we extract the rules of the standard
441 : * collation, add our own rules, and make a new collator with the
442 : * combined rules.
443 : */
444 12 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
445 :
446 12 : collator_std_rules = pg_ucol_open(iculocstr);
447 :
448 12 : std_rules = ucol_getRules(collator_std_rules, &length);
449 :
450 12 : total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
451 :
452 : /* avoid leaking collator on OOM */
453 12 : all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
454 12 : if (!all_rules)
455 : {
456 0 : ucol_close(collator_std_rules);
457 0 : ereport(ERROR,
458 : (errcode(ERRCODE_OUT_OF_MEMORY),
459 : errmsg("out of memory")));
460 : }
461 :
462 12 : u_strcpy(all_rules, std_rules);
463 12 : u_strcat(all_rules, my_rules);
464 :
465 12 : ucol_close(collator_std_rules);
466 :
467 12 : status = U_ZERO_ERROR;
468 12 : collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
469 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
470 : NULL, &status);
471 12 : if (U_FAILURE(status))
472 : {
473 6 : ereport(ERROR,
474 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
475 : errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
476 : iculocstr, icurules, u_errorName(status))));
477 : }
478 :
479 6 : return collator_all_rules;
480 : }
481 : }
482 :
483 : static size_t
484 528 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
485 : pg_locale_t locale)
486 : {
487 : int32_t len_uchar;
488 : int32_t len_conv;
489 : UChar *buff_uchar;
490 : UChar *buff_conv;
491 : size_t result_len;
492 :
493 528 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
494 528 : len_conv = icu_convert_case(u_strToLower, locale,
495 : &buff_conv, buff_uchar, len_uchar);
496 528 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
497 528 : pfree(buff_uchar);
498 528 : pfree(buff_conv);
499 :
500 528 : return result_len;
501 : }
502 :
503 : static size_t
504 30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
505 : pg_locale_t locale)
506 : {
507 : int32_t len_uchar;
508 : int32_t len_conv;
509 : UChar *buff_uchar;
510 : UChar *buff_conv;
511 : size_t result_len;
512 :
513 30 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
514 30 : len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
515 : &buff_conv, buff_uchar, len_uchar);
516 30 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
517 30 : pfree(buff_uchar);
518 30 : pfree(buff_conv);
519 :
520 30 : return result_len;
521 : }
522 :
523 : static size_t
524 54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
525 : pg_locale_t locale)
526 : {
527 : int32_t len_uchar;
528 : int32_t len_conv;
529 : UChar *buff_uchar;
530 : UChar *buff_conv;
531 : size_t result_len;
532 :
533 54 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
534 54 : len_conv = icu_convert_case(u_strToUpper, locale,
535 : &buff_conv, buff_uchar, len_uchar);
536 54 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
537 54 : pfree(buff_uchar);
538 54 : pfree(buff_conv);
539 :
540 54 : return result_len;
541 : }
542 :
543 : static size_t
544 12 : strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
545 : pg_locale_t locale)
546 : {
547 : int32_t len_uchar;
548 : int32_t len_conv;
549 : UChar *buff_uchar;
550 : UChar *buff_conv;
551 : size_t result_len;
552 :
553 12 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
554 12 : len_conv = icu_convert_case(u_strFoldCase_default, locale,
555 : &buff_conv, buff_uchar, len_uchar);
556 12 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
557 12 : pfree(buff_uchar);
558 12 : pfree(buff_conv);
559 :
560 12 : return result_len;
561 : }
562 :
/*
 * strncoll_icu_utf8
 *
 * Compare two strings with ucol_strcollUTF8(), passing the bytes straight
 * through without conversion to UChar.  Only valid when the database
 * encoding is UTF-8.  An argument length of -1 means the string is
 * NUL-terminated.
 *
 * Raises an error if ICU reports a failure.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
				  pg_locale_t locale)
{
	UErrorCode	err = U_ZERO_ERROR;
	int			cmp;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	cmp = ucol_strcollUTF8(locale->icu.ucol, arg1, len1, arg2, len2, &err);
	if (U_FAILURE(err))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(err))));

	return cmp;
}
#endif
592 :
593 : /* 'srclen' of -1 means the strings are NUL-terminated */
594 : size_t
595 5748 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
596 : pg_locale_t locale)
597 : {
598 : char sbuf[TEXTBUFLEN];
599 5748 : char *buf = sbuf;
600 : UChar *uchar;
601 : int32_t ulen;
602 : size_t uchar_bsize;
603 : Size result_bsize;
604 :
605 5748 : init_icu_converter();
606 :
607 5748 : ulen = uchar_length(icu_converter, src, srclen);
608 :
609 5748 : uchar_bsize = (ulen + 1) * sizeof(UChar);
610 :
611 5748 : if (uchar_bsize > TEXTBUFLEN)
612 0 : buf = palloc(uchar_bsize);
613 :
614 5748 : uchar = (UChar *) buf;
615 :
616 5748 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
617 :
618 5748 : result_bsize = ucol_getSortKey(locale->icu.ucol,
619 : uchar, ulen,
620 : (uint8_t *) dest, destsize);
621 :
622 : /*
623 : * ucol_getSortKey() counts the nul-terminator in the result length, but
624 : * this function should not.
625 : */
626 : Assert(result_bsize > 0);
627 5748 : result_bsize--;
628 :
629 5748 : if (buf != sbuf)
630 0 : pfree(buf);
631 :
632 : /* if dest is defined, it should be nul-terminated */
633 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
634 :
635 5748 : return result_bsize;
636 : }
637 :
638 : /* 'srclen' of -1 means the strings are NUL-terminated */
639 : size_t
640 1668 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
641 : const char *src, ssize_t srclen,
642 : pg_locale_t locale)
643 : {
644 : size_t result;
645 : UCharIterator iter;
646 : uint32_t state[2];
647 : UErrorCode status;
648 :
649 : Assert(GetDatabaseEncoding() == PG_UTF8);
650 :
651 1668 : uiter_setUTF8(&iter, src, srclen);
652 1668 : state[0] = state[1] = 0; /* won't need that again */
653 1668 : status = U_ZERO_ERROR;
654 1668 : result = ucol_nextSortKeyPart(locale->icu.ucol,
655 : &iter,
656 : state,
657 : (uint8_t *) dest,
658 : destsize,
659 : &status);
660 1668 : if (U_FAILURE(status))
661 0 : ereport(ERROR,
662 : (errmsg("sort key generation failed: %s",
663 : u_errorName(status))));
664 :
665 1668 : return result;
666 : }
667 :
668 : char *
669 77840 : get_collation_actual_version_icu(const char *collcollate)
670 : {
671 : UCollator *collator;
672 : UVersionInfo versioninfo;
673 : char buf[U_MAX_VERSION_STRING_LENGTH];
674 :
675 77840 : collator = pg_ucol_open(collcollate);
676 :
677 77840 : ucol_getVersion(collator, versioninfo);
678 77840 : ucol_close(collator);
679 :
680 77840 : u_versionToString(versioninfo, buf);
681 77840 : return pstrdup(buf);
682 : }
683 :
684 : /*
685 : * Convert a string in the database encoding into a string of UChars.
686 : *
687 : * The source string at buff is of length nbytes
688 : * (it needn't be nul-terminated)
689 : *
690 : * *buff_uchar receives a pointer to the palloc'd result string, and
691 : * the function's result is the number of UChars generated.
692 : *
693 : * The result string is nul-terminated, though most callers rely on the
694 : * result length instead.
695 : */
696 : static int32_t
697 636 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
698 : {
699 : int32_t len_uchar;
700 :
701 636 : init_icu_converter();
702 :
703 636 : len_uchar = uchar_length(icu_converter, buff, nbytes);
704 :
705 636 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
706 636 : len_uchar = uchar_convert(icu_converter,
707 : *buff_uchar, len_uchar + 1, buff, nbytes);
708 :
709 636 : return len_uchar;
710 : }
711 :
712 : /*
713 : * Convert a string of UChars into the database encoding.
714 : *
715 : * The source string at buff_uchar is of length len_uchar
716 : * (it needn't be nul-terminated)
717 : *
718 : * *result receives a pointer to the palloc'd result string, and the
719 : * function's result is the number of bytes generated (not counting nul).
720 : *
721 : * The result string is nul-terminated.
722 : */
723 : static size_t
724 624 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
725 : {
726 : UErrorCode status;
727 : int32_t len_result;
728 :
729 624 : init_icu_converter();
730 :
731 624 : status = U_ZERO_ERROR;
732 624 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
733 : buff_uchar, len_uchar, &status);
734 624 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
735 0 : ereport(ERROR,
736 : (errmsg("%s failed: %s", "ucnv_fromUChars",
737 : u_errorName(status))));
738 :
739 624 : if (len_result + 1 > destsize)
740 60 : return len_result;
741 :
742 564 : status = U_ZERO_ERROR;
743 564 : len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
744 : buff_uchar, len_uchar, &status);
745 564 : if (U_FAILURE(status) ||
746 564 : status == U_STRING_NOT_TERMINATED_WARNING)
747 0 : ereport(ERROR,
748 : (errmsg("%s failed: %s", "ucnv_fromUChars",
749 : u_errorName(status))));
750 :
751 564 : return len_result;
752 : }
753 :
754 : static int32_t
755 624 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
756 : UChar **buff_dest, UChar *buff_source, int32_t len_source)
757 : {
758 : UErrorCode status;
759 : int32_t len_dest;
760 :
761 624 : len_dest = len_source; /* try first with same length */
762 624 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
763 624 : status = U_ZERO_ERROR;
764 624 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
765 : mylocale->icu.locale, &status);
766 624 : if (status == U_BUFFER_OVERFLOW_ERROR)
767 : {
768 : /* try again with adjusted length */
769 18 : pfree(*buff_dest);
770 18 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
771 18 : status = U_ZERO_ERROR;
772 18 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
773 : mylocale->icu.locale, &status);
774 : }
775 624 : if (U_FAILURE(status))
776 0 : ereport(ERROR,
777 : (errmsg("case conversion failed: %s", u_errorName(status))));
778 624 : return len_dest;
779 : }
780 :
781 : static int32_t
782 30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
783 : const UChar *src, int32_t srcLength,
784 : const char *locale,
785 : UErrorCode *pErrorCode)
786 : {
787 30 : return u_strToTitle(dest, destCapacity, src, srcLength,
788 : NULL, locale, pErrorCode);
789 : }
790 :
791 : static int32_t
792 24 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
793 : const UChar *src, int32_t srcLength,
794 : const char *locale,
795 : UErrorCode *pErrorCode)
796 : {
797 24 : uint32 options = U_FOLD_CASE_DEFAULT;
798 : char lang[3];
799 : UErrorCode status;
800 :
801 : /*
802 : * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
803 : * folding does not accept a locale. Instead it just supports a single
804 : * option relevant to Turkic languages 'az' and 'tr'; check for those
805 : * languages to enable the option.
806 : */
807 24 : status = U_ZERO_ERROR;
808 24 : uloc_getLanguage(locale, lang, 3, &status);
809 24 : if (U_SUCCESS(status))
810 : {
811 : /*
812 : * The option name is confusing, but it causes u_strFoldCase to use
813 : * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
814 : */
815 24 : if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
816 12 : options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
817 : }
818 :
819 24 : return u_strFoldCase(dest, destCapacity, src, srcLength,
820 : options, pErrorCode);
821 : }
822 :
823 : /*
824 : * strncoll_icu
825 : *
826 : * Convert the arguments from the database encoding to UChar strings, then
827 : * call ucol_strcoll(). An argument length of -1 means that the string is
828 : * NUL-terminated.
829 : *
830 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
831 : * caller should call that instead.
832 : */
833 : static int
834 0 : strncoll_icu(const char *arg1, ssize_t len1,
835 : const char *arg2, ssize_t len2, pg_locale_t locale)
836 : {
837 : char sbuf[TEXTBUFLEN];
838 0 : char *buf = sbuf;
839 : int32_t ulen1;
840 : int32_t ulen2;
841 : size_t bufsize1;
842 : size_t bufsize2;
843 : UChar *uchar1,
844 : *uchar2;
845 : int result;
846 :
847 : /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
848 : #ifdef HAVE_UCOL_STRCOLLUTF8
849 : Assert(GetDatabaseEncoding() != PG_UTF8);
850 : #endif
851 :
852 0 : init_icu_converter();
853 :
854 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
855 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
856 :
857 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
858 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
859 :
860 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
861 0 : buf = palloc(bufsize1 + bufsize2);
862 :
863 0 : uchar1 = (UChar *) buf;
864 0 : uchar2 = (UChar *) (buf + bufsize1);
865 :
866 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
867 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
868 :
869 0 : result = ucol_strcoll(locale->icu.ucol,
870 : uchar1, ulen1,
871 : uchar2, ulen2);
872 :
873 0 : if (buf != sbuf)
874 0 : pfree(buf);
875 :
876 0 : return result;
877 : }
878 :
879 : /* 'srclen' of -1 means the strings are NUL-terminated */
880 : static size_t
881 0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
882 : const char *src, ssize_t srclen,
883 : pg_locale_t locale)
884 : {
885 : char sbuf[TEXTBUFLEN];
886 0 : char *buf = sbuf;
887 : UCharIterator iter;
888 : uint32_t state[2];
889 : UErrorCode status;
890 0 : int32_t ulen = -1;
891 0 : UChar *uchar = NULL;
892 : size_t uchar_bsize;
893 : Size result_bsize;
894 :
895 : /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
896 : Assert(GetDatabaseEncoding() != PG_UTF8);
897 :
898 0 : init_icu_converter();
899 :
900 0 : ulen = uchar_length(icu_converter, src, srclen);
901 :
902 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
903 :
904 0 : if (uchar_bsize > TEXTBUFLEN)
905 0 : buf = palloc(uchar_bsize);
906 :
907 0 : uchar = (UChar *) buf;
908 :
909 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
910 :
911 0 : uiter_setString(&iter, uchar, ulen);
912 0 : state[0] = state[1] = 0; /* won't need that again */
913 0 : status = U_ZERO_ERROR;
914 0 : result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
915 : &iter,
916 : state,
917 : (uint8_t *) dest,
918 : destsize,
919 : &status);
920 0 : if (U_FAILURE(status))
921 0 : ereport(ERROR,
922 : (errmsg("sort key generation failed: %s",
923 : u_errorName(status))));
924 :
925 0 : return result_bsize;
926 : }
927 :
928 : static void
929 7008 : init_icu_converter(void)
930 : {
931 : const char *icu_encoding_name;
932 : UErrorCode status;
933 : UConverter *conv;
934 :
935 7008 : if (icu_converter)
936 7002 : return; /* already done */
937 :
938 6 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
939 6 : if (!icu_encoding_name)
940 0 : ereport(ERROR,
941 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
942 : errmsg("encoding \"%s\" not supported by ICU",
943 : pg_encoding_to_char(GetDatabaseEncoding()))));
944 :
945 6 : status = U_ZERO_ERROR;
946 6 : conv = ucnv_open(icu_encoding_name, &status);
947 6 : if (U_FAILURE(status))
948 0 : ereport(ERROR,
949 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
950 : icu_encoding_name, u_errorName(status))));
951 :
952 6 : icu_converter = conv;
953 : }
954 :
955 : /*
956 : * Find length, in UChars, of given string if converted to UChar string.
957 : *
958 : * A length of -1 indicates that the input string is NUL-terminated.
959 : */
960 : static size_t
961 6384 : uchar_length(UConverter *converter, const char *str, int32_t len)
962 : {
963 6384 : UErrorCode status = U_ZERO_ERROR;
964 : int32_t ulen;
965 :
966 6384 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
967 6384 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
968 0 : ereport(ERROR,
969 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
970 6384 : return ulen;
971 : }
972 :
973 : /*
974 : * Convert the given source string into a UChar string, stored in dest, and
975 : * return the length (in UChars).
976 : *
977 : * A srclen of -1 indicates that the input string is NUL-terminated.
978 : */
979 : static int32_t
980 6384 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
981 : const char *src, int32_t srclen)
982 : {
983 6384 : UErrorCode status = U_ZERO_ERROR;
984 : int32_t ulen;
985 :
986 6384 : status = U_ZERO_ERROR;
987 6384 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
988 6384 : if (U_FAILURE(status))
989 0 : ereport(ERROR,
990 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
991 6384 : return ulen;
992 : }
993 :
994 : /*
995 : * Parse collation attributes from the given locale string and apply them to
996 : * the open collator.
997 : *
998 : * First, the locale string is canonicalized to an ICU format locale ID such
999 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1000 : * the key-value arguments.
1001 : *
1002 : * Starting with ICU version 54, the attributes are processed automatically by
1003 : * ucol_open(), so this is only necessary for emulating this behavior on older
1004 : * versions.
1005 : */
1006 : pg_attribute_unused()
1007 : static void
1008 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
1009 : UErrorCode *status)
1010 : {
1011 : int32_t len;
1012 : char *icu_locale_id;
1013 : char *lower_str;
1014 : char *str;
1015 : char *token;
1016 :
1017 : /*
1018 : * The input locale may be a BCP 47 language tag, e.g.
1019 : * "und-u-kc-ks-level1", which expresses the same attributes in a
1020 : * different form. It will be converted to the equivalent ICU format
1021 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1022 : * uloc_canonicalize().
1023 : */
1024 0 : *status = U_ZERO_ERROR;
1025 0 : len = uloc_canonicalize(loc, NULL, 0, status);
1026 0 : icu_locale_id = palloc(len + 1);
1027 0 : *status = U_ZERO_ERROR;
1028 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1029 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1030 0 : return;
1031 :
1032 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
1033 :
1034 0 : pfree(icu_locale_id);
1035 :
1036 0 : str = strchr(lower_str, '@');
1037 0 : if (!str)
1038 0 : return;
1039 0 : str++;
1040 :
1041 0 : while ((token = strsep(&str, ";")))
1042 : {
1043 0 : char *e = strchr(token, '=');
1044 :
1045 0 : if (e)
1046 : {
1047 : char *name;
1048 : char *value;
1049 : UColAttribute uattr;
1050 : UColAttributeValue uvalue;
1051 :
1052 0 : *status = U_ZERO_ERROR;
1053 :
1054 0 : *e = '\0';
1055 0 : name = token;
1056 0 : value = e + 1;
1057 :
1058 : /*
1059 : * See attribute name and value lists in ICU i18n/coll.cpp
1060 : */
1061 0 : if (strcmp(name, "colstrength") == 0)
1062 0 : uattr = UCOL_STRENGTH;
1063 0 : else if (strcmp(name, "colbackwards") == 0)
1064 0 : uattr = UCOL_FRENCH_COLLATION;
1065 0 : else if (strcmp(name, "colcaselevel") == 0)
1066 0 : uattr = UCOL_CASE_LEVEL;
1067 0 : else if (strcmp(name, "colcasefirst") == 0)
1068 0 : uattr = UCOL_CASE_FIRST;
1069 0 : else if (strcmp(name, "colalternate") == 0)
1070 0 : uattr = UCOL_ALTERNATE_HANDLING;
1071 0 : else if (strcmp(name, "colnormalization") == 0)
1072 0 : uattr = UCOL_NORMALIZATION_MODE;
1073 0 : else if (strcmp(name, "colnumeric") == 0)
1074 0 : uattr = UCOL_NUMERIC_COLLATION;
1075 : else
1076 : /* ignore if unknown */
1077 0 : continue;
1078 :
1079 0 : if (strcmp(value, "primary") == 0)
1080 0 : uvalue = UCOL_PRIMARY;
1081 0 : else if (strcmp(value, "secondary") == 0)
1082 0 : uvalue = UCOL_SECONDARY;
1083 0 : else if (strcmp(value, "tertiary") == 0)
1084 0 : uvalue = UCOL_TERTIARY;
1085 0 : else if (strcmp(value, "quaternary") == 0)
1086 0 : uvalue = UCOL_QUATERNARY;
1087 0 : else if (strcmp(value, "identical") == 0)
1088 0 : uvalue = UCOL_IDENTICAL;
1089 0 : else if (strcmp(value, "no") == 0)
1090 0 : uvalue = UCOL_OFF;
1091 0 : else if (strcmp(value, "yes") == 0)
1092 0 : uvalue = UCOL_ON;
1093 0 : else if (strcmp(value, "shifted") == 0)
1094 0 : uvalue = UCOL_SHIFTED;
1095 0 : else if (strcmp(value, "non-ignorable") == 0)
1096 0 : uvalue = UCOL_NON_IGNORABLE;
1097 0 : else if (strcmp(value, "lower") == 0)
1098 0 : uvalue = UCOL_LOWER_FIRST;
1099 0 : else if (strcmp(value, "upper") == 0)
1100 0 : uvalue = UCOL_UPPER_FIRST;
1101 : else
1102 : {
1103 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
1104 0 : break;
1105 : }
1106 :
1107 0 : ucol_setAttribute(collator, uattr, uvalue, status);
1108 : }
1109 : }
1110 :
1111 0 : pfree(lower_str);
1112 : }
1113 :
1114 : #endif /* USE_ICU */
|