Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for ICU
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_icu.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #ifdef USE_ICU
15 : #include <unicode/ucnv.h>
16 : #include <unicode/ustring.h>
17 :
18 : /*
19 : * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 : * (see
21 : * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 : */
23 : #if U_ICU_VERSION_MAJOR_NUM >= 53
24 : #define HAVE_UCOL_STRCOLLUTF8 1
25 : #else
26 : #undef HAVE_UCOL_STRCOLLUTF8
27 : #endif
28 :
29 : #endif
30 :
31 : #include "access/htup_details.h"
32 : #include "catalog/pg_database.h"
33 : #include "catalog/pg_collation.h"
34 : #include "mb/pg_wchar.h"
35 : #include "miscadmin.h"
36 : #include "utils/builtins.h"
37 : #include "utils/formatting.h"
38 : #include "utils/memutils.h"
39 : #include "utils/pg_locale.h"
40 : #include "utils/syscache.h"
41 :
42 : /*
43 : * Size of stack buffer to use for string transformations, used to avoid heap
44 : * allocations in typical cases. This should be large enough that most strings
45 : * will fit, but small enough that we feel comfortable putting it on the
46 : * stack.
47 : */
48 : #define TEXTBUFLEN 1024
49 :
50 : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
51 :
52 : #ifdef USE_ICU
53 :
54 : extern UCollator *pg_ucol_open(const char *loc_str);
55 :
56 : static size_t strlower_icu(char *dest, size_t destsize, const char *src,
57 : ssize_t srclen, pg_locale_t locale);
58 : static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
59 : ssize_t srclen, pg_locale_t locale);
60 : static size_t strupper_icu(char *dest, size_t destsize, const char *src,
61 : ssize_t srclen, pg_locale_t locale);
62 : static size_t strfold_icu(char *dest, size_t destsize, const char *src,
63 : ssize_t srclen, pg_locale_t locale);
64 : static int strncoll_icu(const char *arg1, ssize_t len1,
65 : const char *arg2, ssize_t len2,
66 : pg_locale_t locale);
67 : static size_t strnxfrm_icu(char *dest, size_t destsize,
68 : const char *src, ssize_t srclen,
69 : pg_locale_t locale);
70 : extern char *get_collation_actual_version_icu(const char *collcollate);
71 :
72 : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
73 : const UChar *src, int32_t srcLength,
74 : const char *locale,
75 : UErrorCode *pErrorCode);
76 :
77 : /*
78 : * Converter object for converting between ICU's UChar strings and C strings
79 : * in database encoding. Since the database encoding doesn't change, we only
80 : * need one of these per session.
81 : */
82 : static UConverter *icu_converter = NULL;
83 :
84 : static UCollator *make_icu_collator(const char *iculocstr,
85 : const char *icurules);
86 : static int strncoll_icu(const char *arg1, ssize_t len1,
87 : const char *arg2, ssize_t len2,
88 : pg_locale_t locale);
89 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
90 : const char *src, ssize_t srclen,
91 : pg_locale_t locale);
92 : #ifdef HAVE_UCOL_STRCOLLUTF8
93 : static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
94 : const char *arg2, ssize_t len2,
95 : pg_locale_t locale);
96 : #endif
97 : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
98 : const char *src, ssize_t srclen,
99 : pg_locale_t locale);
100 : static void init_icu_converter(void);
101 : static size_t uchar_length(UConverter *converter,
102 : const char *str, int32_t len);
103 : static int32_t uchar_convert(UConverter *converter,
104 : UChar *dest, int32_t destlen,
105 : const char *src, int32_t srclen);
106 : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
107 : size_t nbytes);
108 : static size_t icu_from_uchar(char *dest, size_t destsize,
109 : const UChar *buff_uchar, int32_t len_uchar);
110 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
111 : UErrorCode *status);
112 : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
113 : UChar **buff_dest, UChar *buff_source,
114 : int32_t len_source);
115 : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
116 : const UChar *src, int32_t srcLength,
117 : const char *locale,
118 : UErrorCode *pErrorCode);
119 : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
120 : const UChar *src, int32_t srcLength,
121 : const char *locale,
122 : UErrorCode *pErrorCode);
123 :
124 : static bool
125 126 : char_is_cased_icu(char ch, pg_locale_t locale)
126 : {
127 126 : return IS_HIGHBIT_SET(ch) ||
128 252 : (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
129 : }
130 :
131 : /*
132 : * XXX: many of the functions below rely on casts directly from pg_wchar to
133 : * UChar32, which is correct for the UTF-8 encoding, but not in general.
134 : */
135 :
136 : static pg_wchar
137 108 : toupper_icu(pg_wchar wc, pg_locale_t locale)
138 : {
139 108 : return u_toupper(wc);
140 : }
141 :
142 : static pg_wchar
143 108 : tolower_icu(pg_wchar wc, pg_locale_t locale)
144 : {
145 108 : return u_tolower(wc);
146 : }
147 :
148 : static const struct collate_methods collate_methods_icu = {
149 : .strncoll = strncoll_icu,
150 : .strnxfrm = strnxfrm_icu,
151 : .strnxfrm_prefix = strnxfrm_prefix_icu,
152 : .strxfrm_is_safe = true,
153 : };
154 :
155 : static const struct collate_methods collate_methods_icu_utf8 = {
156 : #ifdef HAVE_UCOL_STRCOLLUTF8
157 : .strncoll = strncoll_icu_utf8,
158 : #else
159 : .strncoll = strncoll_icu,
160 : #endif
161 : .strnxfrm = strnxfrm_icu,
162 : .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
163 : .strxfrm_is_safe = true,
164 : };
165 :
166 : static bool
167 12288 : wc_isdigit_icu(pg_wchar wc, pg_locale_t locale)
168 : {
169 12288 : return u_isdigit(wc);
170 : }
171 :
172 : static bool
173 12288 : wc_isalpha_icu(pg_wchar wc, pg_locale_t locale)
174 : {
175 12288 : return u_isalpha(wc);
176 : }
177 :
178 : static bool
179 12288 : wc_isalnum_icu(pg_wchar wc, pg_locale_t locale)
180 : {
181 12288 : return u_isalnum(wc);
182 : }
183 :
184 : static bool
185 12288 : wc_isupper_icu(pg_wchar wc, pg_locale_t locale)
186 : {
187 12288 : return u_isupper(wc);
188 : }
189 :
190 : static bool
191 12288 : wc_islower_icu(pg_wchar wc, pg_locale_t locale)
192 : {
193 12288 : return u_islower(wc);
194 : }
195 :
196 : static bool
197 12288 : wc_isgraph_icu(pg_wchar wc, pg_locale_t locale)
198 : {
199 12288 : return u_isgraph(wc);
200 : }
201 :
202 : static bool
203 12288 : wc_isprint_icu(pg_wchar wc, pg_locale_t locale)
204 : {
205 12288 : return u_isprint(wc);
206 : }
207 :
208 : static bool
209 12288 : wc_ispunct_icu(pg_wchar wc, pg_locale_t locale)
210 : {
211 12288 : return u_ispunct(wc);
212 : }
213 :
214 : static bool
215 12288 : wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
216 : {
217 12288 : return u_isspace(wc);
218 : }
219 :
220 : static bool
221 0 : wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
222 : {
223 0 : return u_isxdigit(wc);
224 : }
225 :
226 : static const struct ctype_methods ctype_methods_icu = {
227 : .strlower = strlower_icu,
228 : .strtitle = strtitle_icu,
229 : .strupper = strupper_icu,
230 : .strfold = strfold_icu,
231 : .wc_isdigit = wc_isdigit_icu,
232 : .wc_isalpha = wc_isalpha_icu,
233 : .wc_isalnum = wc_isalnum_icu,
234 : .wc_isupper = wc_isupper_icu,
235 : .wc_islower = wc_islower_icu,
236 : .wc_isgraph = wc_isgraph_icu,
237 : .wc_isprint = wc_isprint_icu,
238 : .wc_ispunct = wc_ispunct_icu,
239 : .wc_isspace = wc_isspace_icu,
240 : .wc_isxdigit = wc_isxdigit_icu,
241 : .char_is_cased = char_is_cased_icu,
242 : .wc_toupper = toupper_icu,
243 : .wc_tolower = tolower_icu,
244 : };
245 : #endif
246 :
247 : pg_locale_t
248 210 : create_pg_locale_icu(Oid collid, MemoryContext context)
249 : {
250 : #ifdef USE_ICU
251 : bool deterministic;
252 : const char *iculocstr;
253 210 : const char *icurules = NULL;
254 : UCollator *collator;
255 : pg_locale_t result;
256 :
257 210 : if (collid == DEFAULT_COLLATION_OID)
258 : {
259 : HeapTuple tp;
260 : Datum datum;
261 : bool isnull;
262 :
263 26 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
264 26 : if (!HeapTupleIsValid(tp))
265 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
266 :
267 : /* default database collation is always deterministic */
268 26 : deterministic = true;
269 26 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
270 : Anum_pg_database_datlocale);
271 26 : iculocstr = TextDatumGetCString(datum);
272 26 : datum = SysCacheGetAttr(DATABASEOID, tp,
273 : Anum_pg_database_daticurules, &isnull);
274 26 : if (!isnull)
275 0 : icurules = TextDatumGetCString(datum);
276 :
277 26 : ReleaseSysCache(tp);
278 : }
279 : else
280 : {
281 : Form_pg_collation collform;
282 : HeapTuple tp;
283 : Datum datum;
284 : bool isnull;
285 :
286 184 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
287 184 : if (!HeapTupleIsValid(tp))
288 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
289 184 : collform = (Form_pg_collation) GETSTRUCT(tp);
290 184 : deterministic = collform->collisdeterministic;
291 184 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
292 : Anum_pg_collation_colllocale);
293 184 : iculocstr = TextDatumGetCString(datum);
294 184 : datum = SysCacheGetAttr(COLLOID, tp,
295 : Anum_pg_collation_collicurules, &isnull);
296 184 : if (!isnull)
297 12 : icurules = TextDatumGetCString(datum);
298 :
299 184 : ReleaseSysCache(tp);
300 : }
301 :
302 210 : collator = make_icu_collator(iculocstr, icurules);
303 :
304 200 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
305 200 : result->icu.locale = MemoryContextStrdup(context, iculocstr);
306 200 : result->icu.ucol = collator;
307 200 : result->deterministic = deterministic;
308 200 : result->collate_is_c = false;
309 200 : result->ctype_is_c = false;
310 200 : if (GetDatabaseEncoding() == PG_UTF8)
311 200 : result->collate = &collate_methods_icu_utf8;
312 : else
313 0 : result->collate = &collate_methods_icu;
314 200 : result->ctype = &ctype_methods_icu;
315 :
316 200 : return result;
317 : #else
318 : /* could get here if a collation was created by a build with ICU */
319 : ereport(ERROR,
320 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
321 : errmsg("ICU is not supported in this build")));
322 :
323 : return NULL;
324 : #endif
325 : }
326 :
327 : #ifdef USE_ICU
328 :
329 : /*
330 : * Wrapper around ucol_open() to handle API differences for older ICU
331 : * versions.
332 : *
333 : * Ensure that no path leaks a UCollator.
334 : */
335 : UCollator *
336 78198 : pg_ucol_open(const char *loc_str)
337 : {
338 : UCollator *collator;
339 : UErrorCode status;
340 78198 : const char *orig_str = loc_str;
341 78198 : char *fixed_str = NULL;
342 :
343 : /*
344 : * Must never open default collator, because it depends on the environment
345 : * and may change at any time. Should not happen, but check here to catch
346 : * bugs that might be hard to catch otherwise.
347 : *
348 : * NB: the default collator is not the same as the collator for the root
349 : * locale. The root locale may be specified as the empty string, "und", or
350 : * "root". The default collator is opened by passing NULL to ucol_open().
351 : */
352 78198 : if (loc_str == NULL)
353 0 : elog(ERROR, "opening default collator is not supported");
354 :
355 : /*
356 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
357 : * the root locale. If the first component of the locale is "und", replace
358 : * with "root" before opening.
359 : */
360 : if (U_ICU_VERSION_MAJOR_NUM < 55)
361 : {
362 : char lang[ULOC_LANG_CAPACITY];
363 :
364 : status = U_ZERO_ERROR;
365 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
366 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
367 : {
368 : ereport(ERROR,
369 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
370 : errmsg("could not get language from locale \"%s\": %s",
371 : loc_str, u_errorName(status))));
372 : }
373 :
374 : if (strcmp(lang, "und") == 0)
375 : {
376 : const char *remainder = loc_str + strlen("und");
377 :
378 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
379 : strcpy(fixed_str, "root");
380 : strcat(fixed_str, remainder);
381 :
382 : loc_str = fixed_str;
383 : }
384 : }
385 :
386 78198 : status = U_ZERO_ERROR;
387 78198 : collator = ucol_open(loc_str, &status);
388 78198 : if (U_FAILURE(status))
389 12 : ereport(ERROR,
390 : /* use original string for error report */
391 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
392 : errmsg("could not open collator for locale \"%s\": %s",
393 : orig_str, u_errorName(status))));
394 :
395 : if (U_ICU_VERSION_MAJOR_NUM < 54)
396 : {
397 : status = U_ZERO_ERROR;
398 : icu_set_collation_attributes(collator, loc_str, &status);
399 :
400 : /*
401 : * Pretend the error came from ucol_open(), for consistent error
402 : * message across ICU versions.
403 : */
404 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
405 : {
406 : ucol_close(collator);
407 : ereport(ERROR,
408 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
409 : errmsg("could not open collator for locale \"%s\": %s",
410 : orig_str, u_errorName(status))));
411 : }
412 : }
413 :
414 78186 : if (fixed_str != NULL)
415 0 : pfree(fixed_str);
416 :
417 78186 : return collator;
418 : }
419 :
420 : /*
421 : * Create a UCollator with the given locale string and rules.
422 : *
423 : * Ensure that no path leaks a UCollator.
424 : */
425 : static UCollator *
426 210 : make_icu_collator(const char *iculocstr, const char *icurules)
427 : {
428 210 : if (!icurules)
429 : {
430 : /* simple case without rules */
431 198 : return pg_ucol_open(iculocstr);
432 : }
433 : else
434 : {
435 : UCollator *collator_std_rules;
436 : UCollator *collator_all_rules;
437 : const UChar *std_rules;
438 : UChar *my_rules;
439 : UChar *all_rules;
440 : int32_t length;
441 : int32_t total;
442 : UErrorCode status;
443 :
444 : /*
445 : * If rules are specified, we extract the rules of the standard
446 : * collation, add our own rules, and make a new collator with the
447 : * combined rules.
448 : */
449 12 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
450 :
451 12 : collator_std_rules = pg_ucol_open(iculocstr);
452 :
453 12 : std_rules = ucol_getRules(collator_std_rules, &length);
454 :
455 12 : total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
456 :
457 : /* avoid leaking collator on OOM */
458 12 : all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
459 12 : if (!all_rules)
460 : {
461 0 : ucol_close(collator_std_rules);
462 0 : ereport(ERROR,
463 : (errcode(ERRCODE_OUT_OF_MEMORY),
464 : errmsg("out of memory")));
465 : }
466 :
467 12 : u_strcpy(all_rules, std_rules);
468 12 : u_strcat(all_rules, my_rules);
469 :
470 12 : ucol_close(collator_std_rules);
471 :
472 12 : status = U_ZERO_ERROR;
473 12 : collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
474 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
475 : NULL, &status);
476 12 : if (U_FAILURE(status))
477 : {
478 6 : ereport(ERROR,
479 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
480 : errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
481 : iculocstr, icurules, u_errorName(status))));
482 : }
483 :
484 6 : return collator_all_rules;
485 : }
486 : }
487 :
488 : static size_t
489 528 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
490 : pg_locale_t locale)
491 : {
492 : int32_t len_uchar;
493 : int32_t len_conv;
494 : UChar *buff_uchar;
495 : UChar *buff_conv;
496 : size_t result_len;
497 :
498 528 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
499 528 : len_conv = icu_convert_case(u_strToLower, locale,
500 : &buff_conv, buff_uchar, len_uchar);
501 528 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
502 528 : pfree(buff_uchar);
503 528 : pfree(buff_conv);
504 :
505 528 : return result_len;
506 : }
507 :
508 : static size_t
509 30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
510 : pg_locale_t locale)
511 : {
512 : int32_t len_uchar;
513 : int32_t len_conv;
514 : UChar *buff_uchar;
515 : UChar *buff_conv;
516 : size_t result_len;
517 :
518 30 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
519 30 : len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
520 : &buff_conv, buff_uchar, len_uchar);
521 30 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
522 30 : pfree(buff_uchar);
523 30 : pfree(buff_conv);
524 :
525 30 : return result_len;
526 : }
527 :
528 : static size_t
529 54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
530 : pg_locale_t locale)
531 : {
532 : int32_t len_uchar;
533 : int32_t len_conv;
534 : UChar *buff_uchar;
535 : UChar *buff_conv;
536 : size_t result_len;
537 :
538 54 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
539 54 : len_conv = icu_convert_case(u_strToUpper, locale,
540 : &buff_conv, buff_uchar, len_uchar);
541 54 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
542 54 : pfree(buff_uchar);
543 54 : pfree(buff_conv);
544 :
545 54 : return result_len;
546 : }
547 :
548 : static size_t
549 12 : strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
550 : pg_locale_t locale)
551 : {
552 : int32_t len_uchar;
553 : int32_t len_conv;
554 : UChar *buff_uchar;
555 : UChar *buff_conv;
556 : size_t result_len;
557 :
558 12 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
559 12 : len_conv = icu_convert_case(u_strFoldCase_default, locale,
560 : &buff_conv, buff_uchar, len_uchar);
561 12 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
562 12 : pfree(buff_uchar);
563 12 : pfree(buff_conv);
564 :
565 12 : return result_len;
566 : }
567 :
/*
 * strncoll_icu_utf8
 *
 * Compare two strings with ucol_strcollUTF8(), without converting them to
 * UChar first.  Only valid when the database encoding is UTF-8 (asserted).
 * An argument length of -1 means the string is NUL-terminated.
 *
 * Raises an error if ICU reports a failure.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
				  pg_locale_t locale)
{
	UErrorCode	status = U_ZERO_ERROR;
	int			cmp;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	cmp = ucol_strcollUTF8(locale->icu.ucol,
						   arg1, len1,
						   arg2, len2,
						   &status);
	if (U_FAILURE(status))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(status))));

	return cmp;
}
#endif
597 :
598 : /* 'srclen' of -1 means the strings are NUL-terminated */
599 : size_t
600 5748 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
601 : pg_locale_t locale)
602 : {
603 : char sbuf[TEXTBUFLEN];
604 5748 : char *buf = sbuf;
605 : UChar *uchar;
606 : int32_t ulen;
607 : size_t uchar_bsize;
608 : Size result_bsize;
609 :
610 5748 : init_icu_converter();
611 :
612 5748 : ulen = uchar_length(icu_converter, src, srclen);
613 :
614 5748 : uchar_bsize = (ulen + 1) * sizeof(UChar);
615 :
616 5748 : if (uchar_bsize > TEXTBUFLEN)
617 0 : buf = palloc(uchar_bsize);
618 :
619 5748 : uchar = (UChar *) buf;
620 :
621 5748 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
622 :
623 5748 : result_bsize = ucol_getSortKey(locale->icu.ucol,
624 : uchar, ulen,
625 : (uint8_t *) dest, destsize);
626 :
627 : /*
628 : * ucol_getSortKey() counts the nul-terminator in the result length, but
629 : * this function should not.
630 : */
631 : Assert(result_bsize > 0);
632 5748 : result_bsize--;
633 :
634 5748 : if (buf != sbuf)
635 0 : pfree(buf);
636 :
637 : /* if dest is defined, it should be nul-terminated */
638 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
639 :
640 5748 : return result_bsize;
641 : }
642 :
643 : /* 'srclen' of -1 means the strings are NUL-terminated */
644 : size_t
645 1668 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
646 : const char *src, ssize_t srclen,
647 : pg_locale_t locale)
648 : {
649 : size_t result;
650 : UCharIterator iter;
651 : uint32_t state[2];
652 : UErrorCode status;
653 :
654 : Assert(GetDatabaseEncoding() == PG_UTF8);
655 :
656 1668 : uiter_setUTF8(&iter, src, srclen);
657 1668 : state[0] = state[1] = 0; /* won't need that again */
658 1668 : status = U_ZERO_ERROR;
659 1668 : result = ucol_nextSortKeyPart(locale->icu.ucol,
660 : &iter,
661 : state,
662 : (uint8_t *) dest,
663 : destsize,
664 : &status);
665 1668 : if (U_FAILURE(status))
666 0 : ereport(ERROR,
667 : (errmsg("sort key generation failed: %s",
668 : u_errorName(status))));
669 :
670 1668 : return result;
671 : }
672 :
673 : char *
674 77840 : get_collation_actual_version_icu(const char *collcollate)
675 : {
676 : UCollator *collator;
677 : UVersionInfo versioninfo;
678 : char buf[U_MAX_VERSION_STRING_LENGTH];
679 :
680 77840 : collator = pg_ucol_open(collcollate);
681 :
682 77840 : ucol_getVersion(collator, versioninfo);
683 77840 : ucol_close(collator);
684 :
685 77840 : u_versionToString(versioninfo, buf);
686 77840 : return pstrdup(buf);
687 : }
688 :
689 : /*
690 : * Convert a string in the database encoding into a string of UChars.
691 : *
692 : * The source string at buff is of length nbytes
693 : * (it needn't be nul-terminated)
694 : *
695 : * *buff_uchar receives a pointer to the palloc'd result string, and
696 : * the function's result is the number of UChars generated.
697 : *
698 : * The result string is nul-terminated, though most callers rely on the
699 : * result length instead.
700 : */
701 : static int32_t
702 636 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
703 : {
704 : int32_t len_uchar;
705 :
706 636 : init_icu_converter();
707 :
708 636 : len_uchar = uchar_length(icu_converter, buff, nbytes);
709 :
710 636 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
711 636 : len_uchar = uchar_convert(icu_converter,
712 : *buff_uchar, len_uchar + 1, buff, nbytes);
713 :
714 636 : return len_uchar;
715 : }
716 :
717 : /*
718 : * Convert a string of UChars into the database encoding.
719 : *
720 : * The source string at buff_uchar is of length len_uchar
721 : * (it needn't be nul-terminated)
722 : *
723 : * *result receives a pointer to the palloc'd result string, and the
724 : * function's result is the number of bytes generated (not counting nul).
725 : *
726 : * The result string is nul-terminated.
727 : */
728 : static size_t
729 624 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
730 : {
731 : UErrorCode status;
732 : int32_t len_result;
733 :
734 624 : init_icu_converter();
735 :
736 624 : status = U_ZERO_ERROR;
737 624 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
738 : buff_uchar, len_uchar, &status);
739 624 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
740 0 : ereport(ERROR,
741 : (errmsg("%s failed: %s", "ucnv_fromUChars",
742 : u_errorName(status))));
743 :
744 624 : if (len_result + 1 > destsize)
745 60 : return len_result;
746 :
747 564 : status = U_ZERO_ERROR;
748 564 : len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
749 : buff_uchar, len_uchar, &status);
750 564 : if (U_FAILURE(status) ||
751 564 : status == U_STRING_NOT_TERMINATED_WARNING)
752 0 : ereport(ERROR,
753 : (errmsg("%s failed: %s", "ucnv_fromUChars",
754 : u_errorName(status))));
755 :
756 564 : return len_result;
757 : }
758 :
759 : static int32_t
760 624 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
761 : UChar **buff_dest, UChar *buff_source, int32_t len_source)
762 : {
763 : UErrorCode status;
764 : int32_t len_dest;
765 :
766 624 : len_dest = len_source; /* try first with same length */
767 624 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
768 624 : status = U_ZERO_ERROR;
769 624 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
770 : mylocale->icu.locale, &status);
771 624 : if (status == U_BUFFER_OVERFLOW_ERROR)
772 : {
773 : /* try again with adjusted length */
774 18 : pfree(*buff_dest);
775 18 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
776 18 : status = U_ZERO_ERROR;
777 18 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
778 : mylocale->icu.locale, &status);
779 : }
780 624 : if (U_FAILURE(status))
781 0 : ereport(ERROR,
782 : (errmsg("case conversion failed: %s", u_errorName(status))));
783 624 : return len_dest;
784 : }
785 :
786 : static int32_t
787 30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
788 : const UChar *src, int32_t srcLength,
789 : const char *locale,
790 : UErrorCode *pErrorCode)
791 : {
792 30 : return u_strToTitle(dest, destCapacity, src, srcLength,
793 : NULL, locale, pErrorCode);
794 : }
795 :
796 : static int32_t
797 24 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
798 : const UChar *src, int32_t srcLength,
799 : const char *locale,
800 : UErrorCode *pErrorCode)
801 : {
802 24 : uint32 options = U_FOLD_CASE_DEFAULT;
803 : char lang[3];
804 : UErrorCode status;
805 :
806 : /*
807 : * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
808 : * folding does not accept a locale. Instead it just supports a single
809 : * option relevant to Turkic languages 'az' and 'tr'; check for those
810 : * languages to enable the option.
811 : */
812 24 : status = U_ZERO_ERROR;
813 24 : uloc_getLanguage(locale, lang, 3, &status);
814 24 : if (U_SUCCESS(status))
815 : {
816 : /*
817 : * The option name is confusing, but it causes u_strFoldCase to use
818 : * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
819 : */
820 24 : if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
821 12 : options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
822 : }
823 :
824 24 : return u_strFoldCase(dest, destCapacity, src, srcLength,
825 : options, pErrorCode);
826 : }
827 :
828 : /*
829 : * strncoll_icu
830 : *
831 : * Convert the arguments from the database encoding to UChar strings, then
832 : * call ucol_strcoll(). An argument length of -1 means that the string is
833 : * NUL-terminated.
834 : *
835 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
836 : * caller should call that instead.
837 : */
838 : static int
839 0 : strncoll_icu(const char *arg1, ssize_t len1,
840 : const char *arg2, ssize_t len2, pg_locale_t locale)
841 : {
842 : char sbuf[TEXTBUFLEN];
843 0 : char *buf = sbuf;
844 : int32_t ulen1;
845 : int32_t ulen2;
846 : size_t bufsize1;
847 : size_t bufsize2;
848 : UChar *uchar1,
849 : *uchar2;
850 : int result;
851 :
852 : /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
853 : #ifdef HAVE_UCOL_STRCOLLUTF8
854 : Assert(GetDatabaseEncoding() != PG_UTF8);
855 : #endif
856 :
857 0 : init_icu_converter();
858 :
859 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
860 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
861 :
862 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
863 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
864 :
865 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
866 0 : buf = palloc(bufsize1 + bufsize2);
867 :
868 0 : uchar1 = (UChar *) buf;
869 0 : uchar2 = (UChar *) (buf + bufsize1);
870 :
871 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
872 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
873 :
874 0 : result = ucol_strcoll(locale->icu.ucol,
875 : uchar1, ulen1,
876 : uchar2, ulen2);
877 :
878 0 : if (buf != sbuf)
879 0 : pfree(buf);
880 :
881 0 : return result;
882 : }
883 :
884 : /* 'srclen' of -1 means the strings are NUL-terminated */
885 : static size_t
886 0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
887 : const char *src, ssize_t srclen,
888 : pg_locale_t locale)
889 : {
890 : char sbuf[TEXTBUFLEN];
891 0 : char *buf = sbuf;
892 : UCharIterator iter;
893 : uint32_t state[2];
894 : UErrorCode status;
895 0 : int32_t ulen = -1;
896 0 : UChar *uchar = NULL;
897 : size_t uchar_bsize;
898 : Size result_bsize;
899 :
900 : /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
901 : Assert(GetDatabaseEncoding() != PG_UTF8);
902 :
903 0 : init_icu_converter();
904 :
905 0 : ulen = uchar_length(icu_converter, src, srclen);
906 :
907 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
908 :
909 0 : if (uchar_bsize > TEXTBUFLEN)
910 0 : buf = palloc(uchar_bsize);
911 :
912 0 : uchar = (UChar *) buf;
913 :
914 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
915 :
916 0 : uiter_setString(&iter, uchar, ulen);
917 0 : state[0] = state[1] = 0; /* won't need that again */
918 0 : status = U_ZERO_ERROR;
919 0 : result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
920 : &iter,
921 : state,
922 : (uint8_t *) dest,
923 : destsize,
924 : &status);
925 0 : if (U_FAILURE(status))
926 0 : ereport(ERROR,
927 : (errmsg("sort key generation failed: %s",
928 : u_errorName(status))));
929 :
930 0 : return result_bsize;
931 : }
932 :
933 : static void
934 7008 : init_icu_converter(void)
935 : {
936 : const char *icu_encoding_name;
937 : UErrorCode status;
938 : UConverter *conv;
939 :
940 7008 : if (icu_converter)
941 7002 : return; /* already done */
942 :
943 6 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
944 6 : if (!icu_encoding_name)
945 0 : ereport(ERROR,
946 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
947 : errmsg("encoding \"%s\" not supported by ICU",
948 : pg_encoding_to_char(GetDatabaseEncoding()))));
949 :
950 6 : status = U_ZERO_ERROR;
951 6 : conv = ucnv_open(icu_encoding_name, &status);
952 6 : if (U_FAILURE(status))
953 0 : ereport(ERROR,
954 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
955 : icu_encoding_name, u_errorName(status))));
956 :
957 6 : icu_converter = conv;
958 : }
959 :
960 : /*
961 : * Find length, in UChars, of given string if converted to UChar string.
962 : *
963 : * A length of -1 indicates that the input string is NUL-terminated.
964 : */
965 : static size_t
966 6384 : uchar_length(UConverter *converter, const char *str, int32_t len)
967 : {
968 6384 : UErrorCode status = U_ZERO_ERROR;
969 : int32_t ulen;
970 :
971 6384 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
972 6384 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
973 0 : ereport(ERROR,
974 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
975 6384 : return ulen;
976 : }
977 :
978 : /*
979 : * Convert the given source string into a UChar string, stored in dest, and
980 : * return the length (in UChars).
981 : *
982 : * A srclen of -1 indicates that the input string is NUL-terminated.
983 : */
984 : static int32_t
985 6384 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
986 : const char *src, int32_t srclen)
987 : {
988 6384 : UErrorCode status = U_ZERO_ERROR;
989 : int32_t ulen;
990 :
991 6384 : status = U_ZERO_ERROR;
992 6384 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
993 6384 : if (U_FAILURE(status))
994 0 : ereport(ERROR,
995 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
996 6384 : return ulen;
997 : }
998 :
999 : /*
1000 : * Parse collation attributes from the given locale string and apply them to
1001 : * the open collator.
1002 : *
1003 : * First, the locale string is canonicalized to an ICU format locale ID such
1004 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1005 : * the key-value arguments.
1006 : *
1007 : * Starting with ICU version 54, the attributes are processed automatically by
1008 : * ucol_open(), so this is only necessary for emulating this behavior on older
1009 : * versions.
1010 : */
1011 : pg_attribute_unused()
1012 : static void
1013 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
1014 : UErrorCode *status)
1015 : {
1016 : int32_t len;
1017 : char *icu_locale_id;
1018 : char *lower_str;
1019 : char *str;
1020 : char *token;
1021 :
1022 : /*
1023 : * The input locale may be a BCP 47 language tag, e.g.
1024 : * "und-u-kc-ks-level1", which expresses the same attributes in a
1025 : * different form. It will be converted to the equivalent ICU format
1026 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1027 : * uloc_canonicalize().
1028 : */
1029 0 : *status = U_ZERO_ERROR;
1030 0 : len = uloc_canonicalize(loc, NULL, 0, status);
1031 0 : icu_locale_id = palloc(len + 1);
1032 0 : *status = U_ZERO_ERROR;
1033 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1034 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1035 0 : return;
1036 :
1037 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
1038 :
1039 0 : pfree(icu_locale_id);
1040 :
1041 0 : str = strchr(lower_str, '@');
1042 0 : if (!str)
1043 0 : return;
1044 0 : str++;
1045 :
1046 0 : while ((token = strsep(&str, ";")))
1047 : {
1048 0 : char *e = strchr(token, '=');
1049 :
1050 0 : if (e)
1051 : {
1052 : char *name;
1053 : char *value;
1054 : UColAttribute uattr;
1055 : UColAttributeValue uvalue;
1056 :
1057 0 : *status = U_ZERO_ERROR;
1058 :
1059 0 : *e = '\0';
1060 0 : name = token;
1061 0 : value = e + 1;
1062 :
1063 : /*
1064 : * See attribute name and value lists in ICU i18n/coll.cpp
1065 : */
1066 0 : if (strcmp(name, "colstrength") == 0)
1067 0 : uattr = UCOL_STRENGTH;
1068 0 : else if (strcmp(name, "colbackwards") == 0)
1069 0 : uattr = UCOL_FRENCH_COLLATION;
1070 0 : else if (strcmp(name, "colcaselevel") == 0)
1071 0 : uattr = UCOL_CASE_LEVEL;
1072 0 : else if (strcmp(name, "colcasefirst") == 0)
1073 0 : uattr = UCOL_CASE_FIRST;
1074 0 : else if (strcmp(name, "colalternate") == 0)
1075 0 : uattr = UCOL_ALTERNATE_HANDLING;
1076 0 : else if (strcmp(name, "colnormalization") == 0)
1077 0 : uattr = UCOL_NORMALIZATION_MODE;
1078 0 : else if (strcmp(name, "colnumeric") == 0)
1079 0 : uattr = UCOL_NUMERIC_COLLATION;
1080 : else
1081 : /* ignore if unknown */
1082 0 : continue;
1083 :
1084 0 : if (strcmp(value, "primary") == 0)
1085 0 : uvalue = UCOL_PRIMARY;
1086 0 : else if (strcmp(value, "secondary") == 0)
1087 0 : uvalue = UCOL_SECONDARY;
1088 0 : else if (strcmp(value, "tertiary") == 0)
1089 0 : uvalue = UCOL_TERTIARY;
1090 0 : else if (strcmp(value, "quaternary") == 0)
1091 0 : uvalue = UCOL_QUATERNARY;
1092 0 : else if (strcmp(value, "identical") == 0)
1093 0 : uvalue = UCOL_IDENTICAL;
1094 0 : else if (strcmp(value, "no") == 0)
1095 0 : uvalue = UCOL_OFF;
1096 0 : else if (strcmp(value, "yes") == 0)
1097 0 : uvalue = UCOL_ON;
1098 0 : else if (strcmp(value, "shifted") == 0)
1099 0 : uvalue = UCOL_SHIFTED;
1100 0 : else if (strcmp(value, "non-ignorable") == 0)
1101 0 : uvalue = UCOL_NON_IGNORABLE;
1102 0 : else if (strcmp(value, "lower") == 0)
1103 0 : uvalue = UCOL_LOWER_FIRST;
1104 0 : else if (strcmp(value, "upper") == 0)
1105 0 : uvalue = UCOL_UPPER_FIRST;
1106 : else
1107 : {
1108 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
1109 0 : break;
1110 : }
1111 :
1112 0 : ucol_setAttribute(collator, uattr, uvalue, status);
1113 : }
1114 : }
1115 :
1116 0 : pfree(lower_str);
1117 : }
1118 :
1119 : #endif /* USE_ICU */
|