Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities for ICU
4 : *
5 : * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale_icu.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres.h"
13 :
14 : #ifdef USE_ICU
15 : #include <unicode/ucnv.h>
16 : #include <unicode/ustring.h>
17 :
18 : /*
19 : * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
20 : * (see
21 : * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
22 : */
23 : #if U_ICU_VERSION_MAJOR_NUM >= 53
24 : #define HAVE_UCOL_STRCOLLUTF8 1
25 : #else
26 : #undef HAVE_UCOL_STRCOLLUTF8
27 : #endif
28 :
29 : #endif
30 :
31 : #include "access/htup_details.h"
32 : #include "catalog/pg_database.h"
33 : #include "catalog/pg_collation.h"
34 : #include "mb/pg_wchar.h"
35 : #include "miscadmin.h"
36 : #include "utils/builtins.h"
37 : #include "utils/formatting.h"
38 : #include "utils/memutils.h"
39 : #include "utils/pg_locale.h"
40 : #include "utils/syscache.h"
41 :
42 : /*
43 : * Size of stack buffer to use for string transformations, used to avoid heap
44 : * allocations in typical cases. This should be large enough that most strings
45 : * will fit, but small enough that we feel comfortable putting it on the
46 : * stack.
47 : */
48 : #define TEXTBUFLEN 1024
49 :
50 : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
51 :
52 : #ifdef USE_ICU
53 :
54 : extern UCollator *pg_ucol_open(const char *loc_str);
55 :
56 : static size_t strlower_icu(char *dest, size_t destsize, const char *src,
57 : ssize_t srclen, pg_locale_t locale);
58 : static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
59 : ssize_t srclen, pg_locale_t locale);
60 : static size_t strupper_icu(char *dest, size_t destsize, const char *src,
61 : ssize_t srclen, pg_locale_t locale);
62 : static size_t strfold_icu(char *dest, size_t destsize, const char *src,
63 : ssize_t srclen, pg_locale_t locale);
64 : static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
65 : ssize_t srclen, pg_locale_t locale);
66 : static int strncoll_icu(const char *arg1, ssize_t len1,
67 : const char *arg2, ssize_t len2,
68 : pg_locale_t locale);
69 : static size_t strnxfrm_icu(char *dest, size_t destsize,
70 : const char *src, ssize_t srclen,
71 : pg_locale_t locale);
72 : extern char *get_collation_actual_version_icu(const char *collcollate);
73 :
74 : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
75 : const UChar *src, int32_t srcLength,
76 : const char *locale,
77 : UErrorCode *pErrorCode);
78 :
79 : /*
80 : * Converter object for converting between ICU's UChar strings and C strings
81 : * in database encoding. Since the database encoding doesn't change, we only
82 : * need one of these per session.
83 : */
84 : static UConverter *icu_converter = NULL;
85 :
86 : static UCollator *make_icu_collator(const char *iculocstr,
87 : const char *icurules);
88 : static int strncoll_icu(const char *arg1, ssize_t len1,
89 : const char *arg2, ssize_t len2,
90 : pg_locale_t locale);
91 : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
92 : const char *src, ssize_t srclen,
93 : pg_locale_t locale);
94 : #ifdef HAVE_UCOL_STRCOLLUTF8
95 : static int strncoll_icu_utf8(const char *arg1, ssize_t len1,
96 : const char *arg2, ssize_t len2,
97 : pg_locale_t locale);
98 : #endif
99 : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
100 : const char *src, ssize_t srclen,
101 : pg_locale_t locale);
102 : static void init_icu_converter(void);
103 : static size_t uchar_length(UConverter *converter,
104 : const char *str, int32_t len);
105 : static int32_t uchar_convert(UConverter *converter,
106 : UChar *dest, int32_t destlen,
107 : const char *src, int32_t srclen);
108 : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
109 : size_t nbytes);
110 : static size_t icu_from_uchar(char *dest, size_t destsize,
111 : const UChar *buff_uchar, int32_t len_uchar);
112 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
113 : UErrorCode *status);
114 : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
115 : UChar **buff_dest, UChar *buff_source,
116 : int32_t len_source);
117 : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
118 : const UChar *src, int32_t srcLength,
119 : const char *locale,
120 : UErrorCode *pErrorCode);
121 : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
122 : const UChar *src, int32_t srcLength,
123 : const char *locale,
124 : UErrorCode *pErrorCode);
125 :
126 : /*
127 : * XXX: many of the functions below rely on casts directly from pg_wchar to
128 : * UChar32, which is correct for UTF-8 and LATIN1, but not in general.
129 : */
130 :
131 : static pg_wchar
132 108 : toupper_icu(pg_wchar wc, pg_locale_t locale)
133 : {
134 108 : return u_toupper(wc);
135 : }
136 :
137 : static pg_wchar
138 108 : tolower_icu(pg_wchar wc, pg_locale_t locale)
139 : {
140 108 : return u_tolower(wc);
141 : }
142 :
143 : static const struct collate_methods collate_methods_icu = {
144 : .strncoll = strncoll_icu,
145 : .strnxfrm = strnxfrm_icu,
146 : .strnxfrm_prefix = strnxfrm_prefix_icu,
147 : .strxfrm_is_safe = true,
148 : };
149 :
150 : static const struct collate_methods collate_methods_icu_utf8 = {
151 : #ifdef HAVE_UCOL_STRCOLLUTF8
152 : .strncoll = strncoll_icu_utf8,
153 : #else
154 : .strncoll = strncoll_icu,
155 : #endif
156 : .strnxfrm = strnxfrm_icu,
157 : .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
158 : .strxfrm_is_safe = true,
159 : };
160 :
161 : static bool
162 12288 : wc_isdigit_icu(pg_wchar wc, pg_locale_t locale)
163 : {
164 12288 : return u_isdigit(wc);
165 : }
166 :
167 : static bool
168 12288 : wc_isalpha_icu(pg_wchar wc, pg_locale_t locale)
169 : {
170 12288 : return u_isalpha(wc);
171 : }
172 :
173 : static bool
174 12288 : wc_isalnum_icu(pg_wchar wc, pg_locale_t locale)
175 : {
176 12288 : return u_isalnum(wc);
177 : }
178 :
179 : static bool
180 12288 : wc_isupper_icu(pg_wchar wc, pg_locale_t locale)
181 : {
182 12288 : return u_isupper(wc);
183 : }
184 :
185 : static bool
186 12288 : wc_islower_icu(pg_wchar wc, pg_locale_t locale)
187 : {
188 12288 : return u_islower(wc);
189 : }
190 :
191 : static bool
192 12288 : wc_isgraph_icu(pg_wchar wc, pg_locale_t locale)
193 : {
194 12288 : return u_isgraph(wc);
195 : }
196 :
197 : static bool
198 12288 : wc_isprint_icu(pg_wchar wc, pg_locale_t locale)
199 : {
200 12288 : return u_isprint(wc);
201 : }
202 :
203 : static bool
204 12288 : wc_ispunct_icu(pg_wchar wc, pg_locale_t locale)
205 : {
206 12288 : return u_ispunct(wc);
207 : }
208 :
209 : static bool
210 12288 : wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
211 : {
212 12288 : return u_isspace(wc);
213 : }
214 :
215 : static bool
216 0 : wc_isxdigit_icu(pg_wchar wc, pg_locale_t locale)
217 : {
218 0 : return u_isxdigit(wc);
219 : }
220 :
221 : static bool
222 126 : wc_iscased_icu(pg_wchar wc, pg_locale_t locale)
223 : {
224 126 : return u_hasBinaryProperty(wc, UCHAR_CASED);
225 : }
226 :
227 : static const struct ctype_methods ctype_methods_icu = {
228 : .strlower = strlower_icu,
229 : .strtitle = strtitle_icu,
230 : .strupper = strupper_icu,
231 : .strfold = strfold_icu,
232 : .downcase_ident = downcase_ident_icu,
233 : .wc_isdigit = wc_isdigit_icu,
234 : .wc_isalpha = wc_isalpha_icu,
235 : .wc_isalnum = wc_isalnum_icu,
236 : .wc_isupper = wc_isupper_icu,
237 : .wc_islower = wc_islower_icu,
238 : .wc_isgraph = wc_isgraph_icu,
239 : .wc_isprint = wc_isprint_icu,
240 : .wc_ispunct = wc_ispunct_icu,
241 : .wc_isspace = wc_isspace_icu,
242 : .wc_isxdigit = wc_isxdigit_icu,
243 : .wc_iscased = wc_iscased_icu,
244 : .wc_toupper = toupper_icu,
245 : .wc_tolower = tolower_icu,
246 : };
247 :
248 : /*
249 : * ICU still depends on libc for compatibility with certain historical
250 : * behavior for single-byte encodings. See downcase_ident_icu().
251 : *
252 : * XXX: consider fixing by decoding the single byte into a code point, and
253 : * using u_tolower().
254 : */
255 : static locale_t
256 0 : make_libc_ctype_locale(const char *ctype)
257 : {
258 : locale_t loc;
259 :
260 : #ifndef WIN32
261 0 : loc = newlocale(LC_CTYPE_MASK, ctype, NULL);
262 : #else
263 : loc = _create_locale(LC_ALL, ctype);
264 : #endif
265 0 : if (!loc)
266 0 : report_newlocale_failure(ctype);
267 :
268 0 : return loc;
269 : }
270 : #endif
271 :
272 : pg_locale_t
273 210 : create_pg_locale_icu(Oid collid, MemoryContext context)
274 : {
275 : #ifdef USE_ICU
276 : bool deterministic;
277 : const char *iculocstr;
278 210 : const char *icurules = NULL;
279 : UCollator *collator;
280 210 : locale_t loc = (locale_t) 0;
281 : pg_locale_t result;
282 :
283 210 : if (collid == DEFAULT_COLLATION_OID)
284 : {
285 : HeapTuple tp;
286 : Datum datum;
287 : bool isnull;
288 :
289 26 : tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
290 26 : if (!HeapTupleIsValid(tp))
291 0 : elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
292 :
293 : /* default database collation is always deterministic */
294 26 : deterministic = true;
295 26 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
296 : Anum_pg_database_datlocale);
297 26 : iculocstr = TextDatumGetCString(datum);
298 26 : datum = SysCacheGetAttr(DATABASEOID, tp,
299 : Anum_pg_database_daticurules, &isnull);
300 26 : if (!isnull)
301 0 : icurules = TextDatumGetCString(datum);
302 :
303 : /* libc only needed for default locale and single-byte encoding */
304 26 : if (pg_database_encoding_max_length() == 1)
305 : {
306 : const char *ctype;
307 :
308 0 : datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
309 : Anum_pg_database_datctype);
310 0 : ctype = TextDatumGetCString(datum);
311 :
312 0 : loc = make_libc_ctype_locale(ctype);
313 : }
314 :
315 26 : ReleaseSysCache(tp);
316 : }
317 : else
318 : {
319 : Form_pg_collation collform;
320 : HeapTuple tp;
321 : Datum datum;
322 : bool isnull;
323 :
324 184 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
325 184 : if (!HeapTupleIsValid(tp))
326 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
327 184 : collform = (Form_pg_collation) GETSTRUCT(tp);
328 184 : deterministic = collform->collisdeterministic;
329 184 : datum = SysCacheGetAttrNotNull(COLLOID, tp,
330 : Anum_pg_collation_colllocale);
331 184 : iculocstr = TextDatumGetCString(datum);
332 184 : datum = SysCacheGetAttr(COLLOID, tp,
333 : Anum_pg_collation_collicurules, &isnull);
334 184 : if (!isnull)
335 12 : icurules = TextDatumGetCString(datum);
336 :
337 184 : ReleaseSysCache(tp);
338 : }
339 :
340 210 : collator = make_icu_collator(iculocstr, icurules);
341 :
342 200 : result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
343 200 : result->icu.locale = MemoryContextStrdup(context, iculocstr);
344 200 : result->icu.ucol = collator;
345 200 : result->icu.lt = loc;
346 200 : result->deterministic = deterministic;
347 200 : result->collate_is_c = false;
348 200 : result->ctype_is_c = false;
349 200 : if (GetDatabaseEncoding() == PG_UTF8)
350 200 : result->collate = &collate_methods_icu_utf8;
351 : else
352 0 : result->collate = &collate_methods_icu;
353 200 : result->ctype = &ctype_methods_icu;
354 :
355 200 : return result;
356 : #else
357 : /* could get here if a collation was created by a build with ICU */
358 : ereport(ERROR,
359 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
360 : errmsg("ICU is not supported in this build")));
361 :
362 : return NULL;
363 : #endif
364 : }
365 :
366 : #ifdef USE_ICU
367 :
368 : /*
369 : * Wrapper around ucol_open() to handle API differences for older ICU
370 : * versions.
371 : *
372 : * Ensure that no path leaks a UCollator.
373 : */
374 : UCollator *
375 79812 : pg_ucol_open(const char *loc_str)
376 : {
377 : UCollator *collator;
378 : UErrorCode status;
379 79812 : const char *orig_str = loc_str;
380 79812 : char *fixed_str = NULL;
381 :
382 : /*
383 : * Must never open default collator, because it depends on the environment
384 : * and may change at any time. Should not happen, but check here to catch
385 : * bugs that might be hard to catch otherwise.
386 : *
387 : * NB: the default collator is not the same as the collator for the root
388 : * locale. The root locale may be specified as the empty string, "und", or
389 : * "root". The default collator is opened by passing NULL to ucol_open().
390 : */
391 79812 : if (loc_str == NULL)
392 0 : elog(ERROR, "opening default collator is not supported");
393 :
394 : /*
395 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
396 : * the root locale. If the first component of the locale is "und", replace
397 : * with "root" before opening.
398 : */
399 : if (U_ICU_VERSION_MAJOR_NUM < 55)
400 : {
401 : char lang[ULOC_LANG_CAPACITY];
402 :
403 : status = U_ZERO_ERROR;
404 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
405 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
406 : {
407 : ereport(ERROR,
408 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
409 : errmsg("could not get language from locale \"%s\": %s",
410 : loc_str, u_errorName(status))));
411 : }
412 :
413 : if (strcmp(lang, "und") == 0)
414 : {
415 : const char *remainder = loc_str + strlen("und");
416 :
417 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
418 : strcpy(fixed_str, "root");
419 : strcat(fixed_str, remainder);
420 :
421 : loc_str = fixed_str;
422 : }
423 : }
424 :
425 79812 : status = U_ZERO_ERROR;
426 79812 : collator = ucol_open(loc_str, &status);
427 79812 : if (U_FAILURE(status))
428 12 : ereport(ERROR,
429 : /* use original string for error report */
430 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
431 : errmsg("could not open collator for locale \"%s\": %s",
432 : orig_str, u_errorName(status))));
433 :
434 : if (U_ICU_VERSION_MAJOR_NUM < 54)
435 : {
436 : status = U_ZERO_ERROR;
437 : icu_set_collation_attributes(collator, loc_str, &status);
438 :
439 : /*
440 : * Pretend the error came from ucol_open(), for consistent error
441 : * message across ICU versions.
442 : */
443 : if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
444 : {
445 : ucol_close(collator);
446 : ereport(ERROR,
447 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
448 : errmsg("could not open collator for locale \"%s\": %s",
449 : orig_str, u_errorName(status))));
450 : }
451 : }
452 :
453 79800 : if (fixed_str != NULL)
454 0 : pfree(fixed_str);
455 :
456 79800 : return collator;
457 : }
458 :
459 : /*
460 : * Create a UCollator with the given locale string and rules.
461 : *
462 : * Ensure that no path leaks a UCollator.
463 : */
464 : static UCollator *
465 210 : make_icu_collator(const char *iculocstr, const char *icurules)
466 : {
467 210 : if (!icurules)
468 : {
469 : /* simple case without rules */
470 198 : return pg_ucol_open(iculocstr);
471 : }
472 : else
473 : {
474 : UCollator *collator_std_rules;
475 : UCollator *collator_all_rules;
476 : const UChar *std_rules;
477 : UChar *my_rules;
478 : UChar *all_rules;
479 : int32_t length;
480 : int32_t total;
481 : UErrorCode status;
482 :
483 : /*
484 : * If rules are specified, we extract the rules of the standard
485 : * collation, add our own rules, and make a new collator with the
486 : * combined rules.
487 : */
488 12 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
489 :
490 12 : collator_std_rules = pg_ucol_open(iculocstr);
491 :
492 12 : std_rules = ucol_getRules(collator_std_rules, &length);
493 :
494 12 : total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
495 :
496 : /* avoid leaking collator on OOM */
497 12 : all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
498 12 : if (!all_rules)
499 : {
500 0 : ucol_close(collator_std_rules);
501 0 : ereport(ERROR,
502 : (errcode(ERRCODE_OUT_OF_MEMORY),
503 : errmsg("out of memory")));
504 : }
505 :
506 12 : u_strcpy(all_rules, std_rules);
507 12 : u_strcat(all_rules, my_rules);
508 :
509 12 : ucol_close(collator_std_rules);
510 :
511 12 : status = U_ZERO_ERROR;
512 12 : collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
513 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
514 : NULL, &status);
515 12 : if (U_FAILURE(status))
516 : {
517 6 : ereport(ERROR,
518 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
519 : errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
520 : iculocstr, icurules, u_errorName(status))));
521 : }
522 :
523 6 : return collator_all_rules;
524 : }
525 : }
526 :
527 : static size_t
528 528 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
529 : pg_locale_t locale)
530 : {
531 : int32_t len_uchar;
532 : int32_t len_conv;
533 : UChar *buff_uchar;
534 : UChar *buff_conv;
535 : size_t result_len;
536 :
537 528 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
538 528 : len_conv = icu_convert_case(u_strToLower, locale,
539 : &buff_conv, buff_uchar, len_uchar);
540 528 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
541 528 : pfree(buff_uchar);
542 528 : pfree(buff_conv);
543 :
544 528 : return result_len;
545 : }
546 :
547 : static size_t
548 30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
549 : pg_locale_t locale)
550 : {
551 : int32_t len_uchar;
552 : int32_t len_conv;
553 : UChar *buff_uchar;
554 : UChar *buff_conv;
555 : size_t result_len;
556 :
557 30 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
558 30 : len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
559 : &buff_conv, buff_uchar, len_uchar);
560 30 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
561 30 : pfree(buff_uchar);
562 30 : pfree(buff_conv);
563 :
564 30 : return result_len;
565 : }
566 :
567 : static size_t
568 54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
569 : pg_locale_t locale)
570 : {
571 : int32_t len_uchar;
572 : int32_t len_conv;
573 : UChar *buff_uchar;
574 : UChar *buff_conv;
575 : size_t result_len;
576 :
577 54 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
578 54 : len_conv = icu_convert_case(u_strToUpper, locale,
579 : &buff_conv, buff_uchar, len_uchar);
580 54 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
581 54 : pfree(buff_uchar);
582 54 : pfree(buff_conv);
583 :
584 54 : return result_len;
585 : }
586 :
587 : static size_t
588 12 : strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
589 : pg_locale_t locale)
590 : {
591 : int32_t len_uchar;
592 : int32_t len_conv;
593 : UChar *buff_uchar;
594 : UChar *buff_conv;
595 : size_t result_len;
596 :
597 12 : len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
598 12 : len_conv = icu_convert_case(u_strFoldCase_default, locale,
599 : &buff_conv, buff_uchar, len_uchar);
600 12 : result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
601 12 : pfree(buff_uchar);
602 12 : pfree(buff_conv);
603 :
604 12 : return result_len;
605 : }
606 :
607 : /*
608 : * For historical compatibility, behavior is not multibyte-aware.
609 : *
610 : * NB: uses libc tolower() for single-byte encodings (also for historical
611 : * compatibility), and therefore relies on the global LC_CTYPE setting.
612 : */
613 : static size_t
614 64968 : downcase_ident_icu(char *dst, size_t dstsize, const char *src,
615 : ssize_t srclen, pg_locale_t locale)
616 : {
617 : int i;
618 : bool libc_lower;
619 64968 : locale_t lt = locale->icu.lt;
620 :
621 64968 : libc_lower = lt && (pg_database_encoding_max_length() == 1);
622 :
623 612630 : for (i = 0; i < srclen && i < dstsize; i++)
624 : {
625 547662 : unsigned char ch = (unsigned char) src[i];
626 :
627 547662 : if (ch >= 'A' && ch <= 'Z')
628 11770 : ch = pg_ascii_tolower(ch);
629 535892 : else if (libc_lower && IS_HIGHBIT_SET(ch) && isupper_l(ch, lt))
630 0 : ch = tolower_l(ch, lt);
631 547662 : dst[i] = (char) ch;
632 : }
633 :
634 64968 : if (i < dstsize)
635 64968 : dst[i] = '\0';
636 :
637 64968 : return srclen;
638 : }
639 :
/*
 * strncoll_icu_utf8
 *
 * Compare two strings with ucol_strcollUTF8(), without converting them to
 * UChar first.  Only valid when the database encoding is UTF-8.  An argument
 * length of -1 means the string is NUL-terminated.
 */
#ifdef HAVE_UCOL_STRCOLLUTF8
static int
strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
				  pg_locale_t locale)
{
	UErrorCode	status = U_ZERO_ERROR;
	int			cmp;

	Assert(GetDatabaseEncoding() == PG_UTF8);

	cmp = ucol_strcollUTF8(locale->icu.ucol,
						   arg1, len1,
						   arg2, len2,
						   &status);
	if (U_FAILURE(status))
		ereport(ERROR,
				(errmsg("collation failed: %s", u_errorName(status))));

	return cmp;
}
#endif
669 :
670 : /* 'srclen' of -1 means the strings are NUL-terminated */
671 : size_t
672 5748 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
673 : pg_locale_t locale)
674 : {
675 : char sbuf[TEXTBUFLEN];
676 5748 : char *buf = sbuf;
677 : UChar *uchar;
678 : int32_t ulen;
679 : size_t uchar_bsize;
680 : Size result_bsize;
681 :
682 5748 : init_icu_converter();
683 :
684 5748 : ulen = uchar_length(icu_converter, src, srclen);
685 :
686 5748 : uchar_bsize = (ulen + 1) * sizeof(UChar);
687 :
688 5748 : if (uchar_bsize > TEXTBUFLEN)
689 0 : buf = palloc(uchar_bsize);
690 :
691 5748 : uchar = (UChar *) buf;
692 :
693 5748 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
694 :
695 5748 : result_bsize = ucol_getSortKey(locale->icu.ucol,
696 : uchar, ulen,
697 : (uint8_t *) dest, destsize);
698 :
699 : /*
700 : * ucol_getSortKey() counts the nul-terminator in the result length, but
701 : * this function should not.
702 : */
703 : Assert(result_bsize > 0);
704 5748 : result_bsize--;
705 :
706 5748 : if (buf != sbuf)
707 0 : pfree(buf);
708 :
709 : /* if dest is defined, it should be nul-terminated */
710 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
711 :
712 5748 : return result_bsize;
713 : }
714 :
715 : /* 'srclen' of -1 means the strings are NUL-terminated */
716 : size_t
717 1668 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
718 : const char *src, ssize_t srclen,
719 : pg_locale_t locale)
720 : {
721 : size_t result;
722 : UCharIterator iter;
723 : uint32_t state[2];
724 : UErrorCode status;
725 :
726 : Assert(GetDatabaseEncoding() == PG_UTF8);
727 :
728 1668 : uiter_setUTF8(&iter, src, srclen);
729 1668 : state[0] = state[1] = 0; /* won't need that again */
730 1668 : status = U_ZERO_ERROR;
731 1668 : result = ucol_nextSortKeyPart(locale->icu.ucol,
732 : &iter,
733 : state,
734 : (uint8_t *) dest,
735 : destsize,
736 : &status);
737 1668 : if (U_FAILURE(status))
738 0 : ereport(ERROR,
739 : (errmsg("sort key generation failed: %s",
740 : u_errorName(status))));
741 :
742 1668 : return result;
743 : }
744 :
745 : char *
746 79454 : get_collation_actual_version_icu(const char *collcollate)
747 : {
748 : UCollator *collator;
749 : UVersionInfo versioninfo;
750 : char buf[U_MAX_VERSION_STRING_LENGTH];
751 :
752 79454 : collator = pg_ucol_open(collcollate);
753 :
754 79454 : ucol_getVersion(collator, versioninfo);
755 79454 : ucol_close(collator);
756 :
757 79454 : u_versionToString(versioninfo, buf);
758 79454 : return pstrdup(buf);
759 : }
760 :
761 : /*
762 : * Convert a string in the database encoding into a string of UChars.
763 : *
764 : * The source string at buff is of length nbytes
765 : * (it needn't be nul-terminated)
766 : *
767 : * *buff_uchar receives a pointer to the palloc'd result string, and
768 : * the function's result is the number of UChars generated.
769 : *
770 : * The result string is nul-terminated, though most callers rely on the
771 : * result length instead.
772 : */
773 : static int32_t
774 636 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
775 : {
776 : int32_t len_uchar;
777 :
778 636 : init_icu_converter();
779 :
780 636 : len_uchar = uchar_length(icu_converter, buff, nbytes);
781 :
782 636 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
783 636 : len_uchar = uchar_convert(icu_converter,
784 : *buff_uchar, len_uchar + 1, buff, nbytes);
785 :
786 636 : return len_uchar;
787 : }
788 :
789 : /*
790 : * Convert a string of UChars into the database encoding.
791 : *
792 : * The source string at buff_uchar is of length len_uchar
793 : * (it needn't be nul-terminated)
794 : *
795 : * *result receives a pointer to the palloc'd result string, and the
796 : * function's result is the number of bytes generated (not counting nul).
797 : *
798 : * The result string is nul-terminated.
799 : */
800 : static size_t
801 624 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
802 : {
803 : UErrorCode status;
804 : int32_t len_result;
805 :
806 624 : init_icu_converter();
807 :
808 624 : status = U_ZERO_ERROR;
809 624 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
810 : buff_uchar, len_uchar, &status);
811 624 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
812 0 : ereport(ERROR,
813 : (errmsg("%s failed: %s", "ucnv_fromUChars",
814 : u_errorName(status))));
815 :
816 624 : if (len_result + 1 > destsize)
817 60 : return len_result;
818 :
819 564 : status = U_ZERO_ERROR;
820 564 : len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
821 : buff_uchar, len_uchar, &status);
822 564 : if (U_FAILURE(status) ||
823 564 : status == U_STRING_NOT_TERMINATED_WARNING)
824 0 : ereport(ERROR,
825 : (errmsg("%s failed: %s", "ucnv_fromUChars",
826 : u_errorName(status))));
827 :
828 564 : return len_result;
829 : }
830 :
831 : static int32_t
832 624 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
833 : UChar **buff_dest, UChar *buff_source, int32_t len_source)
834 : {
835 : UErrorCode status;
836 : int32_t len_dest;
837 :
838 624 : len_dest = len_source; /* try first with same length */
839 624 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
840 624 : status = U_ZERO_ERROR;
841 624 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
842 : mylocale->icu.locale, &status);
843 624 : if (status == U_BUFFER_OVERFLOW_ERROR)
844 : {
845 : /* try again with adjusted length */
846 18 : pfree(*buff_dest);
847 18 : *buff_dest = palloc(len_dest * sizeof(**buff_dest));
848 18 : status = U_ZERO_ERROR;
849 18 : len_dest = func(*buff_dest, len_dest, buff_source, len_source,
850 : mylocale->icu.locale, &status);
851 : }
852 624 : if (U_FAILURE(status))
853 0 : ereport(ERROR,
854 : (errmsg("case conversion failed: %s", u_errorName(status))));
855 624 : return len_dest;
856 : }
857 :
858 : static int32_t
859 30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
860 : const UChar *src, int32_t srcLength,
861 : const char *locale,
862 : UErrorCode *pErrorCode)
863 : {
864 30 : return u_strToTitle(dest, destCapacity, src, srcLength,
865 : NULL, locale, pErrorCode);
866 : }
867 :
868 : static int32_t
869 24 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
870 : const UChar *src, int32_t srcLength,
871 : const char *locale,
872 : UErrorCode *pErrorCode)
873 : {
874 24 : uint32 options = U_FOLD_CASE_DEFAULT;
875 : char lang[3];
876 : UErrorCode status;
877 :
878 : /*
879 : * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
880 : * folding does not accept a locale. Instead it just supports a single
881 : * option relevant to Turkic languages 'az' and 'tr'; check for those
882 : * languages to enable the option.
883 : */
884 24 : status = U_ZERO_ERROR;
885 24 : uloc_getLanguage(locale, lang, 3, &status);
886 24 : if (U_SUCCESS(status))
887 : {
888 : /*
889 : * The option name is confusing, but it causes u_strFoldCase to use
890 : * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
891 : */
892 24 : if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
893 12 : options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
894 : }
895 :
896 24 : return u_strFoldCase(dest, destCapacity, src, srcLength,
897 : options, pErrorCode);
898 : }
899 :
900 : /*
901 : * strncoll_icu
902 : *
903 : * Convert the arguments from the database encoding to UChar strings, then
904 : * call ucol_strcoll(). An argument length of -1 means that the string is
905 : * NUL-terminated.
906 : *
907 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
908 : * caller should call that instead.
909 : */
910 : static int
911 0 : strncoll_icu(const char *arg1, ssize_t len1,
912 : const char *arg2, ssize_t len2, pg_locale_t locale)
913 : {
914 : char sbuf[TEXTBUFLEN];
915 0 : char *buf = sbuf;
916 : int32_t ulen1;
917 : int32_t ulen2;
918 : size_t bufsize1;
919 : size_t bufsize2;
920 : UChar *uchar1,
921 : *uchar2;
922 : int result;
923 :
924 : /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
925 : #ifdef HAVE_UCOL_STRCOLLUTF8
926 : Assert(GetDatabaseEncoding() != PG_UTF8);
927 : #endif
928 :
929 0 : init_icu_converter();
930 :
931 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
932 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
933 :
934 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
935 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
936 :
937 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
938 0 : buf = palloc(bufsize1 + bufsize2);
939 :
940 0 : uchar1 = (UChar *) buf;
941 0 : uchar2 = (UChar *) (buf + bufsize1);
942 :
943 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
944 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
945 :
946 0 : result = ucol_strcoll(locale->icu.ucol,
947 : uchar1, ulen1,
948 : uchar2, ulen2);
949 :
950 0 : if (buf != sbuf)
951 0 : pfree(buf);
952 :
953 0 : return result;
954 : }
955 :
956 : /* 'srclen' of -1 means the strings are NUL-terminated */
957 : static size_t
958 0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
959 : const char *src, ssize_t srclen,
960 : pg_locale_t locale)
961 : {
962 : char sbuf[TEXTBUFLEN];
963 0 : char *buf = sbuf;
964 : UCharIterator iter;
965 : uint32_t state[2];
966 : UErrorCode status;
967 0 : int32_t ulen = -1;
968 0 : UChar *uchar = NULL;
969 : size_t uchar_bsize;
970 : Size result_bsize;
971 :
972 : /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
973 : Assert(GetDatabaseEncoding() != PG_UTF8);
974 :
975 0 : init_icu_converter();
976 :
977 0 : ulen = uchar_length(icu_converter, src, srclen);
978 :
979 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
980 :
981 0 : if (uchar_bsize > TEXTBUFLEN)
982 0 : buf = palloc(uchar_bsize);
983 :
984 0 : uchar = (UChar *) buf;
985 :
986 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
987 :
988 0 : uiter_setString(&iter, uchar, ulen);
989 0 : state[0] = state[1] = 0; /* won't need that again */
990 0 : status = U_ZERO_ERROR;
991 0 : result_bsize = ucol_nextSortKeyPart(locale->icu.ucol,
992 : &iter,
993 : state,
994 : (uint8_t *) dest,
995 : destsize,
996 : &status);
997 0 : if (U_FAILURE(status))
998 0 : ereport(ERROR,
999 : (errmsg("sort key generation failed: %s",
1000 : u_errorName(status))));
1001 :
1002 0 : return result_bsize;
1003 : }
1004 :
1005 : static void
1006 7008 : init_icu_converter(void)
1007 : {
1008 : const char *icu_encoding_name;
1009 : UErrorCode status;
1010 : UConverter *conv;
1011 :
1012 7008 : if (icu_converter)
1013 7002 : return; /* already done */
1014 :
1015 6 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
1016 6 : if (!icu_encoding_name)
1017 0 : ereport(ERROR,
1018 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1019 : errmsg("encoding \"%s\" not supported by ICU",
1020 : pg_encoding_to_char(GetDatabaseEncoding()))));
1021 :
1022 6 : status = U_ZERO_ERROR;
1023 6 : conv = ucnv_open(icu_encoding_name, &status);
1024 6 : if (U_FAILURE(status))
1025 0 : ereport(ERROR,
1026 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
1027 : icu_encoding_name, u_errorName(status))));
1028 :
1029 6 : icu_converter = conv;
1030 : }
1031 :
1032 : /*
1033 : * Find length, in UChars, of given string if converted to UChar string.
1034 : *
1035 : * A length of -1 indicates that the input string is NUL-terminated.
1036 : */
1037 : static size_t
1038 6384 : uchar_length(UConverter *converter, const char *str, int32_t len)
1039 : {
1040 6384 : UErrorCode status = U_ZERO_ERROR;
1041 : int32_t ulen;
1042 :
1043 6384 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
1044 6384 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
1045 0 : ereport(ERROR,
1046 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1047 6384 : return ulen;
1048 : }
1049 :
1050 : /*
1051 : * Convert the given source string into a UChar string, stored in dest, and
1052 : * return the length (in UChars).
1053 : *
1054 : * A srclen of -1 indicates that the input string is NUL-terminated.
1055 : */
1056 : static int32_t
1057 6384 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
1058 : const char *src, int32_t srclen)
1059 : {
1060 6384 : UErrorCode status = U_ZERO_ERROR;
1061 : int32_t ulen;
1062 :
1063 6384 : status = U_ZERO_ERROR;
1064 6384 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
1065 6384 : if (U_FAILURE(status))
1066 0 : ereport(ERROR,
1067 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
1068 6384 : return ulen;
1069 : }
1070 :
1071 : /*
1072 : * Parse collation attributes from the given locale string and apply them to
1073 : * the open collator.
1074 : *
1075 : * First, the locale string is canonicalized to an ICU format locale ID such
1076 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
1077 : * the key-value arguments.
1078 : *
1079 : * Starting with ICU version 54, the attributes are processed automatically by
1080 : * ucol_open(), so this is only necessary for emulating this behavior on older
1081 : * versions.
1082 : */
1083 : pg_attribute_unused()
1084 : static void
1085 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
1086 : UErrorCode *status)
1087 : {
1088 : int32_t len;
1089 : char *icu_locale_id;
1090 : char *lower_str;
1091 : char *str;
1092 : char *token;
1093 :
1094 : /*
1095 : * The input locale may be a BCP 47 language tag, e.g.
1096 : * "und-u-kc-ks-level1", which expresses the same attributes in a
1097 : * different form. It will be converted to the equivalent ICU format
1098 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
1099 : * uloc_canonicalize().
1100 : */
1101 0 : *status = U_ZERO_ERROR;
1102 0 : len = uloc_canonicalize(loc, NULL, 0, status);
1103 0 : icu_locale_id = palloc(len + 1);
1104 0 : *status = U_ZERO_ERROR;
1105 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
1106 0 : if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
1107 0 : return;
1108 :
1109 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
1110 :
1111 0 : pfree(icu_locale_id);
1112 :
1113 0 : str = strchr(lower_str, '@');
1114 0 : if (!str)
1115 0 : return;
1116 0 : str++;
1117 :
1118 0 : while ((token = strsep(&str, ";")))
1119 : {
1120 0 : char *e = strchr(token, '=');
1121 :
1122 0 : if (e)
1123 : {
1124 : char *name;
1125 : char *value;
1126 : UColAttribute uattr;
1127 : UColAttributeValue uvalue;
1128 :
1129 0 : *status = U_ZERO_ERROR;
1130 :
1131 0 : *e = '\0';
1132 0 : name = token;
1133 0 : value = e + 1;
1134 :
1135 : /*
1136 : * See attribute name and value lists in ICU i18n/coll.cpp
1137 : */
1138 0 : if (strcmp(name, "colstrength") == 0)
1139 0 : uattr = UCOL_STRENGTH;
1140 0 : else if (strcmp(name, "colbackwards") == 0)
1141 0 : uattr = UCOL_FRENCH_COLLATION;
1142 0 : else if (strcmp(name, "colcaselevel") == 0)
1143 0 : uattr = UCOL_CASE_LEVEL;
1144 0 : else if (strcmp(name, "colcasefirst") == 0)
1145 0 : uattr = UCOL_CASE_FIRST;
1146 0 : else if (strcmp(name, "colalternate") == 0)
1147 0 : uattr = UCOL_ALTERNATE_HANDLING;
1148 0 : else if (strcmp(name, "colnormalization") == 0)
1149 0 : uattr = UCOL_NORMALIZATION_MODE;
1150 0 : else if (strcmp(name, "colnumeric") == 0)
1151 0 : uattr = UCOL_NUMERIC_COLLATION;
1152 : else
1153 : /* ignore if unknown */
1154 0 : continue;
1155 :
1156 0 : if (strcmp(value, "primary") == 0)
1157 0 : uvalue = UCOL_PRIMARY;
1158 0 : else if (strcmp(value, "secondary") == 0)
1159 0 : uvalue = UCOL_SECONDARY;
1160 0 : else if (strcmp(value, "tertiary") == 0)
1161 0 : uvalue = UCOL_TERTIARY;
1162 0 : else if (strcmp(value, "quaternary") == 0)
1163 0 : uvalue = UCOL_QUATERNARY;
1164 0 : else if (strcmp(value, "identical") == 0)
1165 0 : uvalue = UCOL_IDENTICAL;
1166 0 : else if (strcmp(value, "no") == 0)
1167 0 : uvalue = UCOL_OFF;
1168 0 : else if (strcmp(value, "yes") == 0)
1169 0 : uvalue = UCOL_ON;
1170 0 : else if (strcmp(value, "shifted") == 0)
1171 0 : uvalue = UCOL_SHIFTED;
1172 0 : else if (strcmp(value, "non-ignorable") == 0)
1173 0 : uvalue = UCOL_NON_IGNORABLE;
1174 0 : else if (strcmp(value, "lower") == 0)
1175 0 : uvalue = UCOL_LOWER_FIRST;
1176 0 : else if (strcmp(value, "upper") == 0)
1177 0 : uvalue = UCOL_UPPER_FIRST;
1178 : else
1179 : {
1180 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
1181 0 : break;
1182 : }
1183 :
1184 0 : ucol_setAttribute(collator, uattr, uvalue, status);
1185 : }
1186 : }
1187 :
1188 0 : pfree(lower_str);
1189 : }
1190 :
1191 : #endif /* USE_ICU */
|