LCOV - code coverage report
Current view: top level - src/backend/utils/adt - pg_locale_icu.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 202 318 63.5 %
Date: 2025-07-04 01:18:35 Functions: 31 34 91.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-----------------------------------------------------------------------
       2             :  *
       3             :  * PostgreSQL locale utilities for ICU
       4             :  *
       5             :  * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
       6             :  *
       7             :  * src/backend/utils/adt/pg_locale_icu.c
       8             :  *
       9             :  *-----------------------------------------------------------------------
      10             :  */
      11             : 
      12             : #include "postgres.h"
      13             : 
      14             : #ifdef USE_ICU
      15             : #include <unicode/ucnv.h>
      16             : #include <unicode/ustring.h>
      17             : 
      18             : /*
      19             :  * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
      20             :  * (see
      21             :  * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
      22             :  */
      23             : #if U_ICU_VERSION_MAJOR_NUM >= 53
      24             : #define HAVE_UCOL_STRCOLLUTF8 1
      25             : #else
      26             : #undef HAVE_UCOL_STRCOLLUTF8
      27             : #endif
      28             : 
      29             : #endif
      30             : 
      31             : #include "access/htup_details.h"
      32             : #include "catalog/pg_database.h"
      33             : #include "catalog/pg_collation.h"
      34             : #include "mb/pg_wchar.h"
      35             : #include "miscadmin.h"
      36             : #include "utils/builtins.h"
      37             : #include "utils/formatting.h"
      38             : #include "utils/memutils.h"
      39             : #include "utils/pg_locale.h"
      40             : #include "utils/syscache.h"
      41             : 
      42             : /*
      43             :  * Size of stack buffer to use for string transformations, used to avoid heap
      44             :  * allocations in typical cases. This should be large enough that most strings
      45             :  * will fit, but small enough that we feel comfortable putting it on the
      46             :  * stack.
      47             :  */
      48             : #define     TEXTBUFLEN          1024
      49             : 
      50             : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
      51             : 
      52             : #ifdef USE_ICU
      53             : 
      54             : extern UCollator *pg_ucol_open(const char *loc_str);
      55             : 
      56             : static size_t strlower_icu(char *dest, size_t destsize, const char *src,
      57             :                            ssize_t srclen, pg_locale_t locale);
      58             : static size_t strtitle_icu(char *dest, size_t destsize, const char *src,
      59             :                            ssize_t srclen, pg_locale_t locale);
      60             : static size_t strupper_icu(char *dest, size_t destsize, const char *src,
      61             :                            ssize_t srclen, pg_locale_t locale);
      62             : static size_t strfold_icu(char *dest, size_t destsize, const char *src,
      63             :                           ssize_t srclen, pg_locale_t locale);
      64             : static int  strncoll_icu(const char *arg1, ssize_t len1,
      65             :                          const char *arg2, ssize_t len2,
      66             :                          pg_locale_t locale);
      67             : static size_t strnxfrm_icu(char *dest, size_t destsize,
      68             :                            const char *src, ssize_t srclen,
      69             :                            pg_locale_t locale);
      70             : extern char *get_collation_actual_version_icu(const char *collcollate);
      71             : 
      72             : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
      73             :                                      const UChar *src, int32_t srcLength,
      74             :                                      const char *locale,
      75             :                                      UErrorCode *pErrorCode);
      76             : 
      77             : /*
      78             :  * Converter object for converting between ICU's UChar strings and C strings
      79             :  * in database encoding.  Since the database encoding doesn't change, we only
      80             :  * need one of these per session.
      81             :  */
      82             : static UConverter *icu_converter = NULL;
      83             : 
      84             : static UCollator *make_icu_collator(const char *iculocstr,
      85             :                                     const char *icurules);
      86             : static int  strncoll_icu(const char *arg1, ssize_t len1,
      87             :                          const char *arg2, ssize_t len2,
      88             :                          pg_locale_t locale);
      89             : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
      90             :                                   const char *src, ssize_t srclen,
      91             :                                   pg_locale_t locale);
      92             : #ifdef HAVE_UCOL_STRCOLLUTF8
      93             : static int  strncoll_icu_utf8(const char *arg1, ssize_t len1,
      94             :                               const char *arg2, ssize_t len2,
      95             :                               pg_locale_t locale);
      96             : #endif
      97             : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
      98             :                                        const char *src, ssize_t srclen,
      99             :                                        pg_locale_t locale);
     100             : static void init_icu_converter(void);
     101             : static size_t uchar_length(UConverter *converter,
     102             :                            const char *str, int32_t len);
     103             : static int32_t uchar_convert(UConverter *converter,
     104             :                              UChar *dest, int32_t destlen,
     105             :                              const char *src, int32_t srclen);
     106             : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
     107             :                             size_t nbytes);
     108             : static size_t icu_from_uchar(char *dest, size_t destsize,
     109             :                              const UChar *buff_uchar, int32_t len_uchar);
     110             : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
     111             :                                          UErrorCode *status);
     112             : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
     113             :                                 UChar **buff_dest, UChar *buff_source,
     114             :                                 int32_t len_source);
     115             : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
     116             :                                        const UChar *src, int32_t srcLength,
     117             :                                        const char *locale,
     118             :                                        UErrorCode *pErrorCode);
     119             : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
     120             :                                      const UChar *src, int32_t srcLength,
     121             :                                      const char *locale,
     122             :                                      UErrorCode *pErrorCode);
     123             : 
     124             : static bool
     125         126 : char_is_cased_icu(char ch, pg_locale_t locale)
     126             : {
     127         126 :     return IS_HIGHBIT_SET(ch) ||
     128         252 :         (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
     129             : }
     130             : 
     131             : static pg_wchar
     132         108 : toupper_icu(pg_wchar wc, pg_locale_t locale)
     133             : {
     134         108 :     return u_toupper(wc);
     135             : }
     136             : 
     137             : static pg_wchar
     138         108 : tolower_icu(pg_wchar wc, pg_locale_t locale)
     139             : {
     140         108 :     return u_tolower(wc);
     141             : }
     142             : 
     143             : static const struct collate_methods collate_methods_icu = {
     144             :     .strncoll = strncoll_icu,
     145             :     .strnxfrm = strnxfrm_icu,
     146             :     .strnxfrm_prefix = strnxfrm_prefix_icu,
     147             :     .strxfrm_is_safe = true,
     148             : };
     149             : 
     150             : static const struct collate_methods collate_methods_icu_utf8 = {
     151             : #ifdef HAVE_UCOL_STRCOLLUTF8
     152             :     .strncoll = strncoll_icu_utf8,
     153             : #else
     154             :     .strncoll = strncoll_icu,
     155             : #endif
     156             :     .strnxfrm = strnxfrm_icu,
     157             :     .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
     158             :     .strxfrm_is_safe = true,
     159             : };
     160             : 
     161             : static bool
     162       12288 : wc_isdigit_icu(pg_wchar wc, pg_locale_t locale)
     163             : {
     164       12288 :     return u_isdigit(wc);
     165             : }
     166             : 
     167             : static bool
     168       12288 : wc_isalpha_icu(pg_wchar wc, pg_locale_t locale)
     169             : {
     170       12288 :     return u_isalpha(wc);
     171             : }
     172             : 
     173             : static bool
     174       12288 : wc_isalnum_icu(pg_wchar wc, pg_locale_t locale)
     175             : {
     176       12288 :     return u_isalnum(wc);
     177             : }
     178             : 
     179             : static bool
     180       12288 : wc_isupper_icu(pg_wchar wc, pg_locale_t locale)
     181             : {
     182       12288 :     return u_isupper(wc);
     183             : }
     184             : 
     185             : static bool
     186       12288 : wc_islower_icu(pg_wchar wc, pg_locale_t locale)
     187             : {
     188       12288 :     return u_islower(wc);
     189             : }
     190             : 
     191             : static bool
     192       12288 : wc_isgraph_icu(pg_wchar wc, pg_locale_t locale)
     193             : {
     194       12288 :     return u_isgraph(wc);
     195             : }
     196             : 
     197             : static bool
     198       12288 : wc_isprint_icu(pg_wchar wc, pg_locale_t locale)
     199             : {
     200       12288 :     return u_isprint(wc);
     201             : }
     202             : 
     203             : static bool
     204       12288 : wc_ispunct_icu(pg_wchar wc, pg_locale_t locale)
     205             : {
     206       12288 :     return u_ispunct(wc);
     207             : }
     208             : 
     209             : static bool
     210       12288 : wc_isspace_icu(pg_wchar wc, pg_locale_t locale)
     211             : {
     212       12288 :     return u_isspace(wc);
     213             : }
     214             : 
     215             : static const struct ctype_methods ctype_methods_icu = {
     216             :     .strlower = strlower_icu,
     217             :     .strtitle = strtitle_icu,
     218             :     .strupper = strupper_icu,
     219             :     .strfold = strfold_icu,
     220             :     .wc_isdigit = wc_isdigit_icu,
     221             :     .wc_isalpha = wc_isalpha_icu,
     222             :     .wc_isalnum = wc_isalnum_icu,
     223             :     .wc_isupper = wc_isupper_icu,
     224             :     .wc_islower = wc_islower_icu,
     225             :     .wc_isgraph = wc_isgraph_icu,
     226             :     .wc_isprint = wc_isprint_icu,
     227             :     .wc_ispunct = wc_ispunct_icu,
     228             :     .wc_isspace = wc_isspace_icu,
     229             :     .char_is_cased = char_is_cased_icu,
     230             :     .wc_toupper = toupper_icu,
     231             :     .wc_tolower = tolower_icu,
     232             : };
     233             : #endif
     234             : 
     235             : pg_locale_t
     236         212 : create_pg_locale_icu(Oid collid, MemoryContext context)
     237             : {
     238             : #ifdef USE_ICU
     239             :     bool        deterministic;
     240             :     const char *iculocstr;
     241         212 :     const char *icurules = NULL;
     242             :     UCollator  *collator;
     243             :     pg_locale_t result;
     244             : 
     245         212 :     if (collid == DEFAULT_COLLATION_OID)
     246             :     {
     247             :         HeapTuple   tp;
     248             :         Datum       datum;
     249             :         bool        isnull;
     250             : 
     251          26 :         tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
     252          26 :         if (!HeapTupleIsValid(tp))
     253           0 :             elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
     254             : 
     255             :         /* default database collation is always deterministic */
     256          26 :         deterministic = true;
     257          26 :         datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
     258             :                                        Anum_pg_database_datlocale);
     259          26 :         iculocstr = TextDatumGetCString(datum);
     260          26 :         datum = SysCacheGetAttr(DATABASEOID, tp,
     261             :                                 Anum_pg_database_daticurules, &isnull);
     262          26 :         if (!isnull)
     263           0 :             icurules = TextDatumGetCString(datum);
     264             : 
     265          26 :         ReleaseSysCache(tp);
     266             :     }
     267             :     else
     268             :     {
     269             :         Form_pg_collation collform;
     270             :         HeapTuple   tp;
     271             :         Datum       datum;
     272             :         bool        isnull;
     273             : 
     274         186 :         tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
     275         186 :         if (!HeapTupleIsValid(tp))
     276           0 :             elog(ERROR, "cache lookup failed for collation %u", collid);
     277         186 :         collform = (Form_pg_collation) GETSTRUCT(tp);
     278         186 :         deterministic = collform->collisdeterministic;
     279         186 :         datum = SysCacheGetAttrNotNull(COLLOID, tp,
     280             :                                        Anum_pg_collation_colllocale);
     281         186 :         iculocstr = TextDatumGetCString(datum);
     282         186 :         datum = SysCacheGetAttr(COLLOID, tp,
     283             :                                 Anum_pg_collation_collicurules, &isnull);
     284         186 :         if (!isnull)
     285          12 :             icurules = TextDatumGetCString(datum);
     286             : 
     287         186 :         ReleaseSysCache(tp);
     288             :     }
     289             : 
     290         212 :     collator = make_icu_collator(iculocstr, icurules);
     291             : 
     292         202 :     result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
     293         202 :     result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
     294         202 :     result->info.icu.ucol = collator;
     295         202 :     result->deterministic = deterministic;
     296         202 :     result->collate_is_c = false;
     297         202 :     result->ctype_is_c = false;
     298         202 :     if (GetDatabaseEncoding() == PG_UTF8)
     299         202 :         result->collate = &collate_methods_icu_utf8;
     300             :     else
     301           0 :         result->collate = &collate_methods_icu;
     302         202 :     result->ctype = &ctype_methods_icu;
     303             : 
     304         202 :     return result;
     305             : #else
     306             :     /* could get here if a collation was created by a build with ICU */
     307             :     ereport(ERROR,
     308             :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     309             :              errmsg("ICU is not supported in this build")));
     310             : 
     311             :     return NULL;
     312             : #endif
     313             : }
     314             : 
     315             : #ifdef USE_ICU
     316             : 
     317             : /*
     318             :  * Wrapper around ucol_open() to handle API differences for older ICU
     319             :  * versions.
     320             :  *
     321             :  * Ensure that no path leaks a UCollator.
     322             :  */
     323             : UCollator *
     324       78206 : pg_ucol_open(const char *loc_str)
     325             : {
     326             :     UCollator  *collator;
     327             :     UErrorCode  status;
     328       78206 :     const char *orig_str = loc_str;
     329       78206 :     char       *fixed_str = NULL;
     330             : 
     331             :     /*
     332             :      * Must never open default collator, because it depends on the environment
     333             :      * and may change at any time. Should not happen, but check here to catch
     334             :      * bugs that might be hard to catch otherwise.
     335             :      *
     336             :      * NB: the default collator is not the same as the collator for the root
     337             :      * locale. The root locale may be specified as the empty string, "und", or
     338             :      * "root". The default collator is opened by passing NULL to ucol_open().
     339             :      */
     340       78206 :     if (loc_str == NULL)
     341           0 :         elog(ERROR, "opening default collator is not supported");
     342             : 
     343             :     /*
     344             :      * In ICU versions 54 and earlier, "und" is not a recognized spelling of
     345             :      * the root locale. If the first component of the locale is "und", replace
     346             :      * with "root" before opening.
     347             :      */
     348             :     if (U_ICU_VERSION_MAJOR_NUM < 55)
     349             :     {
     350             :         char        lang[ULOC_LANG_CAPACITY];
     351             : 
     352             :         status = U_ZERO_ERROR;
     353             :         uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
     354             :         if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
     355             :         {
     356             :             ereport(ERROR,
     357             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     358             :                      errmsg("could not get language from locale \"%s\": %s",
     359             :                             loc_str, u_errorName(status))));
     360             :         }
     361             : 
     362             :         if (strcmp(lang, "und") == 0)
     363             :         {
     364             :             const char *remainder = loc_str + strlen("und");
     365             : 
     366             :             fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
     367             :             strcpy(fixed_str, "root");
     368             :             strcat(fixed_str, remainder);
     369             : 
     370             :             loc_str = fixed_str;
     371             :         }
     372             :     }
     373             : 
     374       78206 :     status = U_ZERO_ERROR;
     375       78206 :     collator = ucol_open(loc_str, &status);
     376       78206 :     if (U_FAILURE(status))
     377          12 :         ereport(ERROR,
     378             :         /* use original string for error report */
     379             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     380             :                  errmsg("could not open collator for locale \"%s\": %s",
     381             :                         orig_str, u_errorName(status))));
     382             : 
     383             :     if (U_ICU_VERSION_MAJOR_NUM < 54)
     384             :     {
     385             :         status = U_ZERO_ERROR;
     386             :         icu_set_collation_attributes(collator, loc_str, &status);
     387             : 
     388             :         /*
     389             :          * Pretend the error came from ucol_open(), for consistent error
     390             :          * message across ICU versions.
     391             :          */
     392             :         if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
     393             :         {
     394             :             ucol_close(collator);
     395             :             ereport(ERROR,
     396             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     397             :                      errmsg("could not open collator for locale \"%s\": %s",
     398             :                             orig_str, u_errorName(status))));
     399             :         }
     400             :     }
     401             : 
     402       78194 :     if (fixed_str != NULL)
     403           0 :         pfree(fixed_str);
     404             : 
     405       78194 :     return collator;
     406             : }
     407             : 
     408             : /*
     409             :  * Create a UCollator with the given locale string and rules.
     410             :  *
     411             :  * Ensure that no path leaks a UCollator.
     412             :  */
     413             : static UCollator *
     414         212 : make_icu_collator(const char *iculocstr, const char *icurules)
     415             : {
     416         212 :     if (!icurules)
     417             :     {
     418             :         /* simple case without rules */
     419         200 :         return pg_ucol_open(iculocstr);
     420             :     }
     421             :     else
     422             :     {
     423             :         UCollator  *collator_std_rules;
     424             :         UCollator  *collator_all_rules;
     425             :         const UChar *std_rules;
     426             :         UChar      *my_rules;
     427             :         UChar      *all_rules;
     428             :         int32_t     length;
     429             :         int32_t     total;
     430             :         UErrorCode  status;
     431             : 
     432             :         /*
     433             :          * If rules are specified, we extract the rules of the standard
     434             :          * collation, add our own rules, and make a new collator with the
     435             :          * combined rules.
     436             :          */
     437          12 :         icu_to_uchar(&my_rules, icurules, strlen(icurules));
     438             : 
     439          12 :         collator_std_rules = pg_ucol_open(iculocstr);
     440             : 
     441          12 :         std_rules = ucol_getRules(collator_std_rules, &length);
     442             : 
     443          12 :         total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
     444             : 
     445             :         /* avoid leaking collator on OOM */
     446          12 :         all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
     447          12 :         if (!all_rules)
     448             :         {
     449           0 :             ucol_close(collator_std_rules);
     450           0 :             ereport(ERROR,
     451             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
     452             :                      errmsg("out of memory")));
     453             :         }
     454             : 
     455          12 :         u_strcpy(all_rules, std_rules);
     456          12 :         u_strcat(all_rules, my_rules);
     457             : 
     458          12 :         ucol_close(collator_std_rules);
     459             : 
     460          12 :         status = U_ZERO_ERROR;
     461          12 :         collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
     462             :                                             UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
     463             :                                             NULL, &status);
     464          12 :         if (U_FAILURE(status))
     465             :         {
     466           6 :             ereport(ERROR,
     467             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     468             :                      errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
     469             :                             iculocstr, icurules, u_errorName(status))));
     470             :         }
     471             : 
     472           6 :         return collator_all_rules;
     473             :     }
     474             : }
     475             : 
     476             : static size_t
     477         528 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     478             :              pg_locale_t locale)
     479             : {
     480             :     int32_t     len_uchar;
     481             :     int32_t     len_conv;
     482             :     UChar      *buff_uchar;
     483             :     UChar      *buff_conv;
     484             :     size_t      result_len;
     485             : 
     486         528 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     487         528 :     len_conv = icu_convert_case(u_strToLower, locale,
     488             :                                 &buff_conv, buff_uchar, len_uchar);
     489         528 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     490         528 :     pfree(buff_uchar);
     491         528 :     pfree(buff_conv);
     492             : 
     493         528 :     return result_len;
     494             : }
     495             : 
     496             : static size_t
     497          30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     498             :              pg_locale_t locale)
     499             : {
     500             :     int32_t     len_uchar;
     501             :     int32_t     len_conv;
     502             :     UChar      *buff_uchar;
     503             :     UChar      *buff_conv;
     504             :     size_t      result_len;
     505             : 
     506          30 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     507          30 :     len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
     508             :                                 &buff_conv, buff_uchar, len_uchar);
     509          30 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     510          30 :     pfree(buff_uchar);
     511          30 :     pfree(buff_conv);
     512             : 
     513          30 :     return result_len;
     514             : }
     515             : 
     516             : static size_t
     517          54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     518             :              pg_locale_t locale)
     519             : {
     520             :     int32_t     len_uchar;
     521             :     int32_t     len_conv;
     522             :     UChar      *buff_uchar;
     523             :     UChar      *buff_conv;
     524             :     size_t      result_len;
     525             : 
     526          54 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     527          54 :     len_conv = icu_convert_case(u_strToUpper, locale,
     528             :                                 &buff_conv, buff_uchar, len_uchar);
     529          54 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     530          54 :     pfree(buff_uchar);
     531          54 :     pfree(buff_conv);
     532             : 
     533          54 :     return result_len;
     534             : }
     535             : 
     536             : static size_t
     537          12 : strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     538             :             pg_locale_t locale)
     539             : {
     540             :     int32_t     len_uchar;
     541             :     int32_t     len_conv;
     542             :     UChar      *buff_uchar;
     543             :     UChar      *buff_conv;
     544             :     size_t      result_len;
     545             : 
     546          12 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     547          12 :     len_conv = icu_convert_case(u_strFoldCase_default, locale,
     548             :                                 &buff_conv, buff_uchar, len_uchar);
     549          12 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     550          12 :     pfree(buff_uchar);
     551          12 :     pfree(buff_conv);
     552             : 
     553          12 :     return result_len;
     554             : }
     555             : 
     556             : /*
     557             :  * strncoll_icu_utf8
     558             :  *
     559             :  * Compare two strings with ucol_strcollUTF8(), which takes UTF-8 input
     560             :  * directly; the database encoding must be UTF-8 (asserted below). An
     561             :  * argument length of -1 means the string is NUL-terminated.
     562             :  */
     563             : #ifdef HAVE_UCOL_STRCOLLUTF8
     564             : int
     565       25856 : strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
     566             :                   pg_locale_t locale)
     567             : {
     568             :     int         result;
     569             :     UErrorCode  status;
     570             : 
     571             :     Assert(GetDatabaseEncoding() == PG_UTF8);
     572             : 
     573       25856 :     status = U_ZERO_ERROR;
     574       25856 :     result = ucol_strcollUTF8(locale->info.icu.ucol,
     575             :                               arg1, len1,
     576             :                               arg2, len2,
     577             :                               &status);
     578       25856 :     if (U_FAILURE(status))
     579           0 :         ereport(ERROR,
     580             :                 (errmsg("collation failed: %s", u_errorName(status))));
     581             : 
     582       25856 :     return result;
     583             : }
     584             : #endif
     585             : 
     586             : /* Write the sort key of src into dest; 'srclen' of -1 means src is NUL-terminated */
     587             : size_t
     588       10020 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     589             :              pg_locale_t locale)
     590             : {
     591             :     char        sbuf[TEXTBUFLEN];
     592       10020 :     char       *buf = sbuf;
     593             :     UChar      *uchar;
     594             :     int32_t     ulen;
     595             :     size_t      uchar_bsize;
     596             :     Size        result_bsize;
     597             : 
     598       10020 :     init_icu_converter();
     599             : 
     600       10020 :     ulen = uchar_length(icu_converter, src, srclen);
     601             : 
     602       10020 :     uchar_bsize = (ulen + 1) * sizeof(UChar);
     603             : 
     604       10020 :     if (uchar_bsize > TEXTBUFLEN)  /* too big for the stack buffer */
     605           0 :         buf = palloc(uchar_bsize);
     606             : 
     607       10020 :     uchar = (UChar *) buf;
     608             : 
     609       10020 :     ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
     610             : 
     611       10020 :     result_bsize = ucol_getSortKey(locale->info.icu.ucol,
     612             :                                    uchar, ulen,
     613             :                                    (uint8_t *) dest, destsize);
     614             : 
     615             :     /*
     616             :      * ucol_getSortKey() counts the nul-terminator in the result length, but
     617             :      * this function should not.
     618             :      */
     619             :     Assert(result_bsize > 0);
     620       10020 :     result_bsize--;
     621             : 
     622       10020 :     if (buf != sbuf)
     623           0 :         pfree(buf);
     624             : 
     625             :     /* if the whole key fit in dest, it should be nul-terminated */
     626             :     Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
     627             : 
     628       10020 :     return result_bsize;
     629             : }
     630             : 
     631             : /* Sort key part for UTF-8 src, written into dest; 'srclen' of -1 means src is NUL-terminated */
     632             : size_t
     633        1668 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
     634             :                          const char *src, ssize_t srclen,
     635             :                          pg_locale_t locale)
     636             : {
     637             :     size_t      result;
     638             :     UCharIterator iter;
     639             :     uint32_t    state[2];
     640             :     UErrorCode  status;
     641             : 
     642             :     Assert(GetDatabaseEncoding() == PG_UTF8);
     643             : 
     644        1668 :     uiter_setUTF8(&iter, src, srclen);
     645        1668 :     state[0] = state[1] = 0;    /* start from scratch; state isn't kept for a later call */
     646        1668 :     status = U_ZERO_ERROR;
     647        1668 :     result = ucol_nextSortKeyPart(locale->info.icu.ucol,
     648             :                                   &iter,
     649             :                                   state,
     650             :                                   (uint8_t *) dest,
     651             :                                   destsize,
     652             :                                   &status);
     653        1668 :     if (U_FAILURE(status))
     654           0 :         ereport(ERROR,
     655             :                 (errmsg("sort key generation failed: %s",
     656             :                         u_errorName(status))));
     657             : 
     658        1668 :     return result;
     659             : }
     660             : 
     661             : char *
     662       77844 : get_collation_actual_version_icu(const char *collcollate)
     663             : {
     664             :     UCollator  *collator;
     665             :     UVersionInfo versioninfo;   /* filled in by ucol_getVersion() */
     666             :     char        buf[U_MAX_VERSION_STRING_LENGTH];   /* text form of versioninfo */
     667             : 
     668       77844 :     collator = pg_ucol_open(collcollate);
     669             : 
     670       77844 :     ucol_getVersion(collator, versioninfo);
     671       77844 :     ucol_close(collator);   /* collator was only needed for the version lookup */
     672             : 
     673       77844 :     u_versionToString(versioninfo, buf);
     674       77844 :     return pstrdup(buf);    /* buf is on the stack, so hand back a copy */
     675             : }
     676             : 
     677             : /*
     678             :  * Convert a string in the database encoding into a string of UChars.
     679             :  *
     680             :  * The source string at buff is nbytes long and need not be nul-terminated;
     681             :  * its converted length is measured first so the buffer can be sized to fit.
     682             :  *
     683             :  * *buff_uchar receives a pointer to the palloc'd result string, and
     684             :  * the function's result is the number of UChars generated.
     685             :  *
     686             :  * The result string is nul-terminated, though most callers rely on the
     687             :  * result length instead.
     688             :  */
     689             : static int32_t
     690         636 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
     691             : {
     692             :     int32_t     len_uchar;
     693             : 
     694         636 :     init_icu_converter();
     695             : 
     696         636 :     len_uchar = uchar_length(icu_converter, buff, nbytes);
     697             : 
     698         636 :     *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));   /* +1 for the terminator */
     699         636 :     len_uchar = uchar_convert(icu_converter,
     700             :                               *buff_uchar, len_uchar + 1, buff, nbytes);
     701             : 
     702         636 :     return len_uchar;
     703             : }
     704             : 
     705             : /*
     706             :  * Convert a string of UChars into the database encoding.
     707             :  *
     708             :  * The source string at buff_uchar is of length len_uchar
     709             :  * (it needn't be nul-terminated)
     710             :  *
     711             :  * The result is written to the caller-supplied buffer dest (destsize bytes),
     712             :  * and the function's result is the number of bytes needed (not counting nul).
     713             :  *
     714             :  * If that plus the nul does not fit in destsize, nothing is written to dest.
     715             :  */
     716             : static size_t
     717         624 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
     718             : {
     719             :     UErrorCode  status;
     720             :     int32_t     len_result;
     721             : 
     722         624 :     init_icu_converter();
     723             : 
     724         624 :     status = U_ZERO_ERROR;
     725         624 :     len_result = ucnv_fromUChars(icu_converter, NULL, 0,
     726             :                                  buff_uchar, len_uchar, &status);   /* preflight: size only */
     727         624 :     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
     728           0 :         ereport(ERROR,
     729             :                 (errmsg("%s failed: %s", "ucnv_fromUChars",
     730             :                         u_errorName(status))));
     731             : 
     732         624 :     if (len_result + 1 > destsize)  /* doesn't fit: report the needed size only */
     733          60 :         return len_result;
     734             : 
     735         564 :     status = U_ZERO_ERROR;
     736         564 :     len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
     737             :                                  buff_uchar, len_uchar, &status);
     738         564 :     if (U_FAILURE(status) ||
     739         564 :         status == U_STRING_NOT_TERMINATED_WARNING)
     740           0 :         ereport(ERROR,
     741             :                 (errmsg("%s failed: %s", "ucnv_fromUChars",
     742             :                         u_errorName(status))));
     743             : 
     744         564 :     return len_result;
     745             : }
     746             : 
     747             : static int32_t
     748         624 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
     749             :                  UChar **buff_dest, UChar *buff_source, int32_t len_source)
     750             : {
     751             :     UErrorCode  status;
     752             :     int32_t     len_dest;
     753             : 
     754         624 :     len_dest = len_source;      /* first guess: output is no longer than input */
     755         624 :     *buff_dest = palloc(len_dest * sizeof(**buff_dest));
     756         624 :     status = U_ZERO_ERROR;
     757         624 :     len_dest = func(*buff_dest, len_dest, buff_source, len_source,
     758             :                     mylocale->info.icu.locale, &status);
     759         624 :     if (status == U_BUFFER_OVERFLOW_ERROR)
     760             :     {
     761             :         /* func returned the length it needs in len_dest; retry with that size */
     762          18 :         pfree(*buff_dest);
     763          18 :         *buff_dest = palloc(len_dest * sizeof(**buff_dest));
     764          18 :         status = U_ZERO_ERROR;
     765          18 :         len_dest = func(*buff_dest, len_dest, buff_source, len_source,
     766             :                         mylocale->info.icu.locale, &status);
     767             :     }
     768         624 :     if (U_FAILURE(status))
     769           0 :         ereport(ERROR,
     770             :                 (errmsg("case conversion failed: %s", u_errorName(status))));
     771         624 :     return len_dest;
     772             : }
     773             : 
     774             : static int32_t
     775          30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
     776             :                         const UChar *src, int32_t srcLength,
     777             :                         const char *locale,
     778             :                         UErrorCode *pErrorCode)
     779             : {
     780          30 :     return u_strToTitle(dest, destCapacity, src, srcLength,
     781             :                         NULL, locale, pErrorCode);  /* NULL titleIter: ICU picks the break iterator (see ICU docs) */
     782             : }
     783             : 
     784             : static int32_t
     785          24 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
     786             :                       const UChar *src, int32_t srcLength,
     787             :                       const char *locale,
     788             :                       UErrorCode *pErrorCode)
     789             : {
     790          24 :     uint32      options = U_FOLD_CASE_DEFAULT;
     791             :     char        lang[3];    /* room for a two-letter code plus nul */
     792             :     UErrorCode  status;     /* private to the language lookup; pErrorCode is left for u_strFoldCase */
     793             : 
     794             :     /*
     795             :      * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
     796             :      * folding does not accept a locale. Instead it just supports a single
     797             :      * option relevant to Turkic languages 'az' and 'tr'; check for those
     798             :      * languages to enable the option.
     799             :      */
     800          24 :     status = U_ZERO_ERROR;
     801          24 :     uloc_getLanguage(locale, lang, 3, &status);
     802          24 :     if (U_SUCCESS(status))
     803             :     {
     804             :         /*
     805             :          * The option name is confusing, but it causes u_strFoldCase to use
     806             :          * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
     807             :          */
     808          24 :         if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
     809          12 :             options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
     810             :     }
     811             : 
     812          24 :     return u_strFoldCase(dest, destCapacity, src, srcLength,
     813             :                          options, pErrorCode);
     814             : }
     815             : 
     816             : /*
     817             :  * strncoll_icu
     818             :  *
     819             :  * Convert the arguments from the database encoding to UChar strings, then
     820             :  * call ucol_strcoll(). An argument length of -1 means that the string is
     821             :  * NUL-terminated.
     822             :  *
     823             :  * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
     824             :  * caller should call strncoll_icu_utf8() instead.
     825             :  */
     826             : static int
     827           0 : strncoll_icu(const char *arg1, ssize_t len1,
     828             :              const char *arg2, ssize_t len2, pg_locale_t locale)
     829             : {
     830             :     char        sbuf[TEXTBUFLEN];
     831           0 :     char       *buf = sbuf;
     832             :     int32_t     ulen1;
     833             :     int32_t     ulen2;
     834             :     size_t      bufsize1;
     835             :     size_t      bufsize2;
     836             :     UChar      *uchar1,
     837             :                *uchar2;
     838             :     int         result;
     839             : 
     840             :     /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
     841             : #ifdef HAVE_UCOL_STRCOLLUTF8
     842             :     Assert(GetDatabaseEncoding() != PG_UTF8);
     843             : #endif
     844             : 
     845           0 :     init_icu_converter();
     846             : 
     847           0 :     ulen1 = uchar_length(icu_converter, arg1, len1);
     848           0 :     ulen2 = uchar_length(icu_converter, arg2, len2);
     849             : 
     850           0 :     bufsize1 = (ulen1 + 1) * sizeof(UChar);
     851           0 :     bufsize2 = (ulen2 + 1) * sizeof(UChar);
     852             : 
     853           0 :     if (bufsize1 + bufsize2 > TEXTBUFLEN)   /* both strings won't fit in the stack buffer */
     854           0 :         buf = palloc(bufsize1 + bufsize2);
     855             : 
     856           0 :     uchar1 = (UChar *) buf;
     857           0 :     uchar2 = (UChar *) (buf + bufsize1);    /* second string follows the first in the same buffer */
     858             : 
     859           0 :     ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
     860           0 :     ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
     861             : 
     862           0 :     result = ucol_strcoll(locale->info.icu.ucol,
     863             :                           uchar1, ulen1,
     864             :                           uchar2, ulen2);
     865             : 
     866           0 :     if (buf != sbuf)
     867           0 :         pfree(buf);
     868             : 
     869           0 :     return result;
     870             : }
     871             : 
     872             : /* Non-UTF8 variant: src is converted to UChars first; 'srclen' of -1 means src is NUL-terminated */
     873             : static size_t
     874           0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
     875             :                     const char *src, ssize_t srclen,
     876             :                     pg_locale_t locale)
     877             : {
     878             :     char        sbuf[TEXTBUFLEN];
     879           0 :     char       *buf = sbuf;
     880             :     UCharIterator iter;
     881             :     uint32_t    state[2];
     882             :     UErrorCode  status;
     883           0 :     int32_t     ulen = -1;
     884           0 :     UChar      *uchar = NULL;
     885             :     size_t      uchar_bsize;
     886             :     Size        result_bsize;
     887             : 
     888             :     /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
     889             :     Assert(GetDatabaseEncoding() != PG_UTF8);
     890             : 
     891           0 :     init_icu_converter();
     892             : 
     893           0 :     ulen = uchar_length(icu_converter, src, srclen);
     894             : 
     895           0 :     uchar_bsize = (ulen + 1) * sizeof(UChar);
     896             : 
     897           0 :     if (uchar_bsize > TEXTBUFLEN)   /* too big for the stack buffer */
     898           0 :         buf = palloc(uchar_bsize);
     899             : 
     900           0 :     uchar = (UChar *) buf;
     901             : 
     902           0 :     ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
     903             : 
     904           0 :     uiter_setString(&iter, uchar, ulen);
     905           0 :     state[0] = state[1] = 0;    /* start from scratch; state isn't kept for a later call */
     906           0 :     status = U_ZERO_ERROR;
     907           0 :     result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
     908             :                                         &iter,
     909             :                                         state,
     910             :                                         (uint8_t *) dest,
     911             :                                         destsize,
     912             :                                         &status);
     913           0 :     if (U_FAILURE(status))
     914           0 :         ereport(ERROR,
     915             :                 (errmsg("sort key generation failed: %s",
     916             :                         u_errorName(status))));
     917             : 
     918           0 :     return result_bsize;    /* NOTE(review): unlike strnxfrm_icu(), a palloc'd buf is not pfree'd here -- confirm */
     919             : }
     920             : 
     921             : static void
     922       11280 : init_icu_converter(void)
     923             : {
     924             :     const char *icu_encoding_name;
     925             :     UErrorCode  status;
     926             :     UConverter *conv;
     927             : 
     928       11280 :     if (icu_converter)
     929       11274 :         return;                 /* converter already opened by an earlier call */
     930             : 
     931           6 :     icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
     932           6 :     if (!icu_encoding_name)
     933           0 :         ereport(ERROR,
     934             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     935             :                  errmsg("encoding \"%s\" not supported by ICU",
     936             :                         pg_encoding_to_char(GetDatabaseEncoding()))));
     937             : 
     938           6 :     status = U_ZERO_ERROR;
     939           6 :     conv = ucnv_open(icu_encoding_name, &status);
     940           6 :     if (U_FAILURE(status))
     941           0 :         ereport(ERROR,
     942             :                 (errmsg("could not open ICU converter for encoding \"%s\": %s",
     943             :                         icu_encoding_name, u_errorName(status))));
     944             : 
     945           6 :     icu_converter = conv;   /* published only after a successful open; reused by later calls */
     946             : }
     947             : 
     948             : /*
     949             :  * Find length, in UChars, of given string if converted to UChar string, via a
     950             :  * preflight ucnv_toUChars() call (no buffer, so overflow status is expected).
     951             :  * A length of -1 indicates that the input string is NUL-terminated.
     952             :  */
     953             : static size_t
     954       10656 : uchar_length(UConverter *converter, const char *str, int32_t len)
     955             : {
     956       10656 :     UErrorCode  status = U_ZERO_ERROR;
     957             :     int32_t     ulen;
     958             : 
     959       10656 :     ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
     960       10656 :     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
     961           0 :         ereport(ERROR,
     962             :                 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
     963       10656 :     return ulen;    /* NOTE(review): int32_t value returned through a size_t return type */
     964             : }
     965             : 
     966             : /*
     967             :  * Convert the given source string into a UChar string, stored in dest (which
     968             :  * has room for destlen UChars), and return the length (in UChars). Unlike
     969             :  * uchar_length(), any ICU failure, including overflow, raises an error.
     970             :  * A srclen of -1 indicates that the input string is NUL-terminated.
     971             :  */
     972             : static int32_t
     973       10656 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
     974             :               const char *src, int32_t srclen)
     975             : {
     976       10656 :     UErrorCode  status = U_ZERO_ERROR;
     977             :     int32_t     ulen;
     978             : 
     979       10656 :     status = U_ZERO_ERROR;  /* NOTE(review): redundant with the initializer above */
     980       10656 :     ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
     981       10656 :     if (U_FAILURE(status))
     982           0 :         ereport(ERROR,
     983             :                 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
     984       10656 :     return ulen;
     985             : }
     986             : 
     987             : /*
     988             :  * Parse collation attributes from the given locale string and apply them to
     989             :  * the open collator.
     990             :  *
     991             :  * First, the locale string is canonicalized to an ICU format locale ID such
     992             :  * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
     993             :  * the key-value arguments. Failures are reported to the caller via *status.
     994             :  *
     995             :  * Starting with ICU version 54, the attributes are processed automatically by
     996             :  * ucol_open(), so this is only necessary for emulating this behavior on older
     997             :  * versions.
     998             :  */
     999             : pg_attribute_unused()
    1000             : static void
    1001           0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
    1002             :                              UErrorCode *status)
    1003             : {
    1004             :     int32_t     len;
    1005             :     char       *icu_locale_id;
    1006             :     char       *lower_str;
    1007             :     char       *str;
    1008             :     char       *token;
    1009             : 
    1010             :     /*
    1011             :      * The input locale may be a BCP 47 language tag, e.g.
    1012             :      * "und-u-kc-ks-level1", which expresses the same attributes in a
    1013             :      * different form. It will be converted to the equivalent ICU format
    1014             :      * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
    1015             :      * uloc_canonicalize().
    1016             :      */
    1017           0 :     *status = U_ZERO_ERROR;
    1018           0 :     len = uloc_canonicalize(loc, NULL, 0, status);  /* preflight: only the needed length is used */
    1019           0 :     icu_locale_id = palloc(len + 1);
    1020           0 :     *status = U_ZERO_ERROR;
    1021           0 :     len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
    1022           0 :     if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
    1023           0 :         return;     /* NOTE(review): icu_locale_id is not pfree'd on this path */
    1024             : 
    1025           0 :     lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
    1026             : 
    1027           0 :     pfree(icu_locale_id);
    1028             : 
    1029           0 :     str = strchr(lower_str, '@');
    1030           0 :     if (!str)
    1031           0 :         return;     /* no attribute section; NOTE(review): lower_str is not pfree'd on this path */
    1032           0 :     str++;
    1033             : 
    1034           0 :     while ((token = strsep(&str, ";")))
    1035             :     {
    1036           0 :         char       *e = strchr(token, '=');
    1037             : 
    1038           0 :         if (e)
    1039             :         {
    1040             :             char       *name;
    1041             :             char       *value;
    1042             :             UColAttribute uattr;
    1043             :             UColAttributeValue uvalue;
    1044             : 
    1045           0 :             *status = U_ZERO_ERROR;
    1046             : 
    1047           0 :             *e = '\0';
    1048           0 :             name = token;
    1049           0 :             value = e + 1;
    1050             : 
    1051             :             /*
    1052             :              * See attribute name and value lists in ICU i18n/coll.cpp
    1053             :              */
    1054           0 :             if (strcmp(name, "colstrength") == 0)
    1055           0 :                 uattr = UCOL_STRENGTH;
    1056           0 :             else if (strcmp(name, "colbackwards") == 0)
    1057           0 :                 uattr = UCOL_FRENCH_COLLATION;
    1058           0 :             else if (strcmp(name, "colcaselevel") == 0)
    1059           0 :                 uattr = UCOL_CASE_LEVEL;
    1060           0 :             else if (strcmp(name, "colcasefirst") == 0)
    1061           0 :                 uattr = UCOL_CASE_FIRST;
    1062           0 :             else if (strcmp(name, "colalternate") == 0)
    1063           0 :                 uattr = UCOL_ALTERNATE_HANDLING;
    1064           0 :             else if (strcmp(name, "colnormalization") == 0)
    1065           0 :                 uattr = UCOL_NORMALIZATION_MODE;
    1066           0 :             else if (strcmp(name, "colnumeric") == 0)
    1067           0 :                 uattr = UCOL_NUMERIC_COLLATION;
    1068             :             else
    1069             :                 /* unknown attribute name: skip this token, keep parsing */
    1070           0 :                 continue;
    1071             : 
    1072           0 :             if (strcmp(value, "primary") == 0)
    1073           0 :                 uvalue = UCOL_PRIMARY;
    1074           0 :             else if (strcmp(value, "secondary") == 0)
    1075           0 :                 uvalue = UCOL_SECONDARY;
    1076           0 :             else if (strcmp(value, "tertiary") == 0)
    1077           0 :                 uvalue = UCOL_TERTIARY;
    1078           0 :             else if (strcmp(value, "quaternary") == 0)
    1079           0 :                 uvalue = UCOL_QUATERNARY;
    1080           0 :             else if (strcmp(value, "identical") == 0)
    1081           0 :                 uvalue = UCOL_IDENTICAL;
    1082           0 :             else if (strcmp(value, "no") == 0)
    1083           0 :                 uvalue = UCOL_OFF;
    1084           0 :             else if (strcmp(value, "yes") == 0)
    1085           0 :                 uvalue = UCOL_ON;
    1086           0 :             else if (strcmp(value, "shifted") == 0)
    1087           0 :                 uvalue = UCOL_SHIFTED;
    1088           0 :             else if (strcmp(value, "non-ignorable") == 0)
    1089           0 :                 uvalue = UCOL_NON_IGNORABLE;
    1090           0 :             else if (strcmp(value, "lower") == 0)
    1091           0 :                 uvalue = UCOL_LOWER_FIRST;
    1092           0 :             else if (strcmp(value, "upper") == 0)
    1093           0 :                 uvalue = UCOL_UPPER_FIRST;
    1094             :             else
    1095             :             {
    1096           0 :                 *status = U_ILLEGAL_ARGUMENT_ERROR;
    1097           0 :                 break;
    1098             :             }
    1099             : 
    1100           0 :             ucol_setAttribute(collator, uattr, uvalue, status);
    1101             :         }
    1102             :     }
    1103             : 
    1104           0 :     pfree(lower_str);
    1105             : }
    1106             : 
    1107             : #endif                          /* USE_ICU */

Generated by: LCOV version 1.16