LCOV - code coverage report
Current view: top level - src/backend/utils/adt - pg_locale_icu.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 177 293 60.4 %
Date: 2025-04-24 12:15:10 Functions: 19 22 86.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-----------------------------------------------------------------------
       2             :  *
       3             :  * PostgreSQL locale utilities for ICU
       4             :  *
       5             :  * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
       6             :  *
       7             :  * src/backend/utils/adt/pg_locale_icu.c
       8             :  *
       9             :  *-----------------------------------------------------------------------
      10             :  */
      11             : 
      12             : #include "postgres.h"
      13             : 
      14             : #ifdef USE_ICU
      15             : #include <unicode/ucnv.h>
      16             : #include <unicode/ustring.h>
      17             : 
      18             : /*
      19             :  * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
      20             :  * (see
      21             :  * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
      22             :  */
      23             : #if U_ICU_VERSION_MAJOR_NUM >= 53
      24             : #define HAVE_UCOL_STRCOLLUTF8 1
      25             : #else
      26             : #undef HAVE_UCOL_STRCOLLUTF8
      27             : #endif
      28             : 
      29             : #endif
      30             : 
      31             : #include "access/htup_details.h"
      32             : #include "catalog/pg_database.h"
      33             : #include "catalog/pg_collation.h"
      34             : #include "mb/pg_wchar.h"
      35             : #include "miscadmin.h"
      36             : #include "utils/builtins.h"
      37             : #include "utils/formatting.h"
      38             : #include "utils/memutils.h"
      39             : #include "utils/pg_locale.h"
      40             : #include "utils/syscache.h"
      41             : 
      42             : /*
      43             :  * Size of stack buffer to use for string transformations, used to avoid heap
      44             :  * allocations in typical cases. This should be large enough that most strings
      45             :  * will fit, but small enough that we feel comfortable putting it on the
      46             :  * stack.
      47             :  */
      48             : #define     TEXTBUFLEN          1024
      49             : 
                       /*
                        * Prototypes for the functions this file defines without static linkage:
                        * the locale constructor (create_pg_locale_icu) and the four case-mapping
                        * entry points.  They are declared outside the USE_ICU guard.
                        */
      50             : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
      51             : extern size_t strlower_icu(char *dest, size_t destsize, const char *src,
      52             :                            ssize_t srclen, pg_locale_t locale);
      53             : extern size_t strtitle_icu(char *dest, size_t destsize, const char *src,
      54             :                            ssize_t srclen, pg_locale_t locale);
      55             : extern size_t strupper_icu(char *dest, size_t destsize, const char *src,
      56             :                            ssize_t srclen, pg_locale_t locale);
      57             : extern size_t strfold_icu(char *dest, size_t destsize, const char *src,
      58             :                           ssize_t srclen, pg_locale_t locale);
      59             : 
      60             : #ifdef USE_ICU
      61             : 
                       /* Non-static ICU helpers defined in this file (only when USE_ICU is set). */
      62             : extern UCollator *pg_ucol_open(const char *loc_str);
      63             : 
      64             : static size_t strnxfrm_icu(char *dest, size_t destsize,
      65             :                            const char *src, ssize_t srclen,
      66             :                            pg_locale_t locale);
      67             : extern char *get_collation_actual_version_icu(const char *collcollate);
      68             : 
                       /*
                        * Signature of the case-mapping callbacks handed to icu_convert_case().
                        * In this file those are ICU's u_strToLower and u_strToUpper, plus the
                        * local wrappers u_strToTitle_default_BI and u_strFoldCase_default.
                        */
      69             : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
      70             :                                      const UChar *src, int32_t srcLength,
      71             :                                      const char *locale,
      72             :                                      UErrorCode *pErrorCode);
      73             : 
      74             : /*
      75             :  * Converter object for converting between ICU's UChar strings and C strings
      76             :  * in database encoding.  Since the database encoding doesn't change, we only
      77             :  * need one of these per session.
                        *
                        * Starts out NULL.  strnxfrm_icu() and icu_to_uchar() call
                        * init_icu_converter() before they use it.
      78             :  */
      79             : static UConverter *icu_converter = NULL;
      80             : 
                       /*
                        * Forward declarations of the file-local helpers.  strncoll_icu_utf8 is
                        * only declared when HAVE_UCOL_STRCOLLUTF8 is defined (ICU 53 or later,
                        * per the version check at the top of the file).
                        */
      81             : static UCollator *make_icu_collator(const char *iculocstr,
      82             :                                     const char *icurules);
      83             : static int  strncoll_icu(const char *arg1, ssize_t len1,
      84             :                          const char *arg2, ssize_t len2,
      85             :                          pg_locale_t locale);
      86             : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
      87             :                                   const char *src, ssize_t srclen,
      88             :                                   pg_locale_t locale);
      89             : #ifdef HAVE_UCOL_STRCOLLUTF8
      90             : static int  strncoll_icu_utf8(const char *arg1, ssize_t len1,
      91             :                               const char *arg2, ssize_t len2,
      92             :                               pg_locale_t locale);
      93             : #endif
      94             : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
      95             :                                        const char *src, ssize_t srclen,
      96             :                                        pg_locale_t locale);
      97             : static void init_icu_converter(void);
      98             : static size_t uchar_length(UConverter *converter,
      99             :                            const char *str, int32_t len);
     100             : static int32_t uchar_convert(UConverter *converter,
     101             :                              UChar *dest, int32_t destlen,
     102             :                              const char *src, int32_t srclen);
     103             : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
     104             :                             size_t nbytes);
     105             : static size_t icu_from_uchar(char *dest, size_t destsize,
     106             :                              const UChar *buff_uchar, int32_t len_uchar);
     107             : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
     108             :                                          UErrorCode *status);
     109             : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
     110             :                                 UChar **buff_dest, UChar *buff_source,
     111             :                                 int32_t len_source);
                       /* The next two match ICU_Convert_Func so they can be passed as callbacks. */
     112             : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
     113             :                                        const UChar *src, int32_t srcLength,
     114             :                                        const char *locale,
     115             :                                        UErrorCode *pErrorCode);
     116             : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
     117             :                                      const UChar *src, int32_t srcLength,
     118             :                                      const char *locale,
     119             :                                      UErrorCode *pErrorCode);
     120             : 
                       /*
                        * Collation method table installed by create_pg_locale_icu() when the
                        * database encoding is not UTF8.
                        */
     121             : static const struct collate_methods collate_methods_icu = {
     122             :     .strncoll = strncoll_icu,
     123             :     .strnxfrm = strnxfrm_icu,
     124             :     .strnxfrm_prefix = strnxfrm_prefix_icu,
     125             :     .strxfrm_is_safe = true,
     126             : };
     127             : 
                       /*
                        * Collation method table installed by create_pg_locale_icu() when the
                        * database encoding is UTF8.  It uses the UTF8-specific prefix transform,
                        * and the UTF8-specific comparison when HAVE_UCOL_STRCOLLUTF8 is defined;
                        * otherwise comparison falls back to strncoll_icu.
                        */
     128             : static const struct collate_methods collate_methods_icu_utf8 = {
     129             : #ifdef HAVE_UCOL_STRCOLLUTF8
     130             :     .strncoll = strncoll_icu_utf8,
     131             : #else
     132             :     .strncoll = strncoll_icu,
     133             : #endif
     134             :     .strnxfrm = strnxfrm_icu,
     135             :     .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
     136             :     .strxfrm_is_safe = true,
     137             : };
     138             : 
     139             : #endif
     140             : 
                       /*
                        * create_pg_locale_icu
                        *
                        * Build a pg_locale_t for the ICU collation 'collid'.  The result struct
                        * and its copy of the locale string are allocated in 'context'.
                        *
                        * For DEFAULT_COLLATION_OID the locale string and the optional ICU rules
                        * come from the current database's pg_database row; for any other OID
                        * they come from the pg_collation row, together with the deterministic
                        * flag.  A failed syscache lookup raises an ERROR.
                        *
                        * When USE_ICU is not defined, this only raises
                        * ERRCODE_FEATURE_NOT_SUPPORTED.
                        */
     141             : pg_locale_t
     142         212 : create_pg_locale_icu(Oid collid, MemoryContext context)
     143             : {
     144             : #ifdef USE_ICU
     145             :     bool        deterministic;
     146             :     const char *iculocstr;
     147         212 :     const char *icurules = NULL;
     148             :     UCollator  *collator;
     149             :     pg_locale_t result;
     150             : 
     151         212 :     if (collid == DEFAULT_COLLATION_OID)
     152             :     {
     153             :         HeapTuple   tp;
     154             :         Datum       datum;
     155             :         bool        isnull;
     156             : 
     157          26 :         tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
     158          26 :         if (!HeapTupleIsValid(tp))
     159           0 :             elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
     160             : 
     161             :         /* default database collation is always deterministic */
     162          26 :         deterministic = true;
     163          26 :         datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
     164             :                                        Anum_pg_database_datlocale);
     165          26 :         iculocstr = TextDatumGetCString(datum);
                               /* rules are optional: icurules stays NULL when the column is null */
     166          26 :         datum = SysCacheGetAttr(DATABASEOID, tp,
     167             :                                 Anum_pg_database_daticurules, &isnull);
     168          26 :         if (!isnull)
     169           0 :             icurules = TextDatumGetCString(datum);
     170             : 
     171          26 :         ReleaseSysCache(tp);
     172             :     }
     173             :     else
     174             :     {
     175             :         Form_pg_collation collform;
     176             :         HeapTuple   tp;
     177             :         Datum       datum;
     178             :         bool        isnull;
     179             : 
     180         186 :         tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
     181         186 :         if (!HeapTupleIsValid(tp))
     182           0 :             elog(ERROR, "cache lookup failed for collation %u", collid);
     183         186 :         collform = (Form_pg_collation) GETSTRUCT(tp);
     184         186 :         deterministic = collform->collisdeterministic;
     185         186 :         datum = SysCacheGetAttrNotNull(COLLOID, tp,
     186             :                                        Anum_pg_collation_colllocale);
     187         186 :         iculocstr = TextDatumGetCString(datum);
                               /* rules are optional: icurules stays NULL when the column is null */
     188         186 :         datum = SysCacheGetAttr(COLLOID, tp,
     189             :                                 Anum_pg_collation_collicurules, &isnull);
     190         186 :         if (!isnull)
     191          12 :             icurules = TextDatumGetCString(datum);
     192             : 
     193         186 :         ReleaseSysCache(tp);
     194             :     }
     195             : 
                           /* open the collator first; the result struct is allocated only afterwards */
     196         212 :     collator = make_icu_collator(iculocstr, icurules);
     197             : 
     198         202 :     result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
     199         202 :     result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
     200         202 :     result->info.icu.ucol = collator;
     201         202 :     result->provider = COLLPROVIDER_ICU;
     202         202 :     result->deterministic = deterministic;
     203         202 :     result->collate_is_c = false;
     204         202 :     result->ctype_is_c = false;
                           /* choose the method table that matches the database encoding */
     205         202 :     if (GetDatabaseEncoding() == PG_UTF8)
     206         202 :         result->collate = &collate_methods_icu_utf8;
     207             :     else
     208           0 :         result->collate = &collate_methods_icu;
     209             : 
     210         202 :     return result;
     211             : #else
     212             :     /* could get here if a collation was created by a build with ICU */
     213             :     ereport(ERROR,
     214             :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     215             :              errmsg("ICU is not supported in this build")));
     216             : 
     217             :     return NULL;
     218             : #endif
     219             : }
     220             : 
     221             : #ifdef USE_ICU
     222             : 
     223             : /*
     224             :  * Wrapper around ucol_open() to handle API differences for older ICU
     225             :  * versions.
     226             :  *
     227             :  * Ensure that no path leaks a UCollator.
                        *
                        * loc_str must not be NULL; passing NULL raises an ERROR.  An ERROR is also
                        * raised when the collator cannot be opened, and the message always quotes
                        * the caller's original string even if a rewritten one was used.
                        *
                        * Returns the opened collator.  Callers in this file either keep it or
                        * release it with ucol_close().
                        *
                        * NOTE(review): fixed_str is pfree'd only on the success path; on the
                        * ereport(ERROR) paths it is presumably left to memory-context cleanup —
                        * confirm.
     228             :  */
     229             : UCollator *
     230       74430 : pg_ucol_open(const char *loc_str)
     231             : {
     232             :     UCollator  *collator;
     233             :     UErrorCode  status;
     234       74430 :     const char *orig_str = loc_str;
     235       74430 :     char       *fixed_str = NULL;
     236             : 
     237             :     /*
     238             :      * Must never open default collator, because it depends on the environment
     239             :      * and may change at any time. Should not happen, but check here to catch
     240             :      * bugs that might be hard to catch otherwise.
     241             :      *
     242             :      * NB: the default collator is not the same as the collator for the root
     243             :      * locale. The root locale may be specified as the empty string, "und", or
     244             :      * "root". The default collator is opened by passing NULL to ucol_open().
     245             :      */
     246       74430 :     if (loc_str == NULL)
     247           0 :         elog(ERROR, "opening default collator is not supported");
     248             : 
     249             :     /*
     250             :      * In ICU versions 54 and earlier, "und" is not a recognized spelling of
     251             :      * the root locale. If the first component of the locale is "und", replace
     252             :      * with "root" before opening.
                            *
                            * U_ICU_VERSION_MAJOR_NUM is a preprocessor constant, so this test is
                            * decided at build time.
     253             :      */
     254             :     if (U_ICU_VERSION_MAJOR_NUM < 55)
     255             :     {
     256             :         char        lang[ULOC_LANG_CAPACITY];
     257             : 
     258             :         status = U_ZERO_ERROR;
     259             :         uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
     260             :         if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
     261             :         {
     262             :             ereport(ERROR,
     263             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     264             :                      errmsg("could not get language from locale \"%s\": %s",
     265             :                             loc_str, u_errorName(status))));
     266             :         }
     267             : 
     268             :         if (strcmp(lang, "und") == 0)
     269             :         {
                                   /* keep everything after the leading "und" and prepend "root" */
     270             :             const char *remainder = loc_str + strlen("und");
     271             : 
     272             :             fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
     273             :             strcpy(fixed_str, "root");
     274             :             strcat(fixed_str, remainder);
     275             : 
     276             :             loc_str = fixed_str;
     277             :         }
     278             :     }
     279             : 
     280       74430 :     status = U_ZERO_ERROR;
     281       74430 :     collator = ucol_open(loc_str, &status);
     282       74430 :     if (U_FAILURE(status))
     283          12 :         ereport(ERROR,
     284             :         /* use original string for error report */
     285             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     286             :                  errmsg("could not open collator for locale \"%s\": %s",
     287             :                         orig_str, u_errorName(status))));
     288             : 
                           /*
                            * NOTE(review): icu_set_collation_attributes() is defined outside this
                            * view; presumably it applies attributes from loc_str that ICU older
                            * than 54 does not process itself — confirm.
                            */
     289             :     if (U_ICU_VERSION_MAJOR_NUM < 54)
     290             :     {
     291             :         status = U_ZERO_ERROR;
     292             :         icu_set_collation_attributes(collator, loc_str, &status);
     293             : 
     294             :         /*
     295             :          * Pretend the error came from ucol_open(), for consistent error
     296             :          * message across ICU versions.
     297             :          */
     298             :         if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
     299             :         {
                                   /* close the collator before erroring out so it is not leaked */
     300             :             ucol_close(collator);
     301             :             ereport(ERROR,
     302             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     303             :                      errmsg("could not open collator for locale \"%s\": %s",
     304             :                             orig_str, u_errorName(status))));
     305             :         }
     306             :     }
     307             : 
     308       74418 :     if (fixed_str != NULL)
     309           0 :         pfree(fixed_str);
     310             : 
     311       74418 :     return collator;
     312             : }
     313             : 
     314             : /*
     315             :  * Create a UCollator with the given locale string and rules.
     316             :  *
     317             :  * Ensure that no path leaks a UCollator.
                        *
                        * With icurules == NULL this is just pg_ucol_open(iculocstr).  Otherwise
                        * the locale's standard rules and the caller's rules are concatenated and
                        * a new collator is opened from the combined rule string; the temporary
                        * standard collator is closed before that.  Raises an ERROR on out of
                        * memory or when ICU rejects the combined rules.
                        *
                        * NOTE(review): my_rules and all_rules are not pfree'd in this function;
                        * presumably they are left to memory-context cleanup — confirm.
     318             :  */
     319             : static UCollator *
     320         212 : make_icu_collator(const char *iculocstr, const char *icurules)
     321             : {
     322         212 :     if (!icurules)
     323             :     {
     324             :         /* simple case without rules */
     325         200 :         return pg_ucol_open(iculocstr);
     326             :     }
     327             :     else
     328             :     {
     329             :         UCollator  *collator_std_rules;
     330             :         UCollator  *collator_all_rules;
     331             :         const UChar *std_rules;
     332             :         UChar      *my_rules;
     333             :         UChar      *all_rules;
     334             :         int32_t     length;
     335             :         int32_t     total;
     336             :         UErrorCode  status;
     337             : 
     338             :         /*
     339             :          * If rules are specified, we extract the rules of the standard
     340             :          * collation, add our own rules, and make a new collator with the
     341             :          * combined rules.
     342             :          */
     343          12 :         icu_to_uchar(&my_rules, icurules, strlen(icurules));
     344             : 
     345          12 :         collator_std_rules = pg_ucol_open(iculocstr);
     346             : 
                               /* 'length' is filled in here but not read again; u_strlen() is used below */
     347          12 :         std_rules = ucol_getRules(collator_std_rules, &length);
     348             : 
                               /* room for both rule strings plus the terminating NUL */
     349          12 :         total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
     350             : 
     351             :         /* avoid leaking collator on OOM */
     352          12 :         all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
     353          12 :         if (!all_rules)
     354             :         {
     355           0 :             ucol_close(collator_std_rules);
     356           0 :             ereport(ERROR,
     357             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
     358             :                      errmsg("out of memory")));
     359             :         }
     360             : 
     361          12 :         u_strcpy(all_rules, std_rules);
     362          12 :         u_strcat(all_rules, my_rules);
     363             : 
                               /* std_rules has been copied, so the standard collator can go */
     364          12 :         ucol_close(collator_std_rules);
     365             : 
     366          12 :         status = U_ZERO_ERROR;
     367          12 :         collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
     368             :                                             UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
     369             :                                             NULL, &status);
     370          12 :         if (U_FAILURE(status))
     371             :         {
     372           6 :             ereport(ERROR,
     373             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     374             :                      errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
     375             :                             iculocstr, icurules, u_errorName(status))));
     376             :         }
     377             : 
     378           6 :         return collator_all_rules;
     379             :     }
     380             : }
     381             : 
                       /*
                        * strlower_icu
                        *
                        * Lower-case 'src' with ICU's u_strToLower and store the result in 'dest'.
                        *
                        * The input is converted to UChars with icu_to_uchar(), mapped through
                        * icu_convert_case(), and converted back by icu_from_uchar(), which is
                        * given 'dest' and 'destsize'.  Both temporary UChar buffers are pfree'd
                        * before returning.
                        *
                        * Returns the value reported by icu_from_uchar().
                        * NOTE(review): presumably that is the byte length of the converted
                        * result — confirm against icu_from_uchar().
                        *
                        * NOTE(review): 'srclen' (ssize_t) is passed to icu_to_uchar()'s size_t
                        * 'nbytes' parameter — confirm how a negative srclen is meant to behave.
                        */
     382             : size_t
     383         528 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     384             :              pg_locale_t locale)
     385             : {
     386             :     int32_t     len_uchar;
     387             :     int32_t     len_conv;
     388             :     UChar      *buff_uchar;
     389             :     UChar      *buff_conv;
     390             :     size_t      result_len;
     391             : 
     392         528 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     393         528 :     len_conv = icu_convert_case(u_strToLower, locale,
     394             :                                 &buff_conv, buff_uchar, len_uchar);
     395         528 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     396         528 :     pfree(buff_uchar);
     397         528 :     pfree(buff_conv);
     398             : 
     399         528 :     return result_len;
     400             : }
     401             : 
                       /*
                        * strtitle_icu
                        *
                        * Title-case 'src' using the local wrapper u_strToTitle_default_BI and
                        * store the result in 'dest'.
                        *
                        * The input is converted to UChars with icu_to_uchar(), mapped through
                        * icu_convert_case(), and converted back by icu_from_uchar(), which is
                        * given 'dest' and 'destsize'.  Both temporary UChar buffers are pfree'd
                        * before returning.
                        *
                        * Returns the value reported by icu_from_uchar().
                        * NOTE(review): presumably that is the byte length of the converted
                        * result — confirm against icu_from_uchar().
                        */
     402             : size_t
     403          30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     404             :              pg_locale_t locale)
     405             : {
     406             :     int32_t     len_uchar;
     407             :     int32_t     len_conv;
     408             :     UChar      *buff_uchar;
     409             :     UChar      *buff_conv;
     410             :     size_t      result_len;
     411             : 
     412          30 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     413          30 :     len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
     414             :                                 &buff_conv, buff_uchar, len_uchar);
     415          30 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     416          30 :     pfree(buff_uchar);
     417          30 :     pfree(buff_conv);
     418             : 
     419          30 :     return result_len;
     420             : }
     421             : 
                       /*
                        * strupper_icu
                        *
                        * Upper-case 'src' with ICU's u_strToUpper and store the result in 'dest'.
                        *
                        * The input is converted to UChars with icu_to_uchar(), mapped through
                        * icu_convert_case(), and converted back by icu_from_uchar(), which is
                        * given 'dest' and 'destsize'.  Both temporary UChar buffers are pfree'd
                        * before returning.
                        *
                        * Returns the value reported by icu_from_uchar().
                        * NOTE(review): presumably that is the byte length of the converted
                        * result — confirm against icu_from_uchar().
                        */
     422             : size_t
     423          54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     424             :              pg_locale_t locale)
     425             : {
     426             :     int32_t     len_uchar;
     427             :     int32_t     len_conv;
     428             :     UChar      *buff_uchar;
     429             :     UChar      *buff_conv;
     430             :     size_t      result_len;
     431             : 
     432          54 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     433          54 :     len_conv = icu_convert_case(u_strToUpper, locale,
     434             :                                 &buff_conv, buff_uchar, len_uchar);
     435          54 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     436          54 :     pfree(buff_uchar);
     437          54 :     pfree(buff_conv);
     438             : 
     439          54 :     return result_len;
     440             : }
     441             : 
                       /*
                        * strfold_icu
                        *
                        * Case-fold 'src' using the local wrapper u_strFoldCase_default and store
                        * the result in 'dest'.
                        *
                        * The input is converted to UChars with icu_to_uchar(), mapped through
                        * icu_convert_case(), and converted back by icu_from_uchar(), which is
                        * given 'dest' and 'destsize'.  Both temporary UChar buffers are pfree'd
                        * before returning.
                        *
                        * Returns the value reported by icu_from_uchar().
                        * NOTE(review): presumably that is the byte length of the converted
                        * result — confirm against icu_from_uchar().
                        */
     442             : size_t
     443          12 : strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     444             :             pg_locale_t locale)
     445             : {
     446             :     int32_t     len_uchar;
     447             :     int32_t     len_conv;
     448             :     UChar      *buff_uchar;
     449             :     UChar      *buff_conv;
     450             :     size_t      result_len;
     451             : 
     452          12 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     453          12 :     len_conv = icu_convert_case(u_strFoldCase_default, locale,
     454             :                                 &buff_conv, buff_uchar, len_uchar);
     455          12 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     456          12 :     pfree(buff_uchar);
     457          12 :     pfree(buff_conv);
     458             : 
     459          12 :     return result_len;
     460             : }
     461             : 
     462             : /*
     463             :  * strncoll_icu_utf8
     464             :  *
     465             :  * Compare two strings with ucol_strcollUTF8() using the locale's ICU
     466             :  * collator.  The database encoding must be UTF8 (asserted below).  An
     467             :  * argument length of -1 means the string is NUL-terminated.
                        *
                        * Only compiled when HAVE_UCOL_STRCOLLUTF8 is defined.  Returns ICU's
                        * comparison result as an int; raises an ERROR if ICU reports a failure
                        * status.
     468             :  */
     469             : #ifdef HAVE_UCOL_STRCOLLUTF8
     470             : int
     471       25470 : strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
     472             :                   pg_locale_t locale)
     473             : {
     474             :     int         result;
     475             :     UErrorCode  status;
     476             : 
     477             :     Assert(locale->provider == COLLPROVIDER_ICU);
     478             : 
     479             :     Assert(GetDatabaseEncoding() == PG_UTF8);
     480             : 
     481       25470 :     status = U_ZERO_ERROR;
     482       25470 :     result = ucol_strcollUTF8(locale->info.icu.ucol,
     483             :                               arg1, len1,
     484             :                               arg2, len2,
     485             :                               &status);
     486       25470 :     if (U_FAILURE(status))
     487           0 :         ereport(ERROR,
     488             :                 (errmsg("collation failed: %s", u_errorName(status))));
     489             : 
     490       25470 :     return result;
     491             : }
     492             : #endif
     493             : 
     494             : /*
                        * strnxfrm_icu
                        *
                        * Produce the ICU sort key for 'src' via ucol_getSortKey(), which is
                        * given 'dest' and 'destsize', and return the key length not counting
                        * the NUL terminator.  The return value may be >= destsize; the final
                        * Assert only expects dest to be NUL-terminated at that offset when the
                        * result is smaller than destsize.
                        *
                        * 'src' is first converted to UChars with the session converter.  The
                        * UChar buffer is a TEXTBUFLEN-byte stack array unless the converted
                        * string needs more, in which case it is palloc'd and freed before
                        * returning.
                        *
                        * 'srclen' of -1 means the string is NUL-terminated.
                        */
     495             : size_t
     496       10020 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     497             :              pg_locale_t locale)
     498             : {
     499             :     char        sbuf[TEXTBUFLEN];
     500       10020 :     char       *buf = sbuf;
     501             :     UChar      *uchar;
     502             :     int32_t     ulen;
     503             :     size_t      uchar_bsize;
     504             :     Size        result_bsize;
     505             : 
     506             :     Assert(locale->provider == COLLPROVIDER_ICU);
     507             : 
     508       10020 :     init_icu_converter();
     509             : 
                           /* first pass: find out how many UChars the source needs */
     510       10020 :     ulen = uchar_length(icu_converter, src, srclen);
     511             : 
                           /* bytes needed, including one extra UChar for the terminator */
     512       10020 :     uchar_bsize = (ulen + 1) * sizeof(UChar);
     513             : 
     514       10020 :     if (uchar_bsize > TEXTBUFLEN)
     515           0 :         buf = palloc(uchar_bsize);
     516             : 
     517       10020 :     uchar = (UChar *) buf;
     518             : 
                           /* second pass: do the actual conversion into the chosen buffer */
     519       10020 :     ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
     520             : 
     521       10020 :     result_bsize = ucol_getSortKey(locale->info.icu.ucol,
     522             :                                    uchar, ulen,
     523             :                                    (uint8_t *) dest, destsize);
     524             : 
     525             :     /*
     526             :      * ucol_getSortKey() counts the nul-terminator in the result length, but
     527             :      * this function should not.
     528             :      */
     529             :     Assert(result_bsize > 0);
     530       10020 :     result_bsize--;
     531             : 
                           /* only free the buffer if it was heap-allocated above */
     532       10020 :     if (buf != sbuf)
     533           0 :         pfree(buf);
     534             : 
     535             :     /* if dest is defined, it should be nul-terminated */
     536             :     Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
     537             : 
     538       10020 :     return result_bsize;
     539             : }
     540             : 
     541             : /*
                        * strnxfrm_prefix_icu_utf8
                        *
                        * Generate a sort key prefix for 'src' by making a single call to
                        * ucol_nextSortKeyPart() over a UTF-8 character iterator, with 'dest' and
                        * 'destsize' as the output buffer.  The database encoding must be UTF8
                        * (asserted below).
                        *
                        * Returns the value reported by ucol_nextSortKeyPart(); raises an ERROR
                        * if ICU reports a failure status.
                        *
                        * 'srclen' of -1 means the string is NUL-terminated.
                        */
     542             : size_t
     543        1668 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
     544             :                          const char *src, ssize_t srclen,
     545             :                          pg_locale_t locale)
     546             : {
     547             :     size_t      result;
     548             :     UCharIterator iter;
     549             :     uint32_t    state[2];
     550             :     UErrorCode  status;
     551             : 
     552             :     Assert(locale->provider == COLLPROVIDER_ICU);
     553             : 
     554             :     Assert(GetDatabaseEncoding() == PG_UTF8);
     555             : 
     556        1668 :     uiter_setUTF8(&iter, src, srclen);
     557        1668 :     state[0] = state[1] = 0;    /* won't need that again */
     558        1668 :     status = U_ZERO_ERROR;
     559        1668 :     result = ucol_nextSortKeyPart(locale->info.icu.ucol,
     560             :                                   &iter,
     561             :                                   state,
     562             :                                   (uint8_t *) dest,
     563             :                                   destsize,
     564             :                                   &status);
     565        1668 :     if (U_FAILURE(status))
     566           0 :         ereport(ERROR,
     567             :                 (errmsg("sort key generation failed: %s",
     568             :                         u_errorName(status))));
     569             : 
     570        1668 :     return result;
     571             : }
     572             : 
     573             : char *                      /* returns a pstrdup'd copy of the version string of the collator opened for 'collcollate' */
     574       74068 : get_collation_actual_version_icu(const char *collcollate)
     575             : {
     576             :     UCollator  *collator;
     577             :     UVersionInfo versioninfo;
     578             :     char        buf[U_MAX_VERSION_STRING_LENGTH];
     579             : 
     580       74068 :     collator = pg_ucol_open(collcollate);   /* pg_ucol_open() is defined outside this view */
     581             : 
     582       74068 :     ucol_getVersion(collator, versioninfo);
     583       74068 :     ucol_close(collator);       /* the collator is only needed long enough to read its version */
     584             : 
     585       74068 :     u_versionToString(versioninfo, buf);
     586       74068 :     return pstrdup(buf);        /* buf is a local array, so hand back a copy */
     587             : }
     588             : 
     589             : /*
     590             :  * Convert a string in the database encoding into a string of UChars.
     591             :  *
     592             :  * The source string at buff is nbytes bytes long
     593             :  * (it needn't be nul-terminated).
     594             :  *
     595             :  * *buff_uchar receives a pointer to the palloc'd result string, and
     596             :  * the function's result is the number of UChars generated.
     597             :  *
     598             :  * The result string is nul-terminated, though most callers rely on the
     599             :  * result length instead.
     600             :  */
     601             : static int32_t
     602         636 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
     603             : {
     604             :     int32_t     len_uchar;
     605             : 
     606         636 :     init_icu_converter();
     607             : 
     608         636 :     len_uchar = uchar_length(icu_converter, buff, nbytes);  /* measuring pass; NOTE(review): nbytes (size_t) is narrowed to the helper's int32_t parameter */
     609             : 
     610         636 :     *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));   /* one extra UChar beyond the measured length */
     611         636 :     len_uchar = uchar_convert(icu_converter,
     612             :                               *buff_uchar, len_uchar + 1, buff, nbytes);
     613             : 
     614         636 :     return len_uchar;
     615             : }
     616             : 
     617             : /*
     618             :  * Convert a string of UChars into the database encoding.
     619             :  *
     620             :  * The source string at buff_uchar is of length len_uchar
     621             :  * (it needn't be nul-terminated)
     622             :  *
     623             :  * The output goes into the caller-supplied buffer 'dest' of 'destsize'
     624             :  * bytes, and the function's result is the converted length in bytes (not
     625             :  * counting nul). If that length plus one exceeds 'destsize', 'dest' is
     626             :  * not written and only the length from the measuring pass is returned.
     627             :  */
     628             : static size_t
     629         624 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
     630             : {
     631             :     UErrorCode  status;
     632             :     int32_t     len_result;
     633             : 
     634         624 :     init_icu_converter();
     635             : 
     636         624 :     status = U_ZERO_ERROR;
     637         624 :     len_result = ucnv_fromUChars(icu_converter, NULL, 0,
     638             :                                  buff_uchar, len_uchar, &status);   /* measuring pass: no output buffer */
     639         624 :     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)    /* overflow is tolerated because the capacity passed was 0 */
     640           0 :         ereport(ERROR,
     641             :                 (errmsg("%s failed: %s", "ucnv_fromUChars",
     642             :                         u_errorName(status))));
     643             : 
     644         624 :     if (len_result + 1 > destsize)  /* NOTE(review): int32_t compared against size_t, so the left side is converted to unsigned */
     645          60 :         return len_result;          /* does not fit: report the needed length, leave dest alone */
     646             : 
     647         564 :     status = U_ZERO_ERROR;
     648         564 :     len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
     649             :                                  buff_uchar, len_uchar, &status);
     650         564 :     if (U_FAILURE(status) ||
     651         564 :         status == U_STRING_NOT_TERMINATED_WARNING)      /* an unterminated result is treated as an error too */
     652           0 :         ereport(ERROR,
     653             :                 (errmsg("%s failed: %s", "ucnv_fromUChars",
     654             :                         u_errorName(status))));
     655             : 
     656         564 :     return len_result;
     657             : }
     658             : 
     659             : static int32_t              /* runs the case-mapping callback 'func' over buff_source; *buff_dest receives a palloc'd buffer and the result is the length 'func' reports */
     660         624 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
     661             :                  UChar **buff_dest, UChar *buff_source, int32_t len_source)
     662             : {
     663             :     UErrorCode  status;
     664             :     int32_t     len_dest;
     665             : 
     666         624 :     len_dest = len_source;      /* try first with same length */
     667         624 :     *buff_dest = palloc(len_dest * sizeof(**buff_dest));    /* NOTE(review): no extra slot is reserved for a terminator; callers presumably rely on the returned length */
     668         624 :     status = U_ZERO_ERROR;
     669         624 :     len_dest = func(*buff_dest, len_dest, buff_source, len_source,
     670             :                     mylocale->info.icu.locale, &status);
     671         624 :     if (status == U_BUFFER_OVERFLOW_ERROR)
     672             :     {
     673             :         /* try again, sizing the buffer from the length the first call returned */
     674          18 :         pfree(*buff_dest);
     675          18 :         *buff_dest = palloc(len_dest * sizeof(**buff_dest));
     676          18 :         status = U_ZERO_ERROR;
     677          18 :         len_dest = func(*buff_dest, len_dest, buff_source, len_source,
     678             :                         mylocale->info.icu.locale, &status);
     679             :     }
     680         624 :     if (U_FAILURE(status))      /* covers a failure from either attempt */
     681           0 :         ereport(ERROR,
     682             :                 (errmsg("case conversion failed: %s", u_errorName(status))));
     683         624 :     return len_dest;
     684             : }
     685             : 
     686             : static int32_t              /* adapter around u_strToTitle() with the six-argument shape icu_convert_case() calls; presumably meant to be used as an ICU_Convert_Func */
     687          30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
     688             :                         const UChar *src, int32_t srcLength,
     689             :                         const char *locale,
     690             :                         UErrorCode *pErrorCode)
     691             : {
     692          30 :     return u_strToTitle(dest, destCapacity, src, srcLength,
     693             :                         NULL, locale, pErrorCode);  /* NULL is the break-iterator argument, i.e. no custom iterator is supplied */
     694             : }
     695             : 
     696             : static int32_t              /* adapter around u_strFoldCase() that derives the folding option from the language of 'locale' */
     697          24 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
     698             :                       const UChar *src, int32_t srcLength,
     699             :                       const char *locale,
     700             :                       UErrorCode *pErrorCode)
     701             : {
     702          24 :     uint32      options = U_FOLD_CASE_DEFAULT;
     703             :     char        lang[3];    /* sized for a two-letter language code plus nul */
     704             :     UErrorCode  status;     /* local on purpose: a failed language lookup is not reported through pErrorCode */
     705             : 
     706             :     /*
     707             :      * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
     708             :      * folding does not accept a locale. Instead it just supports a single
     709             :      * option relevant to Turkic languages 'az' and 'tr'; check for those
     710             :      * languages to enable the option.
     711             :      */
     712          24 :     status = U_ZERO_ERROR;
     713          24 :     uloc_getLanguage(locale, lang, 3, &status);
     714          24 :     if (U_SUCCESS(status))  /* on failure the default folding option is silently kept */
     715             :     {
     716             :         /*
     717             :          * The option name is confusing, but it causes u_strFoldCase to use
     718             :          * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
     719             :          */
     720          24 :         if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
     721          12 :             options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
     722             :     }
     723             : 
     724          24 :     return u_strFoldCase(dest, destCapacity, src, srcLength,
     725             :                          options, pErrorCode);
     726             : }
     727             : 
     728             : /*
     729             :  * strncoll_icu
     730             :  *
     731             :  * Convert the arguments from the database encoding to UChar strings, then
     732             :  * call ucol_strcoll(). An argument length of -1 means that the string is
     733             :  * NUL-terminated.
     734             :  *
     735             :  * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
     736             :  * caller should call that instead.
     737             :  */
     738             : static int
     739           0 : strncoll_icu(const char *arg1, ssize_t len1,
     740             :              const char *arg2, ssize_t len2, pg_locale_t locale)
     741             : {
     742             :     char        sbuf[TEXTBUFLEN];
     743           0 :     char       *buf = sbuf;     /* starts on the stack buffer; replaced below if that is too small */
     744             :     int32_t     ulen1;
     745             :     int32_t     ulen2;
     746             :     size_t      bufsize1;
     747             :     size_t      bufsize2;
     748             :     UChar      *uchar1,
     749             :                *uchar2;
     750             :     int         result;
     751             : 
     752             :     Assert(locale->provider == COLLPROVIDER_ICU);
     753             : 
     754             :     /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
     755             : #ifdef HAVE_UCOL_STRCOLLUTF8
     756             :     Assert(GetDatabaseEncoding() != PG_UTF8);
     757             : #endif
     758             : 
     759           0 :     init_icu_converter();
     760             : 
     761           0 :     ulen1 = uchar_length(icu_converter, arg1, len1);    /* measuring passes; NOTE(review): ssize_t lengths are narrowed to the helper's int32_t parameter */
     762           0 :     ulen2 = uchar_length(icu_converter, arg2, len2);
     763             : 
     764           0 :     bufsize1 = (ulen1 + 1) * sizeof(UChar);     /* one extra UChar per string beyond the measured length */
     765           0 :     bufsize2 = (ulen2 + 1) * sizeof(UChar);
     766             : 
     767           0 :     if (bufsize1 + bufsize2 > TEXTBUFLEN)       /* both converted strings share a single buffer */
     768           0 :         buf = palloc(bufsize1 + bufsize2);
     769             : 
     770           0 :     uchar1 = (UChar *) buf;
     771           0 :     uchar2 = (UChar *) (buf + bufsize1);        /* second string starts right after the first one's bytes */
     772             : 
     773           0 :     ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
     774           0 :     ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
     775             : 
     776           0 :     result = ucol_strcoll(locale->info.icu.ucol,
     777             :                           uchar1, ulen1,
     778             :                           uchar2, ulen2);
     779             : 
     780           0 :     if (buf != sbuf)            /* free only if the heap buffer was used */
     781           0 :         pfree(buf);
     782             : 
     783           0 :     return result;
     784             : }
     785             : 
     786             : /* Convert 'src' from the database encoding to UChars, then write a sort-key prefix into 'dest' using ucol_nextSortKeyPart() with 'destsize' as the output capacity; returns that call's result. A 'srclen' of -1 means 'src' is NUL-terminated. */
     787             : static size_t
     788           0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
     789             :                     const char *src, ssize_t srclen,
     790             :                     pg_locale_t locale)
     791             : {
     792             :     char        sbuf[TEXTBUFLEN];
     793           0 :     char       *buf = sbuf;     /* starts on the stack buffer; replaced below if that is too small */
     794             :     UCharIterator iter;
     795             :     uint32_t    state[2];
     796             :     UErrorCode  status;
     797           0 :     int32_t     ulen = -1;
     798           0 :     UChar      *uchar = NULL;
     799             :     size_t      uchar_bsize;
     800             :     Size        result_bsize;
     801             : 
     802             :     Assert(locale->provider == COLLPROVIDER_ICU);
     803             : 
     804             :     /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
     805             :     Assert(GetDatabaseEncoding() != PG_UTF8);
     806             : 
     807           0 :     init_icu_converter();
     808             : 
     809           0 :     ulen = uchar_length(icu_converter, src, srclen);    /* measuring pass */
     810             : 
     811           0 :     uchar_bsize = (ulen + 1) * sizeof(UChar);   /* one extra UChar beyond the measured length */
     812             : 
     813           0 :     if (uchar_bsize > TEXTBUFLEN)
     814           0 :         buf = palloc(uchar_bsize);
     815             : 
     816           0 :     uchar = (UChar *) buf;
     817             : 
     818           0 :     ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
     819             : 
     820           0 :     uiter_setString(&iter, uchar, ulen);
     821           0 :     state[0] = state[1] = 0;    /* zeroed state; only one call is made here, so it is never carried over */
     822           0 :     status = U_ZERO_ERROR;
     823           0 :     result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
     824             :                                         &iter,
     825             :                                         state,
     826             :                                         (uint8_t *) dest,
     827             :                                         destsize,
     828             :                                         &status);
     829           0 :     if (U_FAILURE(status))
     830           0 :         ereport(ERROR,
     831             :                 (errmsg("sort key generation failed: %s",
     832             :                         u_errorName(status))));
     833             : 
     834           0 :     return result_bsize;        /* NOTE(review): a palloc'd buf is not pfree'd before returning, unlike strncoll_icu() */
     835             : }
     836             : 
     837             : static void                 /* opens the ICU converter for the database encoding on first use and stores it in icu_converter */
     838       11280 : init_icu_converter(void)
     839             : {
     840             :     const char *icu_encoding_name;
     841             :     UErrorCode  status;
     842             :     UConverter *conv;
     843             : 
     844       11280 :     if (icu_converter)          /* icu_converter is declared outside this view */
     845       11274 :         return;                 /* already done */
     846             : 
     847           6 :     icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
     848           6 :     if (!icu_encoding_name)
     849           0 :         ereport(ERROR,
     850             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     851             :                  errmsg("encoding \"%s\" not supported by ICU",
     852             :                         pg_encoding_to_char(GetDatabaseEncoding()))));
     853             : 
     854           6 :     status = U_ZERO_ERROR;
     855           6 :     conv = ucnv_open(icu_encoding_name, &status);
     856           6 :     if (U_FAILURE(status))
     857           0 :         ereport(ERROR,
     858             :                 (errmsg("could not open ICU converter for encoding \"%s\": %s",
     859             :                         icu_encoding_name, u_errorName(status))));
     860             : 
     861           6 :     icu_converter = conv;       /* stored only after both error checks above */
     862             : }
     863             : 
     864             : /*
     865             :  * Find length, in UChars, of given string if converted to UChar string.
     866             :  *
     867             :  * A length of -1 indicates that the input string is NUL-terminated.
     868             :  */
     869             : static size_t               /* NOTE(review): declared size_t, but the value returned is an int32_t and callers in this file store it in int32_t */
     870       10656 : uchar_length(UConverter *converter, const char *str, int32_t len)
     871             : {
     872       10656 :     UErrorCode  status = U_ZERO_ERROR;
     873             :     int32_t     ulen;
     874             : 
     875       10656 :     ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);    /* measuring pass: no output buffer */
     876       10656 :     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)     /* overflow is tolerated because the capacity passed was 0 */
     877           0 :         ereport(ERROR,
     878             :                 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
     879       10656 :     return ulen;
     880             : }
     881             : 
     882             : /*
     883             :  * Convert the given source string into a UChar string, stored in dest, and
     884             :  * return the length (in UChars). 'destlen' is passed to ucnv_toUChars() as
     885             :  * the capacity of dest; any failure status is raised with ereport(ERROR).
     886             :  * A srclen of -1 indicates that the input string is NUL-terminated.
     887             :  */
     888             : static int32_t
     889       10656 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
     890             :               const char *src, int32_t srclen)
     891             : {
     892       10656 :     UErrorCode  status = U_ZERO_ERROR;
     893             :     int32_t     ulen;
     894             : 
     895       10656 :     status = U_ZERO_ERROR;      /* redundant: status was already initialized at its declaration */
     896       10656 :     ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
     897       10656 :     if (U_FAILURE(status))      /* unlike uchar_length(), a buffer overflow is an error here */
     898           0 :         ereport(ERROR,
     899             :                 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
     900       10656 :     return ulen;
     901             : }
     902             : 
     903             : /*
     904             :  * Parse collation attributes from the given locale string and apply them to
     905             :  * the open collator.
     906             :  *
     907             :  * First, the locale string is canonicalized to an ICU format locale ID such
     908             :  * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
     909             :  * the key-value arguments. The outcome is reported to the caller via *status.
     910             :  *
     911             :  * Starting with ICU version 54, the attributes are processed automatically by
     912             :  * ucol_open(), so this is only necessary for emulating this behavior on older
     913             :  * versions.
     914             :  */
     915             : pg_attribute_unused()
     916             : static void
     917           0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
     918             :                              UErrorCode *status)
     919             : {
     920             :     int32_t     len;
     921             :     char       *icu_locale_id;
     922             :     char       *lower_str;
     923             :     char       *str;
     924             :     char       *token;
     925             : 
     926             :     /*
     927             :      * The input locale may be a BCP 47 language tag, e.g.
     928             :      * "und-u-kc-ks-level1", which expresses the same attributes in a
     929             :      * different form. It will be converted to the equivalent ICU format
     930             :      * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
     931             :      * uloc_canonicalize().
     932             :      */
     933           0 :     *status = U_ZERO_ERROR;
     934           0 :     len = uloc_canonicalize(loc, NULL, 0, status);  /* measuring pass: no output buffer; its status is overwritten below */
     935           0 :     icu_locale_id = palloc(len + 1);
     936           0 :     *status = U_ZERO_ERROR;
     937           0 :     len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
     938           0 :     if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
     939           0 :         return;                 /* NOTE(review): icu_locale_id is not pfree'd on this path */
     940             : 
     941           0 :     lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));  /* the strcmp() tests below use lower-case names and values */
     942             : 
     943           0 :     pfree(icu_locale_id);
     944             : 
     945           0 :     str = strchr(lower_str, '@');
     946           0 :     if (!str)
     947           0 :         return;                 /* no '@', so nothing to parse; NOTE(review): lower_str is not pfree'd on this path */
     948           0 :     str++;
     949             : 
     950           0 :     while ((token = strsep(&str, ";")))     /* walks the ';'-separated tokens of lower_str */
     951             :     {
     952           0 :         char       *e = strchr(token, '=');
     953             : 
     954           0 :         if (e)                  /* tokens without '=' are skipped */
     955             :         {
     956             :             char       *name;
     957             :             char       *value;
     958             :             UColAttribute uattr;
     959             :             UColAttributeValue uvalue;
     960             : 
     961           0 :             *status = U_ZERO_ERROR;     /* NOTE(review): this also discards any failure left by ucol_setAttribute() for the previous token */
     962             : 
     963           0 :             *e = '\0';          /* split the token in place into name and value */
     964           0 :             name = token;
     965           0 :             value = e + 1;
     966             : 
     967             :             /*
     968             :              * See attribute name and value lists in ICU i18n/coll.cpp
     969             :              */
     970           0 :             if (strcmp(name, "colstrength") == 0)
     971           0 :                 uattr = UCOL_STRENGTH;
     972           0 :             else if (strcmp(name, "colbackwards") == 0)
     973           0 :                 uattr = UCOL_FRENCH_COLLATION;
     974           0 :             else if (strcmp(name, "colcaselevel") == 0)
     975           0 :                 uattr = UCOL_CASE_LEVEL;
     976           0 :             else if (strcmp(name, "colcasefirst") == 0)
     977           0 :                 uattr = UCOL_CASE_FIRST;
     978           0 :             else if (strcmp(name, "colalternate") == 0)
     979           0 :                 uattr = UCOL_ALTERNATE_HANDLING;
     980           0 :             else if (strcmp(name, "colnormalization") == 0)
     981           0 :                 uattr = UCOL_NORMALIZATION_MODE;
     982           0 :             else if (strcmp(name, "colnumeric") == 0)
     983           0 :                 uattr = UCOL_NUMERIC_COLLATION;
     984             :             else
     985             :                 /* unknown attribute name: skip this token, its value is never examined */
     986           0 :                 continue;
     987             : 
     988           0 :             if (strcmp(value, "primary") == 0)  /* the value is mapped independently of which attribute was named */
     989           0 :                 uvalue = UCOL_PRIMARY;
     990           0 :             else if (strcmp(value, "secondary") == 0)
     991           0 :                 uvalue = UCOL_SECONDARY;
     992           0 :             else if (strcmp(value, "tertiary") == 0)
     993           0 :                 uvalue = UCOL_TERTIARY;
     994           0 :             else if (strcmp(value, "quaternary") == 0)
     995           0 :                 uvalue = UCOL_QUATERNARY;
     996           0 :             else if (strcmp(value, "identical") == 0)
     997           0 :                 uvalue = UCOL_IDENTICAL;
     998           0 :             else if (strcmp(value, "no") == 0)
     999           0 :                 uvalue = UCOL_OFF;
    1000           0 :             else if (strcmp(value, "yes") == 0)
    1001           0 :                 uvalue = UCOL_ON;
    1002           0 :             else if (strcmp(value, "shifted") == 0)
    1003           0 :                 uvalue = UCOL_SHIFTED;
    1004           0 :             else if (strcmp(value, "non-ignorable") == 0)
    1005           0 :                 uvalue = UCOL_NON_IGNORABLE;
    1006           0 :             else if (strcmp(value, "lower") == 0)
    1007           0 :                 uvalue = UCOL_LOWER_FIRST;
    1008           0 :             else if (strcmp(value, "upper") == 0)
    1009           0 :                 uvalue = UCOL_UPPER_FIRST;
    1010             :             else
    1011             :             {
    1012           0 :                 *status = U_ILLEGAL_ARGUMENT_ERROR;
    1013           0 :                 break;          /* unrecognized value: stop parsing and report it through *status */
    1014             :             }
    1015             : 
    1016           0 :             ucol_setAttribute(collator, uattr, uvalue, status);
    1017             :         }
    1018             :     }
    1019             : 
    1020           0 :     pfree(lower_str);
    1021             : }
    1022             : 
    1023             : #endif                          /* USE_ICU */

Generated by: LCOV version 1.14