LCOV - code coverage report
Current view: top level - src/backend/utils/adt - pg_locale_icu.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 177 293 60.4 %
Date: 2025-02-21 15:15:02 Functions: 19 22 86.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-----------------------------------------------------------------------
       2             :  *
       3             :  * PostgreSQL locale utilities for ICU
       4             :  *
       5             :  * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
       6             :  *
       7             :  * src/backend/utils/adt/pg_locale_icu.c
       8             :  *
       9             :  *-----------------------------------------------------------------------
      10             :  */
      11             : 
      12             : #include "postgres.h"
      13             : 
      14             : #ifdef USE_ICU
      15             : #include <unicode/ucnv.h>
      16             : #include <unicode/ustring.h>
      17             : 
      18             : /*
      19             :  * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
      20             :  * (see
      21             :  * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
      22             :  */
      23             : #if U_ICU_VERSION_MAJOR_NUM >= 53
      24             : #define HAVE_UCOL_STRCOLLUTF8 1
      25             : #else
      26             : #undef HAVE_UCOL_STRCOLLUTF8
      27             : #endif
      28             : 
      29             : #endif
      30             : 
      31             : #include "access/htup_details.h"
      32             : #include "catalog/pg_database.h"
      33             : #include "catalog/pg_collation.h"
      34             : #include "mb/pg_wchar.h"
      35             : #include "miscadmin.h"
      36             : #include "utils/builtins.h"
      37             : #include "utils/formatting.h"
      38             : #include "utils/memutils.h"
      39             : #include "utils/pg_locale.h"
      40             : #include "utils/syscache.h"
      41             : 
      42             : /*
      43             :  * Size of stack buffer to use for string transformations, used to avoid heap
      44             :  * allocations in typical cases. This should be large enough that most strings
      45             :  * will fit, but small enough that we feel comfortable putting it on the
      46             :  * stack.
      47             :  */
      48             : #define     TEXTBUFLEN          1024
      49             : 
      50             : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
      51             : extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
      52             :                            ssize_t srclen, pg_locale_t locale);
      53             : extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
      54             :                            ssize_t srclen, pg_locale_t locale);
      55             : extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
      56             :                            ssize_t srclen, pg_locale_t locale);
      57             : extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
      58             :                           ssize_t srclen, pg_locale_t locale);
      59             : 
      60             : #ifdef USE_ICU
      61             : 
      62             : extern UCollator *pg_ucol_open(const char *loc_str);
      63             : 
      64             : static int  strncoll_icu(const char *arg1, ssize_t len1,
      65             :                          const char *arg2, ssize_t len2,
      66             :                          pg_locale_t locale);
      67             : static size_t strnxfrm_icu(char *dest, size_t destsize,
      68             :                            const char *src, ssize_t srclen,
      69             :                            pg_locale_t locale);
      70             : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
      71             :                                   const char *src, ssize_t srclen,
      72             :                                   pg_locale_t locale);
      73             : extern char *get_collation_actual_version_icu(const char *collcollate);
      74             : 
      75             : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
      76             :                                      const UChar *src, int32_t srcLength,
      77             :                                      const char *locale,
      78             :                                      UErrorCode *pErrorCode);
      79             : 
      80             : /*
      81             :  * Converter object for converting between ICU's UChar strings and C strings
      82             :  * in database encoding.  Since the database encoding doesn't change, we only
      83             :  * need one of these per session.
      84             :  */
      85             : static UConverter *icu_converter = NULL;
      86             : 
      87             : static UCollator *make_icu_collator(const char *iculocstr,
      88             :                                     const char *icurules);
      89             : static int  strncoll_icu(const char *arg1, ssize_t len1,
      90             :                          const char *arg2, ssize_t len2,
      91             :                          pg_locale_t locale);
      92             : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
      93             :                                   const char *src, ssize_t srclen,
      94             :                                   pg_locale_t locale);
      95             : #ifdef HAVE_UCOL_STRCOLLUTF8
      96             : static int  strncoll_icu_utf8(const char *arg1, ssize_t len1,
      97             :                               const char *arg2, ssize_t len2,
      98             :                               pg_locale_t locale);
      99             : #endif
     100             : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
     101             :                                        const char *src, ssize_t srclen,
     102             :                                        pg_locale_t locale);
     103             : static void init_icu_converter(void);
     104             : static size_t uchar_length(UConverter *converter,
     105             :                            const char *str, int32_t len);
     106             : static int32_t uchar_convert(UConverter *converter,
     107             :                              UChar *dest, int32_t destlen,
     108             :                              const char *src, int32_t srclen);
     109             : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
     110             :                             size_t nbytes);
     111             : static size_t icu_from_uchar(char *dest, size_t destsize,
     112             :                              const UChar *buff_uchar, int32_t len_uchar);
     113             : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
     114             :                                          UErrorCode *status);
     115             : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
     116             :                                 UChar **buff_dest, UChar *buff_source,
     117             :                                 int32_t len_source);
     118             : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
     119             :                                        const UChar *src, int32_t srcLength,
     120             :                                        const char *locale,
     121             :                                        UErrorCode *pErrorCode);
     122             : static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
     123             :                                      const UChar *src, int32_t srcLength,
     124             :                                      const char *locale,
     125             :                                      UErrorCode *pErrorCode);
     126             : /* ICU collation callbacks; create_pg_locale_icu() selects this table when the database encoding is not UTF-8 */
     127             : static const struct collate_methods collate_methods_icu = {
     128             :     .strncoll = strncoll_icu,
     129             :     .strnxfrm = strnxfrm_icu,
     130             :     .strnxfrm_prefix = strnxfrm_prefix_icu,
     131             :     .strxfrm_is_safe = true,
     132             : };
     133             : /* ICU collation callbacks for a UTF-8 database; strncoll is the UTF-8 variant only when HAVE_UCOL_STRCOLLUTF8 is defined */
     134             : static const struct collate_methods collate_methods_icu_utf8 = {
     135             : #ifdef HAVE_UCOL_STRCOLLUTF8
     136             :     .strncoll = strncoll_icu_utf8,
     137             : #else
     138             :     .strncoll = strncoll_icu,
     139             : #endif
     140             :     .strnxfrm = strnxfrm_icu,
     141             :     .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
     142             :     .strxfrm_is_safe = true,
     143             : };
     144             : 
     145             : #endif
     146             : /* Build the pg_locale_t for ICU collation 'collid', allocating it in 'context'; without USE_ICU this only raises ERROR. */
     147             : pg_locale_t
     148         210 : create_pg_locale_icu(Oid collid, MemoryContext context)
     149             : {
     150             : #ifdef USE_ICU
     151             :     bool        deterministic;
     152             :     const char *iculocstr;
     153         210 :     const char *icurules = NULL;
     154             :     UCollator  *collator;
     155             :     pg_locale_t result;
     156             :     /* look up the ICU locale string and optional rules: pg_database for the default collation, pg_collation otherwise */
     157         210 :     if (collid == DEFAULT_COLLATION_OID)
     158             :     {
     159             :         HeapTuple   tp;
     160             :         Datum       datum;
     161             :         bool        isnull;
     162             : 
     163          26 :         tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
     164          26 :         if (!HeapTupleIsValid(tp))
     165           0 :             elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
     166             : 
     167             :         /* default database collation is always deterministic */
     168          26 :         deterministic = true;
     169          26 :         datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
     170             :                                        Anum_pg_database_datlocale);
     171          26 :         iculocstr = TextDatumGetCString(datum);
     172          26 :         datum = SysCacheGetAttr(DATABASEOID, tp,
     173             :                                 Anum_pg_database_daticurules, &isnull);
     174          26 :         if (!isnull)
     175           0 :             icurules = TextDatumGetCString(datum);
     176             :         /* iculocstr/icurules are still used after this release, so TextDatumGetCString() presumably yields copies -- TODO confirm */
     177          26 :         ReleaseSysCache(tp);
     178             :     }
     179             :     else
     180             :     {
     181             :         Form_pg_collation collform;
     182             :         HeapTuple   tp;
     183             :         Datum       datum;
     184             :         bool        isnull;
     185             : 
     186         184 :         tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
     187         184 :         if (!HeapTupleIsValid(tp))
     188           0 :             elog(ERROR, "cache lookup failed for collation %u", collid);
     189         184 :         collform = (Form_pg_collation) GETSTRUCT(tp);
     190         184 :         deterministic = collform->collisdeterministic;
     191         184 :         datum = SysCacheGetAttrNotNull(COLLOID, tp,
     192             :                                        Anum_pg_collation_colllocale);
     193         184 :         iculocstr = TextDatumGetCString(datum);
     194         184 :         datum = SysCacheGetAttr(COLLOID, tp,
     195             :                                 Anum_pg_collation_collicurules, &isnull);
     196         184 :         if (!isnull)
     197          12 :             icurules = TextDatumGetCString(datum);
     198             : 
     199         184 :         ReleaseSysCache(tp);
     200             :     }
     201             :     /* make_icu_collator() reports failures with ereport(ERROR); it has no NULL-return path */
     202         210 :     collator = make_icu_collator(iculocstr, icurules);
     203             :     /* NOTE(review): 'collator' is not closed if an allocation below raises ERROR -- confirm whether that leak matters */
     204         200 :     result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
     205         200 :     result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
     206         200 :     result->info.icu.ucol = collator;
     207         200 :     result->provider = COLLPROVIDER_ICU;
     208         200 :     result->deterministic = deterministic;
     209         200 :     result->collate_is_c = false;
     210         200 :     result->ctype_is_c = false;
     211         200 :     if (GetDatabaseEncoding() == PG_UTF8)
     212         200 :         result->collate = &collate_methods_icu_utf8;
     213             :     else
     214           0 :         result->collate = &collate_methods_icu;
     215             : 
     216         200 :     return result;
     217             : #else
     218             :     /* could get here if a collation was created by a build with ICU */
     219             :     ereport(ERROR,
     220             :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     221             :              errmsg("ICU is not supported in this build")));
     222             : 
     223             :     return NULL;
     224             : #endif
     225             : }
     226             : 
     227             : #ifdef USE_ICU
     228             : 
     229             : /*
     230             :  * Open an ICU collator for loc_str via ucol_open(), handling API
     231             :  * differences in older ICU versions.  Raises ERROR if loc_str is NULL or the
     232             :  * collator cannot be opened.  The returned UCollator is not closed here; the
     233             :  * caller is responsible for it.  Ensure that no path leaks a UCollator.
     234             :  */
     235             : UCollator *
     236       68150 : pg_ucol_open(const char *loc_str)
     237             : {
     238             :     UCollator  *collator;
     239             :     UErrorCode  status;
     240       68150 :     const char *orig_str = loc_str;
     241       68150 :     char       *fixed_str = NULL;
     242             : 
     243             :     /*
     244             :      * Must never open default collator, because it depends on the environment
     245             :      * and may change at any time. Should not happen, but check here to catch
     246             :      * bugs that might be hard to catch otherwise.
     247             :      *
     248             :      * NB: the default collator is not the same as the collator for the root
     249             :      * locale. The root locale may be specified as the empty string, "und", or
     250             :      * "root". The default collator is opened by passing NULL to ucol_open().
     251             :      */
     252       68150 :     if (loc_str == NULL)
     253           0 :         elog(ERROR, "opening default collator is not supported");
     254             : 
     255             :     /*
     256             :      * In ICU versions 54 and earlier, "und" is not a recognized spelling of
     257             :      * the root locale. If the first component of the locale is "und", replace
     258             :      * with "root" before opening.
     259             :      */
     260             :     if (U_ICU_VERSION_MAJOR_NUM < 55)
     261             :     {
     262             :         char        lang[ULOC_LANG_CAPACITY];
     263             : 
     264             :         status = U_ZERO_ERROR;
     265             :         uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
     266             :         if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
     267             :         {
     268             :             ereport(ERROR,
     269             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     270             :                      errmsg("could not get language from locale \"%s\": %s",
     271             :                             loc_str, u_errorName(status))));
     272             :         }
     273             : 
     274             :         if (strcmp(lang, "und") == 0)
     275             :         {
     276             :             const char *remainder = loc_str + strlen("und");
     277             :             /* build "root" followed by everything after the leading "und" */
     278             :             fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
     279             :             strcpy(fixed_str, "root");
     280             :             strcat(fixed_str, remainder);
     281             : 
     282             :             loc_str = fixed_str;
     283             :         }
     284             :     }
     285             : 
     286       68150 :     status = U_ZERO_ERROR;
     287       68150 :     collator = ucol_open(loc_str, &status);
     288       68150 :     if (U_FAILURE(status))
     289          12 :         ereport(ERROR,
     290             :         /* use original string for error report */
     291             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     292             :                  errmsg("could not open collator for locale \"%s\": %s",
     293             :                         orig_str, u_errorName(status))));
     294             :     /* for ICU major versions before 54, apply the locale's attributes through icu_set_collation_attributes() */
     295             :     if (U_ICU_VERSION_MAJOR_NUM < 54)
     296             :     {
     297             :         status = U_ZERO_ERROR;
     298             :         icu_set_collation_attributes(collator, loc_str, &status);
     299             : 
     300             :         /*
     301             :          * Pretend the error came from ucol_open(), for consistent error
     302             :          * message across ICU versions.
     303             :          */
     304             :         if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
     305             :         {
     306             :             ucol_close(collator);
     307             :             ereport(ERROR,
     308             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     309             :                      errmsg("could not open collator for locale \"%s\": %s",
     310             :                             orig_str, u_errorName(status))));
     311             :         }
     312             :     }
     313             :     /* fixed_str is only non-NULL when the "und" -> "root" rewrite above took place */
     314       68138 :     if (fixed_str != NULL)
     315           0 :         pfree(fixed_str);
     316             : 
     317       68138 :     return collator;
     318             : }
     319             : 
     320             : /*
     321             :  * Create a UCollator for iculocstr.  If icurules is non-NULL, the locale's
     322             :  * standard rules are concatenated with icurules and a collator is opened
     323             :  * from the combined rules.  Ensure that no path leaks a UCollator.
     324             :  */
     325             : static UCollator *
     326         210 : make_icu_collator(const char *iculocstr, const char *icurules)
     327             : {
     328         210 :     if (!icurules)
     329             :     {
     330             :         /* simple case without rules */
     331         198 :         return pg_ucol_open(iculocstr);
     332             :     }
     333             :     else
     334             :     {
     335             :         UCollator  *collator_std_rules;
     336             :         UCollator  *collator_all_rules;
     337             :         const UChar *std_rules;
     338             :         UChar      *my_rules;
     339             :         UChar      *all_rules;
     340             :         int32_t     length;
     341             :         int32_t     total;
     342             :         UErrorCode  status;
     343             : 
     344             :         /*
     345             :          * If rules are specified, we extract the rules of the standard
     346             :          * collation, add our own rules, and make a new collator with the
     347             :          * combined rules.
     348             :          */
     349          12 :         icu_to_uchar(&my_rules, icurules, strlen(icurules));
     350             :         /* NOTE(review): my_rules is never pfree'd in this function -- presumably left to memory-context cleanup; confirm */
     351          12 :         collator_std_rules = pg_ucol_open(iculocstr);
     352             :         /* 'length' receives the rule length but is not read afterwards; u_strlen() is used instead */
     353          12 :         std_rules = ucol_getRules(collator_std_rules, &length);
     354             :         /* combined length in UChars, plus one for the terminator */
     355          12 :         total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
     356             : 
     357             :         /* avoid leaking collator on OOM */
     358          12 :         all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
     359          12 :         if (!all_rules)
     360             :         {
     361           0 :             ucol_close(collator_std_rules);
     362           0 :             ereport(ERROR,
     363             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
     364             :                      errmsg("out of memory")));
     365             :         }
     366             : 
     367          12 :         u_strcpy(all_rules, std_rules);
     368          12 :         u_strcat(all_rules, my_rules);
     369             :         /* std_rules has been copied into all_rules, so the standard collator can be closed */
     370          12 :         ucol_close(collator_std_rules);
     371             :         /* NOTE(review): all_rules is not freed after ucol_openRules() -- confirm whether ICU keeps its own copy */
     372          12 :         status = U_ZERO_ERROR;
     373          12 :         collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
     374             :                                             UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
     375             :                                             NULL, &status);
     376          12 :         if (U_FAILURE(status))
     377             :         {
     378           6 :             ereport(ERROR,
     379             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     380             :                      errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
     381             :                             iculocstr, icurules, u_errorName(status))));
     382             :         }
     383             : 
     384           6 :         return collator_all_rules;
     385             :     }
     386             : }
     387             : /* Lower-case src with u_strToLower (through icu_convert_case) and store it in dest; returns the length from icu_from_uchar() */
     388             : size_t
     389         528 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     390             :              pg_locale_t locale)
     391             : {
     392             :     int32_t     len_uchar;
     393             :     int32_t     len_conv;
     394             :     UChar      *buff_uchar;
     395             :     UChar      *buff_conv;
     396             :     size_t      result_len;
     397             :     /* convert src to UChars, case-map into a second buffer, then let icu_from_uchar() fill dest */
     398         528 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     399         528 :     len_conv = icu_convert_case(u_strToLower, locale,
     400             :                                 &buff_conv, buff_uchar, len_uchar);
     401         528 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     402         528 :     pfree(buff_uchar);
     403         528 :     pfree(buff_conv);
     404             : 
     405         528 :     return result_len;
     406             : }
     407             : /* Title-case src with u_strToTitle_default_BI (through icu_convert_case) and store it in dest; returns the length from icu_from_uchar() */
     408             : size_t
     409          30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     410             :              pg_locale_t locale)
     411             : {
     412             :     int32_t     len_uchar;
     413             :     int32_t     len_conv;
     414             :     UChar      *buff_uchar;
     415             :     UChar      *buff_conv;
     416             :     size_t      result_len;
     417             :     /* convert src to UChars, case-map into a second buffer, then let icu_from_uchar() fill dest */
     418          30 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     419          30 :     len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
     420             :                                 &buff_conv, buff_uchar, len_uchar);
     421          30 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     422          30 :     pfree(buff_uchar);
     423          30 :     pfree(buff_conv);
     424             : 
     425          30 :     return result_len;
     426             : }
     427             : /* Upper-case src with u_strToUpper (through icu_convert_case) and store it in dest; returns the length from icu_from_uchar() */
     428             : size_t
     429          54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     430             :              pg_locale_t locale)
     431             : {
     432             :     int32_t     len_uchar;
     433             :     int32_t     len_conv;
     434             :     UChar      *buff_uchar;
     435             :     UChar      *buff_conv;
     436             :     size_t      result_len;
     437             :     /* convert src to UChars, case-map into a second buffer, then let icu_from_uchar() fill dest */
     438          54 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     439          54 :     len_conv = icu_convert_case(u_strToUpper, locale,
     440             :                                 &buff_conv, buff_uchar, len_uchar);
     441          54 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     442          54 :     pfree(buff_uchar);
     443          54 :     pfree(buff_conv);
     444             : 
     445          54 :     return result_len;
     446             : }
     447             : /* Case-fold src with u_strFoldCase_default (through icu_convert_case) and store it in dest; returns the length from icu_from_uchar() */
     448             : size_t
     449          12 : strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     450             :             pg_locale_t locale)
     451             : {
     452             :     int32_t     len_uchar;
     453             :     int32_t     len_conv;
     454             :     UChar      *buff_uchar;
     455             :     UChar      *buff_conv;
     456             :     size_t      result_len;
     457             :     /* convert src to UChars, case-fold into a second buffer, then let icu_from_uchar() fill dest */
     458          12 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     459          12 :     len_conv = icu_convert_case(u_strFoldCase_default, locale,
     460             :                                 &buff_conv, buff_uchar, len_uchar);
     461          12 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     462          12 :     pfree(buff_uchar);
     463          12 :     pfree(buff_conv);
     464             : 
     465          12 :     return result_len;
     466             : }
     467             : 
     468             : /*
     469             :  * strncoll_icu_utf8
     470             :  *
     471             :  * Compare arg1 and arg2 with ucol_strcollUTF8(). The database encoding
     472             :  * must be UTF-8 (asserted). An argument length of -1 means the string is
     473             :  * NUL-terminated.
     474             :  */
     475             : #ifdef HAVE_UCOL_STRCOLLUTF8
     476             : int
     477       25414 : strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
     478             :                   pg_locale_t locale)
     479             : {
     480             :     int         result;
     481             :     UErrorCode  status;
     482             : 
     483             :     Assert(locale->provider == COLLPROVIDER_ICU);
     484             : 
     485             :     Assert(GetDatabaseEncoding() == PG_UTF8);
     486             :     /* the byte strings and their lengths are handed to ICU unchanged */
     487       25414 :     status = U_ZERO_ERROR;
     488       25414 :     result = ucol_strcollUTF8(locale->info.icu.ucol,
     489             :                               arg1, len1,
     490             :                               arg2, len2,
     491             :                               &status);
     492       25414 :     if (U_FAILURE(status))
     493           0 :         ereport(ERROR,
     494             :                 (errmsg("collation failed: %s", u_errorName(status))));
     495             :     /* NOTE(review): the ereport above passes no errcode(), unlike the other error reports in this file */
     496       25414 :     return result;
     497             : }
     498             : #endif
     499             : 
     500             : /* Write the ICU sort key of src into dest and return its length excluding the terminator; 'srclen' of -1 means src is NUL-terminated */
     501             : size_t
     502       10020 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     503             :              pg_locale_t locale)
     504             : {
     505             :     char        sbuf[TEXTBUFLEN];
     506       10020 :     char       *buf = sbuf;
     507             :     UChar      *uchar;
     508             :     int32_t     ulen;
     509             :     size_t      uchar_bsize;
     510             :     Size        result_bsize;
     511             : 
     512             :     Assert(locale->provider == COLLPROVIDER_ICU);
     513             : 
     514       10020 :     init_icu_converter();
     515             :     /* measure the converted length first so the buffer can be sized before converting */
     516       10020 :     ulen = uchar_length(icu_converter, src, srclen);
     517             :     /* bytes needed for ulen UChars plus one terminating UChar */
     518       10020 :     uchar_bsize = (ulen + 1) * sizeof(UChar);
     519             :     /* stay on the stack buffer unless more than TEXTBUFLEN bytes are required */
     520       10020 :     if (uchar_bsize > TEXTBUFLEN)
     521           0 :         buf = palloc(uchar_bsize);
     522             : 
     523       10020 :     uchar = (UChar *) buf;
     524             : 
     525       10020 :     ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
     526             : 
     527       10020 :     result_bsize = ucol_getSortKey(locale->info.icu.ucol,
     528             :                                    uchar, ulen,
     529             :                                    (uint8_t *) dest, destsize);
     530             : 
     531             :     /*
     532             :      * ucol_getSortKey() counts the nul-terminator in the result length, but
     533             :      * this function should not.
     534             :      */
     535             :     Assert(result_bsize > 0);
     536       10020 :     result_bsize--;
     537             :     /* release the conversion buffer only if it was palloc'd above */
     538       10020 :     if (buf != sbuf)
     539           0 :         pfree(buf);
     540             : 
     541             :     /* if dest is defined, it should be nul-terminated */
     542             :     Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
     543             : 
     544       10020 :     return result_bsize;
     545             : }
     546             : 
     547             : /* Write a sort-key prefix for UTF-8 src into dest via ucol_nextSortKeyPart() and return its result; 'srclen' of -1 means src is NUL-terminated */
     548             : size_t
     549        1656 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
     550             :                          const char *src, ssize_t srclen,
     551             :                          pg_locale_t locale)
     552             : {
     553             :     size_t      result;
     554             :     UCharIterator iter;
     555             :     uint32_t    state[2];
     556             :     UErrorCode  status;
     557             : 
     558             :     Assert(locale->provider == COLLPROVIDER_ICU);
     559             : 
     560             :     Assert(GetDatabaseEncoding() == PG_UTF8);
     561             :     /* iterate over src directly as UTF-8; the state is zeroed and used for this one call only */
     562        1656 :     uiter_setUTF8(&iter, src, srclen);
     563        1656 :     state[0] = state[1] = 0;    /* won't need that again */
     564        1656 :     status = U_ZERO_ERROR;
     565        1656 :     result = ucol_nextSortKeyPart(locale->info.icu.ucol,
     566             :                                   &iter,
     567             :                                   state,
     568             :                                   (uint8_t *) dest,
     569             :                                   destsize,
     570             :                                   &status);
     571        1656 :     if (U_FAILURE(status))
     572           0 :         ereport(ERROR,
     573             :                 (errmsg("sort key generation failed: %s",
     574             :                         u_errorName(status))));
     575             :     /* NOTE(review): the ereport above passes no errcode() */
     576        1656 :     return result;
     577             : }
     578             : 
     579             : char *
     580       67792 : get_collation_actual_version_icu(const char *collcollate)
     581             : {
     582             :     UCollator  *collator;
     583             :     UVersionInfo versioninfo;
     584             :     char        buf[U_MAX_VERSION_STRING_LENGTH];
     585             : 
     586       67792 :     collator = pg_ucol_open(collcollate);
     587             : 
     588       67792 :     ucol_getVersion(collator, versioninfo);
     589       67792 :     ucol_close(collator);
     590             : 
     591       67792 :     u_versionToString(versioninfo, buf);
     592       67792 :     return pstrdup(buf);
     593             : }
     594             : 
     595             : /*
     596             :  * Convert a string in the database encoding into a string of UChars.
     597             :  *
     598             :  * The source string at buff is of length nbytes
     599             :  * (it needn't be nul-terminated)
     600             :  *
     601             :  * *buff_uchar receives a pointer to the palloc'd result string, and
     602             :  * the function's result is the number of UChars generated.
     603             :  *
     604             :  * The result string is nul-terminated, though most callers rely on the
     605             :  * result length instead.
     606             :  */
     607             : static int32_t
     608         636 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
     609             : {
     610             :     int32_t     len_uchar;
     611             : 
     612         636 :     init_icu_converter();
     613             : 
     614         636 :     len_uchar = uchar_length(icu_converter, buff, nbytes);
     615             : 
     616         636 :     *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
     617         636 :     len_uchar = uchar_convert(icu_converter,
     618             :                               *buff_uchar, len_uchar + 1, buff, nbytes);
     619             : 
     620         636 :     return len_uchar;
     621             : }
     622             : 
     623             : /*
     624             :  * Convert a string of UChars into the database encoding.
     625             :  *
     626             :  * The source string at buff_uchar is of length len_uchar
     627             :  * (it needn't be nul-terminated)
     628             :  *
     629             :  * *result receives a pointer to the palloc'd result string, and the
     630             :  * function's result is the number of bytes generated (not counting nul).
     631             :  *
     632             :  * The result string is nul-terminated.
     633             :  */
     634             : static size_t
     635         624 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
     636             : {
     637             :     UErrorCode  status;
     638             :     int32_t     len_result;
     639             : 
     640         624 :     init_icu_converter();
     641             : 
     642         624 :     status = U_ZERO_ERROR;
     643         624 :     len_result = ucnv_fromUChars(icu_converter, NULL, 0,
     644             :                                  buff_uchar, len_uchar, &status);
     645         624 :     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
     646           0 :         ereport(ERROR,
     647             :                 (errmsg("%s failed: %s", "ucnv_fromUChars",
     648             :                         u_errorName(status))));
     649             : 
     650         624 :     if (len_result + 1 > destsize)
     651          60 :         return len_result;
     652             : 
     653         564 :     status = U_ZERO_ERROR;
     654         564 :     len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
     655             :                                  buff_uchar, len_uchar, &status);
     656         564 :     if (U_FAILURE(status) ||
     657         564 :         status == U_STRING_NOT_TERMINATED_WARNING)
     658           0 :         ereport(ERROR,
     659             :                 (errmsg("%s failed: %s", "ucnv_fromUChars",
     660             :                         u_errorName(status))));
     661             : 
     662         564 :     return len_result;
     663             : }
     664             : 
     665             : static int32_t
     666         624 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
     667             :                  UChar **buff_dest, UChar *buff_source, int32_t len_source)
     668             : {
     669             :     UErrorCode  status;
     670             :     int32_t     len_dest;
     671             : 
     672         624 :     len_dest = len_source;      /* try first with same length */
     673         624 :     *buff_dest = palloc(len_dest * sizeof(**buff_dest));
     674         624 :     status = U_ZERO_ERROR;
     675         624 :     len_dest = func(*buff_dest, len_dest, buff_source, len_source,
     676             :                     mylocale->info.icu.locale, &status);
     677         624 :     if (status == U_BUFFER_OVERFLOW_ERROR)
     678             :     {
     679             :         /* try again with adjusted length */
     680          18 :         pfree(*buff_dest);
     681          18 :         *buff_dest = palloc(len_dest * sizeof(**buff_dest));
     682          18 :         status = U_ZERO_ERROR;
     683          18 :         len_dest = func(*buff_dest, len_dest, buff_source, len_source,
     684             :                         mylocale->info.icu.locale, &status);
     685             :     }
     686         624 :     if (U_FAILURE(status))
     687           0 :         ereport(ERROR,
     688             :                 (errmsg("case conversion failed: %s", u_errorName(status))));
     689         624 :     return len_dest;
     690             : }
     691             : 
     692             : static int32_t
     693          30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
     694             :                         const UChar *src, int32_t srcLength,
     695             :                         const char *locale,
     696             :                         UErrorCode *pErrorCode)
     697             : {
     698          30 :     return u_strToTitle(dest, destCapacity, src, srcLength,
     699             :                         NULL, locale, pErrorCode);
     700             : }
     701             : 
     702             : static int32_t
     703          24 : u_strFoldCase_default(UChar *dest, int32_t destCapacity,
     704             :                       const UChar *src, int32_t srcLength,
     705             :                       const char *locale,
     706             :                       UErrorCode *pErrorCode)
     707             : {
     708          24 :     uint32      options = U_FOLD_CASE_DEFAULT;
     709             :     char        lang[3];
     710             :     UErrorCode  status;
     711             : 
     712             :     /*
     713             :      * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
     714             :      * folding does not accept a locale. Instead it just supports a single
     715             :      * option relevant to Turkic languages 'az' and 'tr'; check for those
     716             :      * languages to enable the option.
     717             :      */
     718          24 :     status = U_ZERO_ERROR;
     719          24 :     uloc_getLanguage(locale, lang, 3, &status);
     720          24 :     if (U_SUCCESS(status))
     721             :     {
     722             :         /*
     723             :          * The option name is confusing, but it causes u_strFoldCase to use
     724             :          * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
     725             :          */
     726          24 :         if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0)
     727          12 :             options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
     728             :     }
     729             : 
     730          24 :     return u_strFoldCase(dest, destCapacity, src, srcLength,
     731             :                          options, pErrorCode);
     732             : }
     733             : 
     734             : /*
     735             :  * strncoll_icu
     736             :  *
     737             :  * Convert the arguments from the database encoding to UChar strings, then
     738             :  * call ucol_strcoll(). An argument length of -1 means that the string is
     739             :  * NUL-terminated.
     740             :  *
     741             :  * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
     742             :  * caller should call that instead.
     743             :  */
     744             : static int
     745           0 : strncoll_icu(const char *arg1, ssize_t len1,
     746             :              const char *arg2, ssize_t len2, pg_locale_t locale)
     747             : {
     748             :     char        sbuf[TEXTBUFLEN];
     749           0 :     char       *buf = sbuf;
     750             :     int32_t     ulen1;
     751             :     int32_t     ulen2;
     752             :     size_t      bufsize1;
     753             :     size_t      bufsize2;
     754             :     UChar      *uchar1,
     755             :                *uchar2;
     756             :     int         result;
     757             : 
     758             :     Assert(locale->provider == COLLPROVIDER_ICU);
     759             : 
     760             :     /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
     761             : #ifdef HAVE_UCOL_STRCOLLUTF8
     762             :     Assert(GetDatabaseEncoding() != PG_UTF8);
     763             : #endif
     764             : 
     765           0 :     init_icu_converter();
     766             : 
     767           0 :     ulen1 = uchar_length(icu_converter, arg1, len1);
     768           0 :     ulen2 = uchar_length(icu_converter, arg2, len2);
     769             : 
     770           0 :     bufsize1 = (ulen1 + 1) * sizeof(UChar);
     771           0 :     bufsize2 = (ulen2 + 1) * sizeof(UChar);
     772             : 
     773           0 :     if (bufsize1 + bufsize2 > TEXTBUFLEN)
     774           0 :         buf = palloc(bufsize1 + bufsize2);
     775             : 
     776           0 :     uchar1 = (UChar *) buf;
     777           0 :     uchar2 = (UChar *) (buf + bufsize1);
     778             : 
     779           0 :     ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
     780           0 :     ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
     781             : 
     782           0 :     result = ucol_strcoll(locale->info.icu.ucol,
     783             :                           uchar1, ulen1,
     784             :                           uchar2, ulen2);
     785             : 
     786           0 :     if (buf != sbuf)
     787           0 :         pfree(buf);
     788             : 
     789           0 :     return result;
     790             : }
     791             : 
     792             : /* 'srclen' of -1 means the strings are NUL-terminated */
     793             : static size_t
     794           0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
     795             :                     const char *src, ssize_t srclen,
     796             :                     pg_locale_t locale)
     797             : {
     798             :     char        sbuf[TEXTBUFLEN];
     799           0 :     char       *buf = sbuf;
     800             :     UCharIterator iter;
     801             :     uint32_t    state[2];
     802             :     UErrorCode  status;
     803           0 :     int32_t     ulen = -1;
     804           0 :     UChar      *uchar = NULL;
     805             :     size_t      uchar_bsize;
     806             :     Size        result_bsize;
     807             : 
     808             :     Assert(locale->provider == COLLPROVIDER_ICU);
     809             : 
     810             :     /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
     811             :     Assert(GetDatabaseEncoding() != PG_UTF8);
     812             : 
     813           0 :     init_icu_converter();
     814             : 
     815           0 :     ulen = uchar_length(icu_converter, src, srclen);
     816             : 
     817           0 :     uchar_bsize = (ulen + 1) * sizeof(UChar);
     818             : 
     819           0 :     if (uchar_bsize > TEXTBUFLEN)
     820           0 :         buf = palloc(uchar_bsize);
     821             : 
     822           0 :     uchar = (UChar *) buf;
     823             : 
     824           0 :     ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
     825             : 
     826           0 :     uiter_setString(&iter, uchar, ulen);
     827           0 :     state[0] = state[1] = 0;    /* won't need that again */
     828           0 :     status = U_ZERO_ERROR;
     829           0 :     result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
     830             :                                         &iter,
     831             :                                         state,
     832             :                                         (uint8_t *) dest,
     833             :                                         destsize,
     834             :                                         &status);
     835           0 :     if (U_FAILURE(status))
     836           0 :         ereport(ERROR,
     837             :                 (errmsg("sort key generation failed: %s",
     838             :                         u_errorName(status))));
     839             : 
     840           0 :     return result_bsize;
     841             : }
     842             : 
     843             : static void
     844       11280 : init_icu_converter(void)
     845             : {
     846             :     const char *icu_encoding_name;
     847             :     UErrorCode  status;
     848             :     UConverter *conv;
     849             : 
     850       11280 :     if (icu_converter)
     851       11274 :         return;                 /* already done */
     852             : 
     853           6 :     icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
     854           6 :     if (!icu_encoding_name)
     855           0 :         ereport(ERROR,
     856             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     857             :                  errmsg("encoding \"%s\" not supported by ICU",
     858             :                         pg_encoding_to_char(GetDatabaseEncoding()))));
     859             : 
     860           6 :     status = U_ZERO_ERROR;
     861           6 :     conv = ucnv_open(icu_encoding_name, &status);
     862           6 :     if (U_FAILURE(status))
     863           0 :         ereport(ERROR,
     864             :                 (errmsg("could not open ICU converter for encoding \"%s\": %s",
     865             :                         icu_encoding_name, u_errorName(status))));
     866             : 
     867           6 :     icu_converter = conv;
     868             : }
     869             : 
     870             : /*
     871             :  * Find length, in UChars, of given string if converted to UChar string.
     872             :  *
     873             :  * A length of -1 indicates that the input string is NUL-terminated.
     874             :  */
     875             : static size_t
     876       10656 : uchar_length(UConverter *converter, const char *str, int32_t len)
     877             : {
     878       10656 :     UErrorCode  status = U_ZERO_ERROR;
     879             :     int32_t     ulen;
     880             : 
     881       10656 :     ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
     882       10656 :     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
     883           0 :         ereport(ERROR,
     884             :                 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
     885       10656 :     return ulen;
     886             : }
     887             : 
     888             : /*
     889             :  * Convert the given source string into a UChar string, stored in dest, and
     890             :  * return the length (in UChars).
     891             :  *
     892             :  * A srclen of -1 indicates that the input string is NUL-terminated.
     893             :  */
     894             : static int32_t
     895       10656 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
     896             :               const char *src, int32_t srclen)
     897             : {
     898       10656 :     UErrorCode  status = U_ZERO_ERROR;
     899             :     int32_t     ulen;
     900             : 
     901       10656 :     status = U_ZERO_ERROR;
     902       10656 :     ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
     903       10656 :     if (U_FAILURE(status))
     904           0 :         ereport(ERROR,
     905             :                 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
     906       10656 :     return ulen;
     907             : }
     908             : 
     909             : /*
     910             :  * Parse collation attributes from the given locale string and apply them to
     911             :  * the open collator.
     912             :  *
     913             :  * First, the locale string is canonicalized to an ICU format locale ID such
     914             :  * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
     915             :  * the key-value arguments.
     916             :  *
     917             :  * Starting with ICU version 54, the attributes are processed automatically by
     918             :  * ucol_open(), so this is only necessary for emulating this behavior on older
     919             :  * versions.
     920             :  */
     921             : pg_attribute_unused()
     922             : static void
     923           0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
     924             :                              UErrorCode *status)
     925             : {
     926             :     int32_t     len;
     927             :     char       *icu_locale_id;
     928             :     char       *lower_str;
     929             :     char       *str;
     930             :     char       *token;
     931             : 
     932             :     /*
     933             :      * The input locale may be a BCP 47 language tag, e.g.
     934             :      * "und-u-kc-ks-level1", which expresses the same attributes in a
     935             :      * different form. It will be converted to the equivalent ICU format
     936             :      * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
     937             :      * uloc_canonicalize().
     938             :      */
     939           0 :     *status = U_ZERO_ERROR;
     940           0 :     len = uloc_canonicalize(loc, NULL, 0, status);
     941           0 :     icu_locale_id = palloc(len + 1);
     942           0 :     *status = U_ZERO_ERROR;
     943           0 :     len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
     944           0 :     if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
     945           0 :         return;
     946             : 
     947           0 :     lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
     948             : 
     949           0 :     pfree(icu_locale_id);
     950             : 
     951           0 :     str = strchr(lower_str, '@');
     952           0 :     if (!str)
     953           0 :         return;
     954           0 :     str++;
     955             : 
     956           0 :     while ((token = strsep(&str, ";")))
     957             :     {
     958           0 :         char       *e = strchr(token, '=');
     959             : 
     960           0 :         if (e)
     961             :         {
     962             :             char       *name;
     963             :             char       *value;
     964             :             UColAttribute uattr;
     965             :             UColAttributeValue uvalue;
     966             : 
     967           0 :             *status = U_ZERO_ERROR;
     968             : 
     969           0 :             *e = '\0';
     970           0 :             name = token;
     971           0 :             value = e + 1;
     972             : 
     973             :             /*
     974             :              * See attribute name and value lists in ICU i18n/coll.cpp
     975             :              */
     976           0 :             if (strcmp(name, "colstrength") == 0)
     977           0 :                 uattr = UCOL_STRENGTH;
     978           0 :             else if (strcmp(name, "colbackwards") == 0)
     979           0 :                 uattr = UCOL_FRENCH_COLLATION;
     980           0 :             else if (strcmp(name, "colcaselevel") == 0)
     981           0 :                 uattr = UCOL_CASE_LEVEL;
     982           0 :             else if (strcmp(name, "colcasefirst") == 0)
     983           0 :                 uattr = UCOL_CASE_FIRST;
     984           0 :             else if (strcmp(name, "colalternate") == 0)
     985           0 :                 uattr = UCOL_ALTERNATE_HANDLING;
     986           0 :             else if (strcmp(name, "colnormalization") == 0)
     987           0 :                 uattr = UCOL_NORMALIZATION_MODE;
     988           0 :             else if (strcmp(name, "colnumeric") == 0)
     989           0 :                 uattr = UCOL_NUMERIC_COLLATION;
     990             :             else
     991             :                 /* ignore if unknown */
     992           0 :                 continue;
     993             : 
     994           0 :             if (strcmp(value, "primary") == 0)
     995           0 :                 uvalue = UCOL_PRIMARY;
     996           0 :             else if (strcmp(value, "secondary") == 0)
     997           0 :                 uvalue = UCOL_SECONDARY;
     998           0 :             else if (strcmp(value, "tertiary") == 0)
     999           0 :                 uvalue = UCOL_TERTIARY;
    1000           0 :             else if (strcmp(value, "quaternary") == 0)
    1001           0 :                 uvalue = UCOL_QUATERNARY;
    1002           0 :             else if (strcmp(value, "identical") == 0)
    1003           0 :                 uvalue = UCOL_IDENTICAL;
    1004           0 :             else if (strcmp(value, "no") == 0)
    1005           0 :                 uvalue = UCOL_OFF;
    1006           0 :             else if (strcmp(value, "yes") == 0)
    1007           0 :                 uvalue = UCOL_ON;
    1008           0 :             else if (strcmp(value, "shifted") == 0)
    1009           0 :                 uvalue = UCOL_SHIFTED;
    1010           0 :             else if (strcmp(value, "non-ignorable") == 0)
    1011           0 :                 uvalue = UCOL_NON_IGNORABLE;
    1012           0 :             else if (strcmp(value, "lower") == 0)
    1013           0 :                 uvalue = UCOL_LOWER_FIRST;
    1014           0 :             else if (strcmp(value, "upper") == 0)
    1015           0 :                 uvalue = UCOL_UPPER_FIRST;
    1016             :             else
    1017             :             {
    1018           0 :                 *status = U_ILLEGAL_ARGUMENT_ERROR;
    1019           0 :                 break;
    1020             :             }
    1021             : 
    1022           0 :             ucol_setAttribute(collator, uattr, uvalue, status);
    1023             :         }
    1024             :     }
    1025             : 
    1026           0 :     pfree(lower_str);
    1027             : }
    1028             : 
    1029             : #endif                          /* USE_ICU */

Generated by: LCOV version 1.14