LCOV - code coverage report
Current view: top level - src/backend/utils/adt - pg_locale_icu.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 158 278 56.8 %
Date: 2025-01-18 04:15:08 Functions: 17 20 85.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-----------------------------------------------------------------------
       2             :  *
       3             :  * PostgreSQL locale utilities for ICU
       4             :  *
       5             :  * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
       6             :  *
       7             :  * src/backend/utils/adt/pg_locale_icu.c
       8             :  *
       9             :  *-----------------------------------------------------------------------
      10             :  */
      11             : 
      12             : #include "postgres.h"
      13             : 
      14             : #ifdef USE_ICU
      15             : #include <unicode/ucnv.h>
      16             : #include <unicode/ustring.h>
      17             : 
      18             : /*
      19             :  * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
      20             :  * (see
      21             :  * <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
      22             :  */
      23             : #if U_ICU_VERSION_MAJOR_NUM >= 53
      24             : #define HAVE_UCOL_STRCOLLUTF8 1
      25             : #else
      26             : #undef HAVE_UCOL_STRCOLLUTF8
      27             : #endif
      28             : 
      29             : #endif
      30             : 
      31             : #include "access/htup_details.h"
      32             : #include "catalog/pg_database.h"
      33             : #include "catalog/pg_collation.h"
      34             : #include "mb/pg_wchar.h"
      35             : #include "miscadmin.h"
      36             : #include "utils/builtins.h"
      37             : #include "utils/formatting.h"
      38             : #include "utils/memutils.h"
      39             : #include "utils/pg_locale.h"
      40             : #include "utils/syscache.h"
      41             : 
      42             : /*
      43             :  * Size of stack buffer to use for string transformations, used to avoid heap
      44             :  * allocations in typical cases. This should be large enough that most strings
      45             :  * will fit, but small enough that we feel comfortable putting it on the
      46             :  * stack.
                     :  *
                     :  * The value is a byte count: strnxfrm_icu() declares a char array of this
                     :  * size and switches to palloc() when the converted string needs more.
      47             :  */
      48             : #define     TEXTBUFLEN          1024
      49             : 
      50             : extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
      51             : extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
      52             :                            ssize_t srclen, pg_locale_t locale);
      53             : extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
      54             :                            ssize_t srclen, pg_locale_t locale);
      55             : extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
      56             :                            ssize_t srclen, pg_locale_t locale);
      57             : 
      58             : #ifdef USE_ICU
      59             : 
      60             : extern UCollator *pg_ucol_open(const char *loc_str);
      61             : 
      62             : static int  strncoll_icu(const char *arg1, ssize_t len1,
      63             :                          const char *arg2, ssize_t len2,
      64             :                          pg_locale_t locale);
      65             : static size_t strnxfrm_icu(char *dest, size_t destsize,
      66             :                            const char *src, ssize_t srclen,
      67             :                            pg_locale_t locale);
      68             : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
      69             :                                   const char *src, ssize_t srclen,
      70             :                                   pg_locale_t locale);
      71             : extern char *get_collation_actual_version_icu(const char *collcollate);
      72             : /*
                     :  * Signature of the case-mapping callbacks handed to icu_convert_case().
                     :  * In this file u_strToLower, u_strToUpper and the local
                     :  * u_strToTitle_default_BI wrapper are passed as this type.
                     :  */
      73             : typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
      74             :                                      const UChar *src, int32_t srcLength,
      75             :                                      const char *locale,
      76             :                                      UErrorCode *pErrorCode);
      77             : 
      78             : /*
      79             :  * Converter object for converting between ICU's UChar strings and C strings
      80             :  * in database encoding.  Since the database encoding doesn't change, we only
      81             :  * need one of these per session.
      82             :  */
      83             : static UConverter *icu_converter = NULL;
      84             : 
      85             : static UCollator *make_icu_collator(const char *iculocstr,
      86             :                                     const char *icurules);
      87             : static int  strncoll_icu(const char *arg1, ssize_t len1,
      88             :                          const char *arg2, ssize_t len2,
      89             :                          pg_locale_t locale);
      90             : static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
      91             :                                   const char *src, ssize_t srclen,
      92             :                                   pg_locale_t locale);
      93             : #ifdef HAVE_UCOL_STRCOLLUTF8
      94             : static int  strncoll_icu_utf8(const char *arg1, ssize_t len1,
      95             :                               const char *arg2, ssize_t len2,
      96             :                               pg_locale_t locale);
      97             : #endif
      98             : static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
      99             :                                        const char *src, ssize_t srclen,
     100             :                                        pg_locale_t locale);
     101             : static void init_icu_converter(void);
     102             : static size_t uchar_length(UConverter *converter,
     103             :                            const char *str, int32_t len);
     104             : static int32_t uchar_convert(UConverter *converter,
     105             :                              UChar *dest, int32_t destlen,
     106             :                              const char *src, int32_t srclen);
     107             : static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff,
     108             :                             size_t nbytes);
     109             : static size_t icu_from_uchar(char *dest, size_t destsize,
     110             :                              const UChar *buff_uchar, int32_t len_uchar);
     111             : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
     112             :                                          UErrorCode *status);
     113             : static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
     114             :                                 UChar **buff_dest, UChar *buff_source,
     115             :                                 int32_t len_source);
     116             : static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
     117             :                                        const UChar *src, int32_t srcLength,
     118             :                                        const char *locale,
     119             :                                        UErrorCode *pErrorCode);
     120             : /*
                     :  * Collation callbacks that create_pg_locale_icu() installs when the
                     :  * database encoding is not UTF8.
                     :  */
     121             : static const struct collate_methods collate_methods_icu = {
     122             :     .strncoll = strncoll_icu,
     123             :     .strnxfrm = strnxfrm_icu,
     124             :     .strnxfrm_prefix = strnxfrm_prefix_icu,
     125             :     .strxfrm_is_safe = true,
     126             : };
     127             : /*
                     :  * Collation callbacks that create_pg_locale_icu() installs when the
                     :  * database encoding is UTF8.  The UTF8-specific comparison routine is only
                     :  * used when HAVE_UCOL_STRCOLLUTF8 is defined; otherwise the table falls
                     :  * back to strncoll_icu.
                     :  */
     128             : static const struct collate_methods collate_methods_icu_utf8 = {
     129             : #ifdef HAVE_UCOL_STRCOLLUTF8
     130             :     .strncoll = strncoll_icu_utf8,
     131             : #else
     132             :     .strncoll = strncoll_icu,
     133             : #endif
     134             :     .strnxfrm = strnxfrm_icu,
     135             :     .strnxfrm_prefix = strnxfrm_prefix_icu_utf8,
     136             :     .strxfrm_is_safe = true,
     137             : };
     138             : 
     139             : #endif
     140             : /*
                     :  * Build the pg_locale_t for an ICU-provider collation.
                     :  *
                     :  * For DEFAULT_COLLATION_OID the locale string and optional ICU rules come
                     :  * from the pg_database row of MyDatabaseId; for any other collid they come
                     :  * from its pg_collation row.  The returned struct and its copy of the
                     :  * locale string are allocated in 'context'.
                     :  *
                     :  * A failed syscache lookup or a collator that cannot be opened is reported
                     :  * with ERROR.  Without USE_ICU this function only raises
                     :  * ERRCODE_FEATURE_NOT_SUPPORTED.
                     :  */
     141             : pg_locale_t
     142         210 : create_pg_locale_icu(Oid collid, MemoryContext context)
     143             : {
     144             : #ifdef USE_ICU
     145             :     bool        deterministic;
     146             :     const char *iculocstr;
     147         210 :     const char *icurules = NULL;    /* stays NULL when no rules are stored */
     148             :     UCollator  *collator;
     149             :     pg_locale_t result;
     150             : 
     151         210 :     if (collid == DEFAULT_COLLATION_OID)
     152             :     {
     153             :         HeapTuple   tp;
     154             :         Datum       datum;
     155             :         bool        isnull;
     156             : 
     157          26 :         tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
     158          26 :         if (!HeapTupleIsValid(tp))
     159           0 :             elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
     160             : 
     161             :         /* default database collation is always deterministic */
     162          26 :         deterministic = true;
     163          26 :         datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
     164             :                                        Anum_pg_database_datlocale);
     165          26 :         iculocstr = TextDatumGetCString(datum);
     166          26 :         datum = SysCacheGetAttr(DATABASEOID, tp,
     167             :                                 Anum_pg_database_daticurules, &isnull);
     168          26 :         if (!isnull)
     169           0 :             icurules = TextDatumGetCString(datum);
     170             : 
     171          26 :         ReleaseSysCache(tp);
     172             :     }
     173             :     else
     174             :     {
     175             :         Form_pg_collation collform;
     176             :         HeapTuple   tp;
     177             :         Datum       datum;
     178             :         bool        isnull;
     179             : 
     180         184 :         tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
     181         184 :         if (!HeapTupleIsValid(tp))
     182           0 :             elog(ERROR, "cache lookup failed for collation %u", collid);
     183         184 :         collform = (Form_pg_collation) GETSTRUCT(tp);
     184         184 :         deterministic = collform->collisdeterministic;
     185         184 :         datum = SysCacheGetAttrNotNull(COLLOID, tp,
     186             :                                        Anum_pg_collation_colllocale);
     187         184 :         iculocstr = TextDatumGetCString(datum);
     188         184 :         datum = SysCacheGetAttr(COLLOID, tp,
     189             :                                 Anum_pg_collation_collicurules, &isnull);
     190         184 :         if (!isnull)
     191          12 :             icurules = TextDatumGetCString(datum);
     192             : 
     193         184 :         ReleaseSysCache(tp);
     194             :     }
     195             :     /* make_icu_collator() reports failures with ereport(ERROR) */
     196         210 :     collator = make_icu_collator(iculocstr, icurules);
     197             :     /*
                     :      * NOTE(review): the collator is already open while the allocations
                     :      * below run; presumably an ERROR raised by them would leave it
                     :      * unclosed -- confirm whether that matters.
                     :      */
     198         200 :     result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
     199         200 :     result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
     200         200 :     result->info.icu.ucol = collator;
     201         200 :     result->provider = COLLPROVIDER_ICU;
     202         200 :     result->deterministic = deterministic;
     203         200 :     result->collate_is_c = false;
     204         200 :     result->ctype_is_c = false;
     205         200 :     if (GetDatabaseEncoding() == PG_UTF8)   /* pick table by database encoding */
     206         200 :         result->collate = &collate_methods_icu_utf8;
     207             :     else
     208           0 :         result->collate = &collate_methods_icu;
     209             : 
     210         200 :     return result;
     211             : #else
     212             :     /* could get here if a collation was created by a build with ICU */
     213             :     ereport(ERROR,
     214             :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     215             :              errmsg("ICU is not supported in this build")));
     216             : 
     217             :     return NULL;
     218             : #endif
     219             : }
     220             : 
     221             : #ifdef USE_ICU
     222             : 
     223             : /*
     224             :  * Wrapper around ucol_open() to handle API differences for older ICU
     225             :  * versions.
     226             :  *
     227             :  * Ensure that no path leaks a UCollator.
                     :  *
                     :  * loc_str must not be NULL: that would select ICU's default collator and is
                     :  * rejected with an ERROR.  Failure to open the collator is likewise
                     :  * reported with ereport(ERROR), always quoting the caller's original
                     :  * string rather than the rewritten one.  On success the open collator is
                     :  * returned; the callers in this file release it with ucol_close() or store
                     :  * it in the locale object.
     228             :  */
     229             : UCollator *
     230       68150 : pg_ucol_open(const char *loc_str)
     231             : {
     232             :     UCollator  *collator;
     233             :     UErrorCode  status;
     234       68150 :     const char *orig_str = loc_str; /* kept for error messages */
     235       68150 :     char       *fixed_str = NULL;   /* set only by the "und" rewrite below */
     236             : 
     237             :     /*
     238             :      * Must never open default collator, because it depends on the environment
     239             :      * and may change at any time. Should not happen, but check here to catch
     240             :      * bugs that might be hard to catch otherwise.
     241             :      *
     242             :      * NB: the default collator is not the same as the collator for the root
     243             :      * locale. The root locale may be specified as the empty string, "und", or
     244             :      * "root". The default collator is opened by passing NULL to ucol_open().
     245             :      */
     246       68150 :     if (loc_str == NULL)
     247           0 :         elog(ERROR, "opening default collator is not supported");
     248             : 
     249             :     /*
     250             :      * In ICU versions 54 and earlier, "und" is not a recognized spelling of
     251             :      * the root locale. If the first component of the locale is "und", replace
     252             :      * with "root" before opening.
     253             :      */
     254             :     if (U_ICU_VERSION_MAJOR_NUM < 55)   /* tests a version macro, fixed per build */
     255             :     {
     256             :         char        lang[ULOC_LANG_CAPACITY];
     257             : 
     258             :         status = U_ZERO_ERROR;
     259             :         uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
     260             :         if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
     261             :         {
     262             :             ereport(ERROR,
     263             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     264             :                      errmsg("could not get language from locale \"%s\": %s",
     265             :                             loc_str, u_errorName(status))));
     266             :         }
     267             : 
     268             :         if (strcmp(lang, "und") == 0)
     269             :         {
     270             :             const char *remainder = loc_str + strlen("und");
     271             :             /* room for "root", the rest of the locale string, and the nul */
     272             :             fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
     273             :             strcpy(fixed_str, "root");
     274             :             strcat(fixed_str, remainder);
     275             : 
     276             :             loc_str = fixed_str;
     277             :         }
     278             :     }
     279             : 
     280       68150 :     status = U_ZERO_ERROR;
     281       68150 :     collator = ucol_open(loc_str, &status);
     282       68150 :     if (U_FAILURE(status))
     283          12 :         ereport(ERROR,
     284             :         /* use original string for error report */
     285             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     286             :                  errmsg("could not open collator for locale \"%s\": %s",
     287             :                         orig_str, u_errorName(status))));
     288             : 
     289             :     if (U_ICU_VERSION_MAJOR_NUM < 54)   /* tests a version macro, fixed per build */
     290             :     {
     291             :         status = U_ZERO_ERROR;
     292             :         icu_set_collation_attributes(collator, loc_str, &status);
     293             : 
     294             :         /*
     295             :          * Pretend the error came from ucol_open(), for consistent error
     296             :          * message across ICU versions.
     297             :          */
     298             :         if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
     299             :         {
     300             :             ucol_close(collator);   /* close before ERROR so it is not leaked */
     301             :             ereport(ERROR,
     302             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     303             :                      errmsg("could not open collator for locale \"%s\": %s",
     304             :                             orig_str, u_errorName(status))));
     305             :         }
     306             :     }
     307             : 
     308       68138 :     if (fixed_str != NULL)
     309           0 :         pfree(fixed_str);
     310             : 
     311       68138 :     return collator;
     312             : }
     313             : 
     314             : /*
     315             :  * Create a UCollator with the given locale string and rules.
     316             :  *
     317             :  * Ensure that no path leaks a UCollator.
                     :  *
                     :  * icurules may be NULL, in which case this is simply pg_ucol_open().  With
                     :  * rules, the locale's standard rules are fetched, the caller's rules are
                     :  * appended, and a collator is opened from the combined rule string.  A
                     :  * failure to open that collator is reported with ereport(ERROR) rather
                     :  * than returned.
     318             :  */
     319             : static UCollator *
     320         210 : make_icu_collator(const char *iculocstr, const char *icurules)
     321             : {
     322         210 :     if (!icurules)
     323             :     {
     324             :         /* simple case without rules */
     325         198 :         return pg_ucol_open(iculocstr);
     326             :     }
     327             :     else
     328             :     {
     329             :         UCollator  *collator_std_rules;
     330             :         UCollator  *collator_all_rules;
     331             :         const UChar *std_rules;
     332             :         UChar      *my_rules;
     333             :         UChar      *all_rules;
     334             :         int32_t     length;
     335             :         int32_t     total;
     336             :         UErrorCode  status;
     337             : 
     338             :         /*
     339             :          * If rules are specified, we extract the rules of the standard
     340             :          * collation, add our own rules, and make a new collator with the
     341             :          * combined rules.
     342             :          */
     343          12 :         icu_to_uchar(&my_rules, icurules, strlen(icurules));
     344             : 
     345          12 :         collator_std_rules = pg_ucol_open(iculocstr);
     346             : 
     347          12 :         std_rules = ucol_getRules(collator_std_rules, &length);
     348             :         /* NOTE(review): 'length' is filled in above but never read afterwards */
     349          12 :         total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
     350             : 
     351             :         /* avoid leaking collator on OOM */
     352          12 :         all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
     353          12 :         if (!all_rules)
     354             :         {
     355           0 :             ucol_close(collator_std_rules);
     356           0 :             ereport(ERROR,
     357             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
     358             :                      errmsg("out of memory")));
     359             :         }
     360             : 
     361          12 :         u_strcpy(all_rules, std_rules);
     362          12 :         u_strcat(all_rules, my_rules);
     363             :         /* std_rules is not referenced past this point, so close its collator */
     364          12 :         ucol_close(collator_std_rules);
     365             : 
     366          12 :         status = U_ZERO_ERROR;
     367          12 :         collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
     368             :                                             UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
     369             :                                             NULL, &status);
     370          12 :         if (U_FAILURE(status))
     371             :         {
     372           6 :             ereport(ERROR,
     373             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     374             :                      errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
     375             :                             iculocstr, icurules, u_errorName(status))));
     376             :         }
     377             :         /*
                     :          * NOTE(review): my_rules and all_rules are not pfree'd anywhere in
                     :          * this function; presumably they are left to memory-context
                     :          * cleanup -- confirm.
                     :          */
     378           6 :         return collator_all_rules;
     379             :     }
     380             : }
     381             : /*
                     :  * Lower-case src with ICU's u_strToLower and deliver the result through
                     :  * icu_from_uchar() using dest/destsize.
                     :  *
                     :  * The input is converted to UChars, case-mapped by icu_convert_case() with
                     :  * 'locale', and converted back; both temporary UChar buffers are pfree'd
                     :  * before returning.  The return value is whatever icu_from_uchar() reports.
                     :  *
                     :  * NOTE(review): srclen is ssize_t but is passed as icu_to_uchar()'s size_t
                     :  * nbytes argument; confirm how a negative length is meant to be handled.
                     :  */
     382             : size_t
     383         516 : strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     384             :              pg_locale_t locale)
     385             : {
     386             :     int32_t     len_uchar;
     387             :     int32_t     len_conv;
     388             :     UChar      *buff_uchar;
     389             :     UChar      *buff_conv;
     390             :     size_t      result_len;
     391             : 
     392         516 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     393         516 :     len_conv = icu_convert_case(u_strToLower, locale,
     394             :                                 &buff_conv, buff_uchar, len_uchar);
     395         516 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     396         516 :     pfree(buff_uchar);
     397         516 :     pfree(buff_conv);
     398             : 
     399         516 :     return result_len;
     400             : }
     401             : /*
                     :  * Title-case src with the local u_strToTitle_default_BI callback and
                     :  * deliver the result through icu_from_uchar() using dest/destsize.
                     :  *
                     :  * The input is converted to UChars, case-mapped by icu_convert_case() with
                     :  * 'locale', and converted back; both temporary UChar buffers are pfree'd
                     :  * before returning.  The return value is whatever icu_from_uchar() reports.
                     :  *
                     :  * NOTE(review): srclen is ssize_t but is passed as icu_to_uchar()'s size_t
                     :  * nbytes argument; confirm how a negative length is meant to be handled.
                     :  */
     402             : size_t
     403          30 : strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     404             :              pg_locale_t locale)
     405             : {
     406             :     int32_t     len_uchar;
     407             :     int32_t     len_conv;
     408             :     UChar      *buff_uchar;
     409             :     UChar      *buff_conv;
     410             :     size_t      result_len;
     411             : 
     412          30 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     413          30 :     len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
     414             :                                 &buff_conv, buff_uchar, len_uchar);
     415          30 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     416          30 :     pfree(buff_uchar);
     417          30 :     pfree(buff_conv);
     418             : 
     419          30 :     return result_len;
     420             : }
     421             : /*
                     :  * Upper-case src with ICU's u_strToUpper and deliver the result through
                     :  * icu_from_uchar() using dest/destsize.
                     :  *
                     :  * The input is converted to UChars, case-mapped by icu_convert_case() with
                     :  * 'locale', and converted back; both temporary UChar buffers are pfree'd
                     :  * before returning.  The return value is whatever icu_from_uchar() reports.
                     :  *
                     :  * NOTE(review): srclen is ssize_t but is passed as icu_to_uchar()'s size_t
                     :  * nbytes argument; confirm how a negative length is meant to be handled.
                     :  */
     422             : size_t
     423          54 : strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     424             :              pg_locale_t locale)
     425             : {
     426             :     int32_t     len_uchar;
     427             :     int32_t     len_conv;
     428             :     UChar      *buff_uchar;
     429             :     UChar      *buff_conv;
     430             :     size_t      result_len;
     431             : 
     432          54 :     len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
     433          54 :     len_conv = icu_convert_case(u_strToUpper, locale,
     434             :                                 &buff_conv, buff_uchar, len_uchar);
     435          54 :     result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
     436          54 :     pfree(buff_uchar);
     437          54 :     pfree(buff_conv);
     438             : 
     439          54 :     return result_len;
     440             : }
     441             : 
     442             : /*
     443             :  * strncoll_icu_utf8
     444             :  *
     445             :  * Compare arg1 and arg2 with ucol_strcollUTF8(), using the collator stored
     446             :  * in 'locale'.  Only built when HAVE_UCOL_STRCOLLUTF8 is defined; asserts a
     447             :  * UTF8 database.  An argument length of -1 means the string is NUL-terminated.
                     :  *
                     :  * Returns the comparison result from ICU; an ICU failure status is
                     :  * reported with ereport(ERROR).
     448             :  */
     449             : #ifdef HAVE_UCOL_STRCOLLUTF8
     450             : int
     451       22886 : strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
     452             :                   pg_locale_t locale)
     453             : {
     454             :     int         result;
     455             :     UErrorCode  status;
     456             : 
     457             :     Assert(locale->provider == COLLPROVIDER_ICU);
     458             : 
     459             :     Assert(GetDatabaseEncoding() == PG_UTF8);
     460             : 
     461       22886 :     status = U_ZERO_ERROR;
     462       22886 :     result = ucol_strcollUTF8(locale->info.icu.ucol,
     463             :                               arg1, len1,
     464             :                               arg2, len2,
     465             :                               &status);
     466       22886 :     if (U_FAILURE(status))
     467           0 :         ereport(ERROR,
     468             :                 (errmsg("collation failed: %s", u_errorName(status))));
     469             : 
     470       22886 :     return result;
     471             : }
     472             : #endif
     473             : 
     474             : /*
                     :  * Generate the ICU sort key for src into dest, passing destsize to
                     :  * ucol_getSortKey() as the output size.
                     :  *
                     :  * The source is first converted to UChars, in the on-stack buffer when the
                     :  * converted form fits in TEXTBUFLEN bytes and in a palloc'd one otherwise.
                     :  * Returns the length reported by ucol_getSortKey() minus the terminating
                     :  * nul that ICU counts.
                     :  *
                     :  * 'srclen' of -1 means the strings are NUL-terminated
                     :  */
     475             : size_t
     476       10020 : strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
     477             :              pg_locale_t locale)
     478             : {
     479             :     char        sbuf[TEXTBUFLEN];
     480       10020 :     char       *buf = sbuf;
     481             :     UChar      *uchar;
     482             :     int32_t     ulen;
     483             :     size_t      uchar_bsize;
     484             :     Size        result_bsize;
     485             : 
     486             :     Assert(locale->provider == COLLPROVIDER_ICU);
     487             : 
     488       10020 :     init_icu_converter();
     489             : 
     490       10020 :     ulen = uchar_length(icu_converter, src, srclen);
     491             :     /* bytes needed for ulen UChars plus one more */
     492       10020 :     uchar_bsize = (ulen + 1) * sizeof(UChar);
     493             : 
     494       10020 :     if (uchar_bsize > TEXTBUFLEN)
     495           0 :         buf = palloc(uchar_bsize);
     496             :     /* NOTE(review): sbuf is a char array viewed as UChar; presumably aligned well enough -- confirm */
     497       10020 :     uchar = (UChar *) buf;
     498             : 
     499       10020 :     ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
     500             : 
     501       10020 :     result_bsize = ucol_getSortKey(locale->info.icu.ucol,
     502             :                                    uchar, ulen,
     503             :                                    (uint8_t *) dest, destsize);
     504             : 
     505             :     /*
     506             :      * ucol_getSortKey() counts the nul-terminator in the result length, but
     507             :      * this function should not.
     508             :      */
     509             :     Assert(result_bsize > 0);
     510       10020 :     result_bsize--;
     511             : 
     512       10020 :     if (buf != sbuf)    /* only the palloc'd fallback needs freeing */
     513           0 :         pfree(buf);
     514             : 
     515             :     /* if dest is defined, it should be nul-terminated */
     516             :     Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
     517             : 
     518       10020 :     return result_bsize;
     519             : }
     520             : 
     521             : /*
                     :  * Write a sort-key prefix for the UTF8 string src into dest with a single
                     :  * ucol_nextSortKeyPart() call, fed by a UTF8 character iterator and given
                     :  * destsize as the output size.  Returns that call's result; an ICU failure
                     :  * status is reported with ereport(ERROR).  Asserts a UTF8 database.
                     :  *
                     :  * 'srclen' of -1 means the strings are NUL-terminated
                     :  */
     522             : size_t
     523        1656 : strnxfrm_prefix_icu_utf8(char *dest, size_t destsize,
     524             :                          const char *src, ssize_t srclen,
     525             :                          pg_locale_t locale)
     526             : {
     527             :     size_t      result;
     528             :     UCharIterator iter;
     529             :     uint32_t    state[2];
     530             :     UErrorCode  status;
     531             : 
     532             :     Assert(locale->provider == COLLPROVIDER_ICU);
     533             : 
     534             :     Assert(GetDatabaseEncoding() == PG_UTF8);
     535             : 
     536        1656 :     uiter_setUTF8(&iter, src, srclen);
     537        1656 :     state[0] = state[1] = 0;    /* won't need that again */
     538        1656 :     status = U_ZERO_ERROR;
     539        1656 :     result = ucol_nextSortKeyPart(locale->info.icu.ucol,
     540             :                                   &iter,
     541             :                                   state,
     542             :                                   (uint8_t *) dest,
     543             :                                   destsize,
     544             :                                   &status);
     545        1656 :     if (U_FAILURE(status))
     546           0 :         ereport(ERROR,
     547             :                 (errmsg("sort key generation failed: %s",
     548             :                         u_errorName(status))));
     549             : 
     550        1656 :     return result;
     551             : }
     552             : /*
                     :  * Return the version of the ICU collator for locale 'collcollate' as a
                     :  * pstrdup'd string.  A collator is opened with pg_ucol_open() just long
                     :  * enough to read its version and is closed again before the string is
                     :  * built.
                     :  */
     553             : char *
     554       67792 : get_collation_actual_version_icu(const char *collcollate)
     555             : {
     556             :     UCollator  *collator;
     557             :     UVersionInfo versioninfo;
     558             :     char        buf[U_MAX_VERSION_STRING_LENGTH];
     559             : 
     560       67792 :     collator = pg_ucol_open(collcollate);
     561             : 
     562       67792 :     ucol_getVersion(collator, versioninfo);
     563       67792 :     ucol_close(collator);
     564             : 
     565       67792 :     u_versionToString(versioninfo, buf);
     566       67792 :     return pstrdup(buf);
     567             : }
     568             : 
     569             : /*
     570             :  * Convert a string in the database encoding into a string of UChars.
     571             :  *
     572             :  * The source string at buff is of length nbytes
     573             :  * (it needn't be nul-terminated)
     574             :  *
     575             :  * *buff_uchar receives a pointer to the palloc'd result string, and
     576             :  * the function's result is the number of UChars generated.
     577             :  *
     578             :  * The result string is nul-terminated, though most callers rely on the
     579             :  * result length instead.
                     :  *
                     :  * The conversion is done in two passes over the input: uchar_length()
                     :  * sizes the output, then uchar_convert() fills the freshly allocated
                     :  * buffer.  nbytes is handed to both helpers as their int32_t length
                     :  * argument.
     580             :  */
     581             : static int32_t
     582         612 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
     583             : {
     584             :     int32_t     len_uchar;
     585             :     /* icu_converter is used below; presumably this call sets it up -- confirm */
     586         612 :     init_icu_converter();
     587             : 
     588         612 :     len_uchar = uchar_length(icu_converter, buff, nbytes);
     589             :     /* one UChar more than the measured length, leaving room for the nul */
     590         612 :     *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
     591         612 :     len_uchar = uchar_convert(icu_converter,
     592             :                               *buff_uchar, len_uchar + 1, buff, nbytes);
     593             : 
     594         612 :     return len_uchar;
     595             : }
     596             : 
     597             : /*
     598             :  * Convert a string of UChars into the database encoding.
     599             :  *
     600             :  * The source string at buff_uchar is of length len_uchar
     601             :  * (it needn't be nul-terminated)
     602             :  *
     603             :  * *result receives a pointer to the palloc'd result string, and the
     604             :  * function's result is the number of bytes generated (not counting nul).
     605             :  *
     606             :  * The result string is nul-terminated.
     607             :  */
     608             : static size_t
     609         600 : icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar)
     610             : {
     611             :     UErrorCode  status;
     612             :     int32_t     len_result;
     613             : 
     614         600 :     init_icu_converter();
     615             : 
     616         600 :     status = U_ZERO_ERROR;
     617         600 :     len_result = ucnv_fromUChars(icu_converter, NULL, 0,
     618             :                                  buff_uchar, len_uchar, &status);
     619         600 :     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
     620           0 :         ereport(ERROR,
     621             :                 (errmsg("%s failed: %s", "ucnv_fromUChars",
     622             :                         u_errorName(status))));
     623             : 
     624         600 :     if (len_result + 1 > destsize)
     625          60 :         return len_result;
     626             : 
     627         540 :     status = U_ZERO_ERROR;
     628         540 :     len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1,
     629             :                                  buff_uchar, len_uchar, &status);
     630         540 :     if (U_FAILURE(status) ||
     631         540 :         status == U_STRING_NOT_TERMINATED_WARNING)
     632           0 :         ereport(ERROR,
     633             :                 (errmsg("%s failed: %s", "ucnv_fromUChars",
     634             :                         u_errorName(status))));
     635             : 
     636         540 :     return len_result;
     637             : }
     638             : 
     639             : static int32_t
     640         600 : icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
     641             :                  UChar **buff_dest, UChar *buff_source, int32_t len_source)
     642             : {
     643             :     UErrorCode  status;
     644             :     int32_t     len_dest;
     645             : 
     646         600 :     len_dest = len_source;      /* try first with same length */
     647         600 :     *buff_dest = palloc(len_dest * sizeof(**buff_dest));
     648         600 :     status = U_ZERO_ERROR;
     649         600 :     len_dest = func(*buff_dest, len_dest, buff_source, len_source,
     650             :                     mylocale->info.icu.locale, &status);
     651         600 :     if (status == U_BUFFER_OVERFLOW_ERROR)
     652             :     {
     653             :         /* try again with adjusted length */
     654           0 :         pfree(*buff_dest);
     655           0 :         *buff_dest = palloc(len_dest * sizeof(**buff_dest));
     656           0 :         status = U_ZERO_ERROR;
     657           0 :         len_dest = func(*buff_dest, len_dest, buff_source, len_source,
     658             :                         mylocale->info.icu.locale, &status);
     659             :     }
     660         600 :     if (U_FAILURE(status))
     661           0 :         ereport(ERROR,
     662             :                 (errmsg("case conversion failed: %s", u_errorName(status))));
     663         600 :     return len_dest;
     664             : }
     665             : 
     666             : static int32_t
     667          30 : u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
     668             :                         const UChar *src, int32_t srcLength,
     669             :                         const char *locale,
     670             :                         UErrorCode *pErrorCode)
     671             : {
     672          30 :     return u_strToTitle(dest, destCapacity, src, srcLength,
     673             :                         NULL, locale, pErrorCode);
     674             : }
     675             : 
     676             : /*
     677             :  * strncoll_icu
     678             :  *
     679             :  * Convert the arguments from the database encoding to UChar strings, then
     680             :  * call ucol_strcoll(). An argument length of -1 means that the string is
     681             :  * NUL-terminated.
     682             :  *
     683             :  * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
     684             :  * caller should call that instead.
     685             :  */
     686             : static int
     687           0 : strncoll_icu(const char *arg1, ssize_t len1,
     688             :              const char *arg2, ssize_t len2, pg_locale_t locale)
     689             : {
     690             :     char        sbuf[TEXTBUFLEN];
     691           0 :     char       *buf = sbuf;
     692             :     int32_t     ulen1;
     693             :     int32_t     ulen2;
     694             :     size_t      bufsize1;
     695             :     size_t      bufsize2;
     696             :     UChar      *uchar1,
     697             :                *uchar2;
     698             :     int         result;
     699             : 
     700             :     Assert(locale->provider == COLLPROVIDER_ICU);
     701             : 
     702             :     /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */
     703             : #ifdef HAVE_UCOL_STRCOLLUTF8
     704             :     Assert(GetDatabaseEncoding() != PG_UTF8);
     705             : #endif
     706             : 
     707           0 :     init_icu_converter();
     708             : 
     709           0 :     ulen1 = uchar_length(icu_converter, arg1, len1);
     710           0 :     ulen2 = uchar_length(icu_converter, arg2, len2);
     711             : 
     712           0 :     bufsize1 = (ulen1 + 1) * sizeof(UChar);
     713           0 :     bufsize2 = (ulen2 + 1) * sizeof(UChar);
     714             : 
     715           0 :     if (bufsize1 + bufsize2 > TEXTBUFLEN)
     716           0 :         buf = palloc(bufsize1 + bufsize2);
     717             : 
     718           0 :     uchar1 = (UChar *) buf;
     719           0 :     uchar2 = (UChar *) (buf + bufsize1);
     720             : 
     721           0 :     ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
     722           0 :     ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
     723             : 
     724           0 :     result = ucol_strcoll(locale->info.icu.ucol,
     725             :                           uchar1, ulen1,
     726             :                           uchar2, ulen2);
     727             : 
     728           0 :     if (buf != sbuf)
     729           0 :         pfree(buf);
     730             : 
     731           0 :     return result;
     732             : }
     733             : 
     734             : /* 'srclen' of -1 means the strings are NUL-terminated */
     735             : static size_t
     736           0 : strnxfrm_prefix_icu(char *dest, size_t destsize,
     737             :                     const char *src, ssize_t srclen,
     738             :                     pg_locale_t locale)
     739             : {
     740             :     char        sbuf[TEXTBUFLEN];
     741           0 :     char       *buf = sbuf;
     742             :     UCharIterator iter;
     743             :     uint32_t    state[2];
     744             :     UErrorCode  status;
     745           0 :     int32_t     ulen = -1;
     746           0 :     UChar      *uchar = NULL;
     747             :     size_t      uchar_bsize;
     748             :     Size        result_bsize;
     749             : 
     750             :     Assert(locale->provider == COLLPROVIDER_ICU);
     751             : 
     752             :     /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */
     753             :     Assert(GetDatabaseEncoding() != PG_UTF8);
     754             : 
     755           0 :     init_icu_converter();
     756             : 
     757           0 :     ulen = uchar_length(icu_converter, src, srclen);
     758             : 
     759           0 :     uchar_bsize = (ulen + 1) * sizeof(UChar);
     760             : 
     761           0 :     if (uchar_bsize > TEXTBUFLEN)
     762           0 :         buf = palloc(uchar_bsize);
     763             : 
     764           0 :     uchar = (UChar *) buf;
     765             : 
     766           0 :     ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
     767             : 
     768           0 :     uiter_setString(&iter, uchar, ulen);
     769           0 :     state[0] = state[1] = 0;    /* won't need that again */
     770           0 :     status = U_ZERO_ERROR;
     771           0 :     result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
     772             :                                         &iter,
     773             :                                         state,
     774             :                                         (uint8_t *) dest,
     775             :                                         destsize,
     776             :                                         &status);
     777           0 :     if (U_FAILURE(status))
     778           0 :         ereport(ERROR,
     779             :                 (errmsg("sort key generation failed: %s",
     780             :                         u_errorName(status))));
     781             : 
     782           0 :     return result_bsize;
     783             : }
     784             : 
     785             : static void
     786       11232 : init_icu_converter(void)
     787             : {
     788             :     const char *icu_encoding_name;
     789             :     UErrorCode  status;
     790             :     UConverter *conv;
     791             : 
     792       11232 :     if (icu_converter)
     793       11226 :         return;                 /* already done */
     794             : 
     795           6 :     icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
     796           6 :     if (!icu_encoding_name)
     797           0 :         ereport(ERROR,
     798             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     799             :                  errmsg("encoding \"%s\" not supported by ICU",
     800             :                         pg_encoding_to_char(GetDatabaseEncoding()))));
     801             : 
     802           6 :     status = U_ZERO_ERROR;
     803           6 :     conv = ucnv_open(icu_encoding_name, &status);
     804           6 :     if (U_FAILURE(status))
     805           0 :         ereport(ERROR,
     806             :                 (errmsg("could not open ICU converter for encoding \"%s\": %s",
     807             :                         icu_encoding_name, u_errorName(status))));
     808             : 
     809           6 :     icu_converter = conv;
     810             : }
     811             : 
     812             : /*
     813             :  * Find length, in UChars, of given string if converted to UChar string.
     814             :  *
     815             :  * A length of -1 indicates that the input string is NUL-terminated.
     816             :  */
     817             : static size_t
     818       10632 : uchar_length(UConverter *converter, const char *str, int32_t len)
     819             : {
     820       10632 :     UErrorCode  status = U_ZERO_ERROR;
     821             :     int32_t     ulen;
     822             : 
     823       10632 :     ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
     824       10632 :     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
     825           0 :         ereport(ERROR,
     826             :                 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
     827       10632 :     return ulen;
     828             : }
     829             : 
     830             : /*
     831             :  * Convert the given source string into a UChar string, stored in dest, and
     832             :  * return the length (in UChars).
     833             :  *
     834             :  * A srclen of -1 indicates that the input string is NUL-terminated.
     835             :  */
     836             : static int32_t
     837       10632 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
     838             :               const char *src, int32_t srclen)
     839             : {
     840       10632 :     UErrorCode  status = U_ZERO_ERROR;
     841             :     int32_t     ulen;
     842             : 
     843       10632 :     status = U_ZERO_ERROR;
     844       10632 :     ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
     845       10632 :     if (U_FAILURE(status))
     846           0 :         ereport(ERROR,
     847             :                 (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
     848       10632 :     return ulen;
     849             : }
     850             : 
     851             : /*
     852             :  * Parse collation attributes from the given locale string and apply them to
     853             :  * the open collator.
     854             :  *
     855             :  * First, the locale string is canonicalized to an ICU format locale ID such
     856             :  * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
     857             :  * the key-value arguments.
     858             :  *
     859             :  * Starting with ICU version 54, the attributes are processed automatically by
     860             :  * ucol_open(), so this is only necessary for emulating this behavior on older
     861             :  * versions.
     862             :  */
     863             : pg_attribute_unused()
     864             : static void
     865           0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
     866             :                              UErrorCode *status)
     867             : {
     868             :     int32_t     len;
     869             :     char       *icu_locale_id;
     870             :     char       *lower_str;
     871             :     char       *str;
     872             :     char       *token;
     873             : 
     874             :     /*
     875             :      * The input locale may be a BCP 47 language tag, e.g.
     876             :      * "und-u-kc-ks-level1", which expresses the same attributes in a
     877             :      * different form. It will be converted to the equivalent ICU format
     878             :      * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
     879             :      * uloc_canonicalize().
     880             :      */
     881           0 :     *status = U_ZERO_ERROR;
     882           0 :     len = uloc_canonicalize(loc, NULL, 0, status);
     883           0 :     icu_locale_id = palloc(len + 1);
     884           0 :     *status = U_ZERO_ERROR;
     885           0 :     len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
     886           0 :     if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
     887           0 :         return;
     888             : 
     889           0 :     lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
     890             : 
     891           0 :     pfree(icu_locale_id);
     892             : 
     893           0 :     str = strchr(lower_str, '@');
     894           0 :     if (!str)
     895           0 :         return;
     896           0 :     str++;
     897             : 
     898           0 :     while ((token = strsep(&str, ";")))
     899             :     {
     900           0 :         char       *e = strchr(token, '=');
     901             : 
     902           0 :         if (e)
     903             :         {
     904             :             char       *name;
     905             :             char       *value;
     906             :             UColAttribute uattr;
     907             :             UColAttributeValue uvalue;
     908             : 
     909           0 :             *status = U_ZERO_ERROR;
     910             : 
     911           0 :             *e = '\0';
     912           0 :             name = token;
     913           0 :             value = e + 1;
     914             : 
     915             :             /*
     916             :              * See attribute name and value lists in ICU i18n/coll.cpp
     917             :              */
     918           0 :             if (strcmp(name, "colstrength") == 0)
     919           0 :                 uattr = UCOL_STRENGTH;
     920           0 :             else if (strcmp(name, "colbackwards") == 0)
     921           0 :                 uattr = UCOL_FRENCH_COLLATION;
     922           0 :             else if (strcmp(name, "colcaselevel") == 0)
     923           0 :                 uattr = UCOL_CASE_LEVEL;
     924           0 :             else if (strcmp(name, "colcasefirst") == 0)
     925           0 :                 uattr = UCOL_CASE_FIRST;
     926           0 :             else if (strcmp(name, "colalternate") == 0)
     927           0 :                 uattr = UCOL_ALTERNATE_HANDLING;
     928           0 :             else if (strcmp(name, "colnormalization") == 0)
     929           0 :                 uattr = UCOL_NORMALIZATION_MODE;
     930           0 :             else if (strcmp(name, "colnumeric") == 0)
     931           0 :                 uattr = UCOL_NUMERIC_COLLATION;
     932             :             else
     933             :                 /* ignore if unknown */
     934           0 :                 continue;
     935             : 
     936           0 :             if (strcmp(value, "primary") == 0)
     937           0 :                 uvalue = UCOL_PRIMARY;
     938           0 :             else if (strcmp(value, "secondary") == 0)
     939           0 :                 uvalue = UCOL_SECONDARY;
     940           0 :             else if (strcmp(value, "tertiary") == 0)
     941           0 :                 uvalue = UCOL_TERTIARY;
     942           0 :             else if (strcmp(value, "quaternary") == 0)
     943           0 :                 uvalue = UCOL_QUATERNARY;
     944           0 :             else if (strcmp(value, "identical") == 0)
     945           0 :                 uvalue = UCOL_IDENTICAL;
     946           0 :             else if (strcmp(value, "no") == 0)
     947           0 :                 uvalue = UCOL_OFF;
     948           0 :             else if (strcmp(value, "yes") == 0)
     949           0 :                 uvalue = UCOL_ON;
     950           0 :             else if (strcmp(value, "shifted") == 0)
     951           0 :                 uvalue = UCOL_SHIFTED;
     952           0 :             else if (strcmp(value, "non-ignorable") == 0)
     953           0 :                 uvalue = UCOL_NON_IGNORABLE;
     954           0 :             else if (strcmp(value, "lower") == 0)
     955           0 :                 uvalue = UCOL_LOWER_FIRST;
     956           0 :             else if (strcmp(value, "upper") == 0)
     957           0 :                 uvalue = UCOL_UPPER_FIRST;
     958             :             else
     959             :             {
     960           0 :                 *status = U_ILLEGAL_ARGUMENT_ERROR;
     961           0 :                 break;
     962             :             }
     963             : 
     964           0 :             ucol_setAttribute(collator, uattr, uvalue, status);
     965             :         }
     966             :     }
     967             : 
     968           0 :     pfree(lower_str);
     969             : }
     970             : 
     971             : #endif                          /* USE_ICU */

Generated by: LCOV version 1.14