LCOV - code coverage report
Current view: top level - src/common - unicode_case.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 92.9 % 112 104
Test Date: 2026-03-09 23:14:56 Functions: 84.6 % 13 11
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  * unicode_case.c
       3              :  *      Unicode case mapping and case conversion.
       4              :  *
       5              :  * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
       6              :  *
       7              :  * IDENTIFICATION
       8              :  *    src/common/unicode_case.c
       9              :  *
      10              :  *-------------------------------------------------------------------------
      11              :  */
      12              : #ifndef FRONTEND
      13              : #include "postgres.h"
      14              : #else
      15              : #include "postgres_fe.h"
      16              : #endif
      17              : 
      18              : #include "common/unicode_case.h"
      19              : #include "common/unicode_case_table.h"
      20              : #include "common/unicode_category.h"
      21              : #include "mb/pg_wchar.h"
      22              : 
      23              : enum CaseMapResult
      24              : {
      25              :     CASEMAP_SELF,
      26              :     CASEMAP_SIMPLE,
      27              :     CASEMAP_SPECIAL,
      28              : };
      29              : 
      30              : /*
      31              :  * Map for each case kind.
      32              :  */
      33              : static const char32_t *const casekind_map[NCaseKind] =
      34              : {
      35              :     [CaseLower] = case_map_lower,
      36              :     [CaseTitle] = case_map_title,
      37              :     [CaseUpper] = case_map_upper,
      38              :     [CaseFold] = case_map_fold,
      39              : };
      40              : 
      41              : static char32_t find_case_map(char32_t ucs, const char32_t *map);
      42              : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
      43              :                            CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
      44              :                            void *wbstate);
      45              : static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
      46              :                                   const char *src, size_t srclen, size_t srcoff,
      47              :                                   char32_t *simple, const char32_t **special);
      48              : 
      49              : char32_t
      50          264 : unicode_lowercase_simple(char32_t code)
      51              : {
      52          264 :     char32_t    cp = find_case_map(code, case_map_lower);
      53              : 
      54          264 :     return cp != 0 ? cp : code;
      55              : }
      56              : 
      57              : char32_t
      58            0 : unicode_titlecase_simple(char32_t code)
      59              : {
      60            0 :     char32_t    cp = find_case_map(code, case_map_title);
      61              : 
      62            0 :     return cp != 0 ? cp : code;
      63              : }
      64              : 
      65              : char32_t
      66          264 : unicode_uppercase_simple(char32_t code)
      67              : {
      68          264 :     char32_t    cp = find_case_map(code, case_map_upper);
      69              : 
      70          264 :     return cp != 0 ? cp : code;
      71              : }
      72              : 
      73              : char32_t
      74            0 : unicode_casefold_simple(char32_t code)
      75              : {
      76            0 :     char32_t    cp = find_case_map(code, case_map_fold);
      77              : 
      78            0 :     return cp != 0 ? cp : code;
      79              : }
      80              : 
      81              : /*
      82              :  * unicode_strlower()
      83              :  *
      84              :  * Convert src to lowercase, and return the result length (not including
      85              :  * terminating NUL).
      86              :  *
      87              :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      88              :  * NUL-terminated.
      89              :  *
      90              :  * Result string is stored in dst, truncating if larger than dstsize. If
      91              :  * dstsize is greater than the result length, dst will be NUL-terminated;
      92              :  * otherwise not.
      93              :  *
      94              :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      95              :  * required buffer size before allocating.
      96              :  *
      97              :  * If full is true, use special case mappings if available and if the
      98              :  * conditions are satisfied.
      99              :  */
     100              : size_t
     101         6200 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     102              :                  bool full)
     103              : {
     104         6200 :     return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
     105              :                         NULL);
     106              : }
     107              : 
     108              : /*
     109              :  * unicode_strtitle()
     110              :  *
     111              :  * Convert src to titlecase, and return the result length (not including
     112              :  * terminating NUL).
     113              :  *
     114              :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     115              :  * NUL-terminated.
     116              :  *
     117              :  * Result string is stored in dst, truncating if larger than dstsize. If
     118              :  * dstsize is greater than the result length, dst will be NUL-terminated;
     119              :  * otherwise not.
     120              :  *
     121              :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     122              :  * required buffer size before allocating.
     123              :  *
     124              :  * If full is true, use special case mappings if available and if the
     125              :  * conditions are satisfied. Otherwise, use only simple mappings and use
     126              :  * uppercase instead of titlecase.
     127              :  *
     128              :  * Titlecasing requires knowledge about word boundaries, which is provided by
     129              :  * the callback wbnext. A word boundary is the offset of the start of a word
     130              :  * or the offset of the character immediately following a word.
     131              :  *
     132              :  * The caller is expected to initialize and free the callback state
     133              :  * wbstate. The callback should first return offset 0 for the first boundary;
     134              :  * then the offset of each subsequent word boundary; then the total length of
     135              :  * the string to indicate the final boundary.
     136              :  */
     137              : size_t
     138           97 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     139              :                  bool full, WordBoundaryNext wbnext, void *wbstate)
     140              : {
     141           97 :     return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
     142              :                         wbstate);
     143              : }
     144              : 
     145              : /*
     146              :  * unicode_strupper()
     147              :  *
     148              :  * Convert src to uppercase, and return the result length (not including
     149              :  * terminating NUL).
     150              :  *
     151              :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     152              :  * NUL-terminated.
     153              :  *
     154              :  * Result string is stored in dst, truncating if larger than dstsize. If
     155              :  * dstsize is greater than the result length, dst will be NUL-terminated;
     156              :  * otherwise not.
     157              :  *
     158              :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     159              :  * required buffer size before allocating.
     160              :  *
     161              :  * If full is true, use special case mappings if available and if the
     162              :  * conditions are satisfied.
     163              :  */
     164              : size_t
     165       158517 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     166              :                  bool full)
     167              : {
     168       158517 :     return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
     169              :                         NULL);
     170              : }
     171              : 
     172              : /*
     173              :  * unicode_strfold()
     174              :  *
     175              :  * Case fold src, and return the result length (not including terminating
     176              :  * NUL).
     177              :  *
     178              :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     179              :  * NUL-terminated.
     180              :  *
     181              :  * Result string is stored in dst, truncating if larger than dstsize. If
     182              :  * dstsize is greater than the result length, dst will be NUL-terminated;
     183              :  * otherwise not.
     184              :  *
     185              :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     186              :  * required buffer size before allocating.
     187              :  */
     188              : size_t
     189            6 : unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     190              :                 bool full)
     191              : {
     192            6 :     return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
     193              :                         NULL);
     194              : }
     195              : 
     196              : /*
     197              :  * Implement Unicode Default Case Conversion algorithm.
     198              :  *
     199              :  * If str_casekind is CaseLower or CaseUpper, map each character in the string
     200              :  * for which a mapping is available.
     201              :  *
     202              :  * If str_casekind is CaseTitle, maps characters found on a word boundary to
     203              :  * titlecase (or uppercase if full is false) and other characters to
     204              :  * lowercase. NB: does not currently implement the Unicode behavior in which
     205              :  * the word boundary is adjusted to the next Cased character. That behavior
     206              :  * could be implemented as an option, but it doesn't match the default
     207              :  * behavior of ICU, nor does it match the documented behavior of INITCAP().
     208              :  *
     209              :  * If full is true, use special mappings for relevant characters, which can
     210              :  * map a single codepoint to multiple codepoints, or depend on conditions.
     211              :  */
     212              : static size_t
     213       164820 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     214              :              CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
     215              :              void *wbstate)
     216              : {
     217              :     /* character CaseKind varies while titlecasing */
     218       164820 :     CaseKind    chr_casekind = str_casekind;
     219       164820 :     size_t      srcoff = 0;
     220       164820 :     size_t      result_len = 0;
     221       164820 :     size_t      boundary = 0;
     222              : 
     223              :     Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
     224              :            (str_casekind != CaseTitle && !wbnext && !wbstate));
     225              : 
     226       164820 :     if (str_casekind == CaseTitle)
     227              :     {
     228           97 :         boundary = wbnext(wbstate);
     229              :         Assert(boundary == 0);  /* start of text is always a boundary */
     230              :     }
     231              : 
     232       491971 :     while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
     233              :     {
     234       327151 :         char32_t    u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
     235       327151 :         int         u1len = unicode_utf8len(u1);
     236       327151 :         char32_t    simple = 0;
     237       327151 :         const char32_t *special = NULL;
     238              :         enum CaseMapResult casemap_result;
     239              : 
     240       327151 :         if (str_casekind == CaseTitle)
     241              :         {
     242          753 :             if (srcoff == boundary)
     243              :             {
     244          315 :                 chr_casekind = full ? CaseTitle : CaseUpper;
     245          315 :                 boundary = wbnext(wbstate);
     246              :             }
     247              :             else
     248          438 :                 chr_casekind = CaseLower;
     249              :         }
     250              : 
     251       327151 :         casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
     252              :                                  &simple, &special);
     253              : 
     254       327151 :         switch (casemap_result)
     255              :         {
     256           60 :             case CASEMAP_SELF:
     257              :                 /* no mapping; copy bytes from src */
     258              :                 Assert(simple == 0);
     259              :                 Assert(special == NULL);
     260           60 :                 if (result_len + u1len <= dstsize)
     261           60 :                     memcpy(dst + result_len, src + srcoff, u1len);
     262              : 
     263           60 :                 result_len += u1len;
     264           60 :                 break;
     265       327034 :             case CASEMAP_SIMPLE:
     266              :                 {
     267              :                     /* replace with single character */
     268       327034 :                     char32_t    u2 = simple;
     269       327034 :                     char32_t    u2len = unicode_utf8len(u2);
     270              : 
     271              :                     Assert(special == NULL);
     272       327034 :                     if (result_len + u2len <= dstsize)
     273       327010 :                         unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     274              : 
     275       327034 :                     result_len += u2len;
     276              :                 }
     277       327034 :                 break;
     278           57 :             case CASEMAP_SPECIAL:
     279              :                 /* replace with up to MAX_CASE_EXPANSION characters */
     280              :                 Assert(simple == 0);
     281          141 :                 for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
     282              :                 {
     283           84 :                     char32_t    u2 = special[i];
     284           84 :                     size_t      u2len = unicode_utf8len(u2);
     285              : 
     286           84 :                     if (result_len + u2len <= dstsize)
     287           84 :                         unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     288              : 
     289           84 :                     result_len += u2len;
     290              :                 }
     291           57 :                 break;
     292              :         }
     293              : 
     294       327151 :         srcoff += u1len;
     295              :     }
     296              : 
     297       164820 :     if (result_len < dstsize)
     298       164784 :         dst[result_len] = '\0';
     299              : 
     300       164820 :     return result_len;
     301              : }
     302              : 
     303              : /*
     304              :  * Check that the condition matches Final_Sigma, described in Unicode Table
     305              :  * 3-17. The character at the given offset must be directly preceded by a
     306              :  * Cased character, and must not be directly followed by a Cased character.
     307              :  *
     308              :  * Case_Ignorable characters are ignored. NB: some characters may be both
     309              :  * Cased and Case_Ignorable, in which case they are ignored.
     310              :  */
     311              : static bool
     312           30 : check_final_sigma(const unsigned char *str, size_t len, size_t offset)
     313              : {
     314              :     /* the start of the string is not preceded by a Cased character */
     315           30 :     if (offset == 0)
     316            3 :         return false;
     317              : 
     318              :     /* iterate backwards, looking for Cased character */
     319           72 :     for (int i = offset - 1; i >= 0; i--)
     320              :     {
     321           72 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     322              :         {
     323           39 :             char32_t    curr = utf8_to_unicode(str + i);
     324              : 
     325           39 :             if (pg_u_prop_case_ignorable(curr))
     326           12 :                 continue;
     327           27 :             else if (pg_u_prop_cased(curr))
     328           21 :                 break;
     329              :             else
     330            6 :                 return false;
     331              :         }
     332           33 :         else if ((str[i] & 0xC0) == 0x80)
     333           33 :             continue;
     334              : 
     335              :         Assert(false);          /* invalid UTF-8 */
     336              :     }
     337              : 
     338              :     /* end of string is not followed by a Cased character */
     339           21 :     if (offset == len)
     340            0 :         return true;
     341              : 
     342              :     /* iterate forwards, looking for Cased character */
     343           66 :     for (int i = offset + 1; i < len && str[i] != '\0'; i++)
     344              :     {
     345           57 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     346              :         {
     347           24 :             char32_t    curr = utf8_to_unicode(str + i);
     348              : 
     349           24 :             if (pg_u_prop_case_ignorable(curr))
     350           12 :                 continue;
     351           12 :             else if (pg_u_prop_cased(curr))
     352            9 :                 return false;
     353              :             else
     354            3 :                 break;
     355              :         }
     356           33 :         else if ((str[i] & 0xC0) == 0x80)
     357           33 :             continue;
     358              : 
     359              :         Assert(false);          /* invalid UTF-8 */
     360              :     }
     361              : 
     362           12 :     return true;
     363              : }
     364              : 
     365              : /*
     366              :  * Unicode allows for special casing to be applied only under certain
     367              :  * circumstances. The only currently-supported condition is Final_Sigma.
     368              :  */
     369              : static bool
     370           75 : check_special_conditions(int conditions, const char *str, size_t len,
     371              :                          size_t offset)
     372              : {
     373           75 :     if (conditions == 0)
     374           45 :         return true;
     375           30 :     else if (conditions == PG_U_FINAL_SIGMA)
     376           30 :         return check_final_sigma((const unsigned char *) str, len, offset);
     377              : 
     378              :     /* no other conditions supported */
     379              :     Assert(false);
     380            0 :     return false;
     381              : }
     382              : 
     383              : /*
     384              :  * Map the given character to the requested case.
     385              :  *
     386              :  * If full is true, and a special case mapping is found and the conditions are
     387              :  * met, 'special' is set to the mapping result (which is an array of up to
     388              :  * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
     389              :  *
     390              :  * Otherwise, search for a simple mapping, and if found, set 'simple' to the
     391              :  * result and return CASEMAP_SIMPLE.
     392              :  *
     393              :  * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
     394              :  * character without modification.
     395              :  */
     396              : static enum CaseMapResult
     397       327151 : casemap(char32_t u1, CaseKind casekind, bool full,
     398              :         const char *src, size_t srclen, size_t srcoff,
     399              :         char32_t *simple, const char32_t **special)
     400              : {
     401              :     uint16      idx;
     402              : 
     403              :     /* Fast path for codepoints < 0x80 */
     404       327151 :     if (u1 < 0x80)
     405              :     {
     406              :         /*
     407              :          * The first elements in all tables are reserved as 0 (as NULL). The
     408              :          * data starts at index 1, not 0.
     409              :          */
     410       326128 :         *simple = casekind_map[casekind][u1 + 1];
     411              : 
     412       326128 :         return CASEMAP_SIMPLE;
     413              :     }
     414              : 
     415         1023 :     idx = case_index(u1);
     416              : 
     417         1023 :     if (idx == 0)
     418           60 :         return CASEMAP_SELF;
     419              : 
     420         1038 :     if (full && case_map_special[idx] &&
     421           75 :         check_special_conditions(special_case[case_map_special[idx]].conditions,
     422              :                                  src, srclen, srcoff))
     423              :     {
     424           57 :         *special = special_case[case_map_special[idx]].map[casekind];
     425           57 :         return CASEMAP_SPECIAL;
     426              :     }
     427              : 
     428          906 :     *simple = casekind_map[casekind][idx];
     429              : 
     430          906 :     return CASEMAP_SIMPLE;
     431              : }
     432              : 
     433              : /*
     434              :  * Find entry in simple case map.
     435              :  * If the entry does not exist, 0 will be returned.
     436              :  */
     437              : static char32_t
     438          528 : find_case_map(char32_t ucs, const char32_t *map)
     439              : {
     440              :     /* Fast path for codepoints < 0x80 */
     441          528 :     if (ucs < 0x80)
     442              :         /* The first elements in all tables are reserved as 0 (as NULL). */
     443          312 :         return map[ucs + 1];
     444          216 :     return map[case_index(ucs)];
     445              : }
        

Generated by: LCOV version 2.0-1