LCOV - code coverage report
Current view: top level - src/common - unicode_case.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 92.9 % 112 104
Test Date: 2026-05-21 09:16:36 Functions: 84.6 % 13 11
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  * unicode_case.c
       3              :  *      Unicode case mapping and case conversion.
       4              :  *
       5              :  * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
       6              :  *
       7              :  * IDENTIFICATION
       8              :  *    src/common/unicode_case.c
       9              :  *
      10              :  *-------------------------------------------------------------------------
      11              :  */
      12              : #ifndef FRONTEND
      13              : #include "postgres.h"
      14              : #else
      15              : #include "postgres_fe.h"
      16              : #endif
      17              : 
      18              : #include "common/unicode_case.h"
      19              : #include "common/unicode_case_table.h"
      20              : #include "common/unicode_category.h"
      21              : #include "mb/pg_wchar.h"
      22              : 
      23              : enum CaseMapResult
      24              : {
      25              :     CASEMAP_SELF,
      26              :     CASEMAP_SIMPLE,
      27              :     CASEMAP_SPECIAL,
      28              : };
      29              : 
      30              : /*
      31              :  * Map for each case kind.
      32              :  */
      33              : static const char32_t *const casekind_map[NCaseKind] =
      34              : {
      35              :     [CaseLower] = case_map_lower,
      36              :     [CaseTitle] = case_map_title,
      37              :     [CaseUpper] = case_map_upper,
      38              :     [CaseFold] = case_map_fold,
      39              : };
      40              : 
      41              : static char32_t find_case_map(char32_t ucs, const char32_t *map);
      42              : static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
      43              :                            CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
      44              :                            void *wbstate);
      45              : static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
      46              :                                   const char *src, size_t srclen, size_t srcoff,
      47              :                                   char32_t *simple, const char32_t **special);
      48              : 
      49              : char32_t
      50          325 : unicode_lowercase_simple(char32_t code)
      51              : {
      52          325 :     char32_t    cp = find_case_map(code, case_map_lower);
      53              : 
      54          325 :     return cp != 0 ? cp : code;
      55              : }
      56              : 
      57              : char32_t
      58            0 : unicode_titlecase_simple(char32_t code)
      59              : {
      60            0 :     char32_t    cp = find_case_map(code, case_map_title);
      61              : 
      62            0 :     return cp != 0 ? cp : code;
      63              : }
      64              : 
      65              : char32_t
      66          325 : unicode_uppercase_simple(char32_t code)
      67              : {
      68          325 :     char32_t    cp = find_case_map(code, case_map_upper);
      69              : 
      70          325 :     return cp != 0 ? cp : code;
      71              : }
      72              : 
      73              : char32_t
      74            0 : unicode_casefold_simple(char32_t code)
      75              : {
      76            0 :     char32_t    cp = find_case_map(code, case_map_fold);
      77              : 
      78            0 :     return cp != 0 ? cp : code;
      79              : }
      80              : 
      81              : /*
      82              :  * unicode_strlower()
      83              :  *
      84              :  * Convert src to lowercase, and return the result length (not including
      85              :  * terminating NUL).
      86              :  *
      87              :  * String src must be encoded in UTF-8.
      88              :  *
      89              :  * Result string is stored in dst, truncating if larger than dstsize. If
      90              :  * dstsize is greater than the result length, dst will be NUL-terminated;
      91              :  * otherwise not.
      92              :  *
      93              :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      94              :  * required buffer size before allocating.
      95              :  *
      96              :  * If full is true, use special case mappings if available and if the
      97              :  * conditions are satisfied.
      98              :  */
      99              : size_t
     100         6312 : unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen,
     101              :                  bool full)
     102              : {
     103         6312 :     return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
     104              :                         NULL);
     105              : }
     106              : 
     107              : /*
     108              :  * unicode_strtitle()
     109              :  *
     110              :  * Convert src to titlecase, and return the result length (not including
     111              :  * terminating NUL).
     112              :  *
     113              :  * String src must be encoded in UTF-8.
     114              :  *
     115              :  * Result string is stored in dst, truncating if larger than dstsize. If
     116              :  * dstsize is greater than the result length, dst will be NUL-terminated;
     117              :  * otherwise not.
     118              :  *
     119              :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     120              :  * required buffer size before allocating.
     121              :  *
     122              :  * If full is true, use special case mappings if available and if the
     123              :  * conditions are satisfied. Otherwise, use only simple mappings and use
     124              :  * uppercase instead of titlecase.
     125              :  *
     126              :  * Titlecasing requires knowledge about word boundaries, which is provided by
     127              :  * the callback wbnext. A word boundary is the offset of the start of a word
     128              :  * or the offset of the character immediately following a word.
     129              :  *
     130              :  * The caller is expected to initialize and free the callback state
     131              :  * wbstate. The callback should first return offset 0 for the first boundary;
     132              :  * then the offset of each subsequent word boundary; then the total length of
     133              :  * the string to indicate the final boundary.
     134              :  */
     135              : size_t
     136          133 : unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen,
     137              :                  bool full, WordBoundaryNext wbnext, void *wbstate)
     138              : {
     139          133 :     return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
     140              :                         wbstate);
     141              : }
     142              : 
     143              : /*
     144              :  * unicode_strupper()
     145              :  *
     146              :  * Convert src to uppercase, and return the result length (not including
     147              :  * terminating NUL).
     148              :  *
     149              :  * String src must be encoded in UTF-8.
     150              :  *
     151              :  * Result string is stored in dst, truncating if larger than dstsize. If
     152              :  * dstsize is greater than the result length, dst will be NUL-terminated;
     153              :  * otherwise not.
     154              :  *
     155              :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     156              :  * required buffer size before allocating.
     157              :  *
     158              :  * If full is true, use special case mappings if available and if the
     159              :  * conditions are satisfied.
     160              :  */
     161              : size_t
     162       158561 : unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen,
     163              :                  bool full)
     164              : {
     165       158561 :     return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
     166              :                         NULL);
     167              : }
     168              : 
     169              : /*
     170              :  * unicode_strfold()
     171              :  *
     172              :  * Case fold src, and return the result length (not including terminating
     173              :  * NUL).
     174              :  *
     175              :  * String src must be encoded in UTF-8.
     176              :  *
     177              :  * Result string is stored in dst, truncating if larger than dstsize. If
     178              :  * dstsize is greater than the result length, dst will be NUL-terminated;
     179              :  * otherwise not.
     180              :  *
     181              :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     182              :  * required buffer size before allocating.
     183              :  */
     184              : size_t
     185           10 : unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen,
     186              :                 bool full)
     187              : {
     188           10 :     return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
     189              :                         NULL);
     190              : }
     191              : 
     192              : /*
     193              :  * Implement Unicode Default Case Conversion algorithm.
     194              :  *
     195              :  * If str_casekind is CaseLower or CaseUpper, map each character in the string
     196              :  * for which a mapping is available.
     197              :  *
     198              :  * If str_casekind is CaseTitle, maps characters found on a word boundary to
     199              :  * titlecase (or uppercase if full is false) and other characters to
     200              :  * lowercase. NB: does not currently implement the Unicode behavior in which
     201              :  * the word boundary is adjusted to the next Cased character. That behavior
     202              :  * could be implemented as an option, but it doesn't match the default
     203              :  * behavior of ICU, nor does it match the documented behavior of INITCAP().
     204              :  *
     205              :  * If full is true, use special mappings for relevant characters, which can
     206              :  * map a single codepoint to multiple codepoints, or depend on conditions.
     207              :  */
     208              : static size_t
     209       165016 : convert_case(char *dst, size_t dstsize, const char *src, size_t srclen,
     210              :              CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
     211              :              void *wbstate)
     212              : {
     213              :     /* character CaseKind varies while titlecasing */
     214       165016 :     CaseKind    chr_casekind = str_casekind;
     215       165016 :     size_t      srcoff = 0;
     216       165016 :     size_t      result_len = 0;
     217       165016 :     size_t      boundary = 0;
     218              : 
     219              :     Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
     220              :            (str_casekind != CaseTitle && !wbnext && !wbstate));
     221              : 
     222       165016 :     if (str_casekind == CaseTitle)
     223              :     {
     224          133 :         boundary = wbnext(wbstate);
     225              :         Assert(boundary == 0);  /* start of text is always a boundary */
     226              :     }
     227              : 
     228       494087 :     while (srcoff < srclen)
     229              :     {
     230       329071 :         char32_t    u1 = utf8_to_unicode((const unsigned char *) src + srcoff);
     231       329071 :         int         u1len = unicode_utf8len(u1);
     232       329071 :         char32_t    simple = 0;
     233       329071 :         const char32_t *special = NULL;
     234              :         enum CaseMapResult casemap_result;
     235              : 
     236       329071 :         if (str_casekind == CaseTitle)
     237              :         {
     238         1032 :             if (srcoff == boundary)
     239              :             {
     240          431 :                 chr_casekind = full ? CaseTitle : CaseUpper;
     241          431 :                 boundary = wbnext(wbstate);
     242              :             }
     243              :             else
     244          601 :                 chr_casekind = CaseLower;
     245              :         }
     246              : 
     247       329071 :         casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
     248              :                                  &simple, &special);
     249              : 
     250       329071 :         switch (casemap_result)
     251              :         {
     252           88 :             case CASEMAP_SELF:
     253              :                 /* no mapping; copy bytes from src */
     254              :                 Assert(simple == 0);
     255              :                 Assert(special == NULL);
     256           88 :                 if (result_len + u1len <= dstsize)
     257           88 :                     memcpy(dst + result_len, src + srcoff, u1len);
     258              : 
     259           88 :                 result_len += u1len;
     260           88 :                 break;
     261       328900 :             case CASEMAP_SIMPLE:
     262              :                 {
     263              :                     /* replace with single character */
     264       328900 :                     char32_t    u2 = simple;
     265       328900 :                     char32_t    u2len = unicode_utf8len(u2);
     266              : 
     267              :                     Assert(special == NULL);
     268       328900 :                     if (result_len + u2len <= dstsize)
     269       328868 :                         unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     270              : 
     271       328900 :                     result_len += u2len;
     272              :                 }
     273       328900 :                 break;
     274           83 :             case CASEMAP_SPECIAL:
     275              :                 /* replace with up to MAX_CASE_EXPANSION characters */
     276              :                 Assert(simple == 0);
     277          205 :                 for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
     278              :                 {
     279          122 :                     char32_t    u2 = special[i];
     280          122 :                     size_t      u2len = unicode_utf8len(u2);
     281              : 
     282          122 :                     if (result_len + u2len <= dstsize)
     283          122 :                         unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     284              : 
     285          122 :                     result_len += u2len;
     286              :                 }
     287           83 :                 break;
     288              :         }
     289              : 
     290       329071 :         srcoff += u1len;
     291              :     }
     292              : 
     293       165016 :     if (result_len < dstsize)
     294       164968 :         dst[result_len] = '\0';
     295              : 
     296       165016 :     return result_len;
     297              : }
     298              : 
     299              : /*
     300              :  * Check that the condition matches Final_Sigma, described in Unicode Table
     301              :  * 3-17. The character at the given offset must be directly preceded by a
     302              :  * Cased character, and must not be directly followed by a Cased character.
     303              :  *
     304              :  * Case_Ignorable characters are ignored. NB: some characters may be both
     305              :  * Cased and Case_Ignorable, in which case they are ignored.
     306              :  */
     307              : static bool
     308           50 : check_final_sigma(const unsigned char *str, size_t len, size_t offset)
     309              : {
     310              :     /* the start of the string is not preceded by a Cased character */
     311           50 :     if (offset == 0)
     312            5 :         return false;
     313              : 
     314              :     /* iterate backwards, looking for Cased character */
     315          120 :     for (int i = offset - 1; i >= 0; i--)
     316              :     {
     317          120 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     318              :         {
     319           65 :             char32_t    curr = utf8_to_unicode(str + i);
     320              : 
     321           65 :             if (pg_u_prop_case_ignorable(curr))
     322           20 :                 continue;
     323           45 :             else if (pg_u_prop_cased(curr))
     324           35 :                 break;
     325              :             else
     326           10 :                 return false;
     327              :         }
     328           55 :         else if ((str[i] & 0xC0) == 0x80)
     329           55 :             continue;
     330              : 
     331              :         Assert(false);          /* invalid UTF-8 */
     332              :     }
     333              : 
     334              :     /* end of string is not followed by a Cased character */
     335           35 :     if (offset == len)
     336            0 :         return true;
     337              : 
     338              :     /* iterate forwards, looking for Cased character */
     339          110 :     for (int i = offset + 1; i < len && str[i] != '\0'; i++)
     340              :     {
     341           95 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     342              :         {
     343           40 :             char32_t    curr = utf8_to_unicode(str + i);
     344              : 
     345           40 :             if (pg_u_prop_case_ignorable(curr))
     346           20 :                 continue;
     347           20 :             else if (pg_u_prop_cased(curr))
     348           15 :                 return false;
     349              :             else
     350            5 :                 break;
     351              :         }
     352           55 :         else if ((str[i] & 0xC0) == 0x80)
     353           55 :             continue;
     354              : 
     355              :         Assert(false);          /* invalid UTF-8 */
     356              :     }
     357              : 
     358           20 :     return true;
     359              : }
     360              : 
     361              : /*
     362              :  * Unicode allows for special casing to be applied only under certain
     363              :  * circumstances. The only currently-supported condition is Final_Sigma.
     364              :  */
     365              : static bool
     366          113 : check_special_conditions(int conditions, const char *str, size_t len,
     367              :                          size_t offset)
     368              : {
     369          113 :     if (conditions == 0)
     370           63 :         return true;
     371           50 :     else if (conditions == PG_U_FINAL_SIGMA)
     372           50 :         return check_final_sigma((const unsigned char *) str, len, offset);
     373              : 
     374              :     /* no other conditions supported */
     375              :     Assert(false);
     376            0 :     return false;
     377              : }
     378              : 
     379              : /*
     380              :  * Map the given character to the requested case.
     381              :  *
     382              :  * If full is true, and a special case mapping is found and the conditions are
     383              :  * met, 'special' is set to the mapping result (which is an array of up to
     384              :  * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
     385              :  *
     386              :  * Otherwise, search for a simple mapping, and if found, set 'simple' to the
     387              :  * result and return CASEMAP_SIMPLE.
     388              :  *
     389              :  * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
     390              :  * character without modification.
     391              :  */
     392              : static enum CaseMapResult
     393       329071 : casemap(char32_t u1, CaseKind casekind, bool full,
     394              :         const char *src, size_t srclen, size_t srcoff,
     395              :         char32_t *simple, const char32_t **special)
     396              : {
     397              :     uint16      idx;
     398              : 
     399              :     /* Fast path for codepoints < 0x80 */
     400       329071 :     if (u1 < 0x80)
     401              :     {
     402              :         /*
     403              :          * The first elements in all tables are reserved as 0 (as NULL). The
     404              :          * data starts at index 1, not 0.
     405              :          */
     406       327650 :         *simple = casekind_map[casekind][u1 + 1];
     407              : 
     408       327650 :         return CASEMAP_SIMPLE;
     409              :     }
     410              : 
     411         1421 :     idx = case_index(u1);
     412              : 
     413         1421 :     if (idx == 0)
     414           88 :         return CASEMAP_SELF;
     415              : 
     416         1446 :     if (full && case_map_special[idx] &&
     417          113 :         check_special_conditions(special_case[case_map_special[idx]].conditions,
     418              :                                  src, srclen, srcoff))
     419              :     {
     420           83 :         *special = special_case[case_map_special[idx]].map[casekind];
     421           83 :         return CASEMAP_SPECIAL;
     422              :     }
     423              : 
     424         1250 :     *simple = casekind_map[casekind][idx];
     425              : 
     426         1250 :     return CASEMAP_SIMPLE;
     427              : }
     428              : 
     429              : /*
     430              :  * Find entry in simple case map.
     431              :  * If the entry does not exist, 0 will be returned.
     432              :  */
     433              : static char32_t
     434          650 : find_case_map(char32_t ucs, const char32_t *map)
     435              : {
     436              :     /* Fast path for codepoints < 0x80 */
     437          650 :     if (ucs < 0x80)
     438              :         /* The first elements in all tables are reserved as 0 (as NULL). */
     439          362 :         return map[ucs + 1];
     440          288 :     return map[case_index(ucs)];
     441              : }
        

Generated by: LCOV version 2.0-1