LCOV - code coverage report
Current view: top level - src/common - unicode_case.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 99 104 95.2 %
Date: 2025-01-18 04:15:08 Functions: 9 10 90.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  * unicode_case.c
       3             :  *      Unicode case mapping and case conversion.
       4             :  *
       5             :  * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
       6             :  *
       7             :  * IDENTIFICATION
       8             :  *    src/common/unicode_case.c
       9             :  *
      10             :  *-------------------------------------------------------------------------
      11             :  */
      12             : #ifndef FRONTEND
      13             : #include "postgres.h"
      14             : #else
      15             : #include "postgres_fe.h"
      16             : #endif
      17             : 
      18             : #include "common/unicode_case.h"
      19             : #include "common/unicode_case_table.h"
      20             : #include "common/unicode_category.h"
      21             : #include "mb/pg_wchar.h"
      22             : 
      23             : static const pg_case_map *find_case_map(pg_wchar ucs);
      24             : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
      25             :                            CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
      26             :                            void *wbstate);
      27             : static bool check_special_conditions(int conditions, const char *str,
      28             :                                      size_t len, size_t offset);
      29             : 
      30             : pg_wchar
      31         528 : unicode_lowercase_simple(pg_wchar code)
      32             : {
      33         528 :     const pg_case_map *map = find_case_map(code);
      34             : 
      35         528 :     return map ? map->simplemap[CaseLower] : code;
      36             : }
      37             : 
      38             : pg_wchar
      39           0 : unicode_titlecase_simple(pg_wchar code)
      40             : {
      41           0 :     const pg_case_map *map = find_case_map(code);
      42             : 
      43           0 :     return map ? map->simplemap[CaseTitle] : code;
      44             : }
      45             : 
      46             : pg_wchar
      47         528 : unicode_uppercase_simple(pg_wchar code)
      48             : {
      49         528 :     const pg_case_map *map = find_case_map(code);
      50             : 
      51         528 :     return map ? map->simplemap[CaseUpper] : code;
      52             : }
      53             : 
      54             : /*
      55             :  * unicode_strlower()
      56             :  *
      57             :  * Convert src to lowercase, and return the result length (not including
      58             :  * terminating NUL).
      59             :  *
      60             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      61             :  * NUL-terminated.
      62             :  *
      63             :  * Result string is stored in dst, truncating if larger than dstsize. If
      64             :  * dstsize is greater than the result length, dst will be NUL-terminated;
      65             :  * otherwise not.
      66             :  *
      67             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      68             :  * required buffer size before allocating.
      69             :  *
      70             :  * If full is true, use special case mappings if available and if the
      71             :  * conditions are satisfied.
      72             :  */
      73             : size_t
      74       11922 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
      75             :                  bool full)
      76             : {
      77       11922 :     return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
      78             :                         NULL);
      79             : }
      80             : 
      81             : /*
      82             :  * unicode_strtitle()
      83             :  *
      84             :  * Convert src to titlecase, and return the result length (not including
      85             :  * terminating NUL).
      86             :  *
      87             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      88             :  * NUL-terminated.
      89             :  *
      90             :  * Result string is stored in dst, truncating if larger than dstsize. If
      91             :  * dstsize is greater than the result length, dst will be NUL-terminated;
      92             :  * otherwise not.
      93             :  *
      94             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      95             :  * required buffer size before allocating.
      96             :  *
      97             :  * If full is true, use special case mappings if available and if the
      98             :  * conditions are satisfied. Otherwise, use only simple mappings and use
      99             :  * uppercase instead of titlecase.
     100             :  *
     101             :  * Titlecasing requires knowledge about word boundaries, which is provided by
     102             :  * the callback wbnext. A word boundary is the offset of the start of a word
     103             :  * or the offset of the character immediately following a word.
     104             :  *
     105             :  * The caller is expected to initialize and free the callback state
     106             :  * wbstate. The callback should first return offset 0 for the first boundary;
     107             :  * then the offset of each subsequent word boundary; then the total length of
     108             :  * the string to indicate the final boundary.
     109             :  */
     110             : size_t
     111         170 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     112             :                  bool full, WordBoundaryNext wbnext, void *wbstate)
     113             : {
     114         170 :     return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
     115             :                         wbstate);
     116             : }
     117             : 
     118             : /*
     119             :  * unicode_strupper()
     120             :  *
     121             :  * Convert src to uppercase, and return the result length (not including
     122             :  * terminating NUL).
     123             :  *
     124             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     125             :  * NUL-terminated.
     126             :  *
     127             :  * Result string is stored in dst, truncating if larger than dstsize. If
     128             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     129             :  * otherwise not.
     130             :  *
     131             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     132             :  * required buffer size before allocating.
     133             :  *
     134             :  * If full is true, use special case mappings if available and if the
     135             :  * conditions are satisfied.
     136             :  */
     137             : size_t
     138      316858 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     139             :                  bool full)
     140             : {
     141      316858 :     return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
     142             :                         NULL);
     143             : }
     144             : 
     145             : /*
     146             :  * Implement Unicode Default Case Conversion algorithm.
     147             :  *
     148             :  * If str_casekind is CaseLower or CaseUpper, map each character in the string
     149             :  * for which a mapping is available.
     150             :  *
     151             :  * If str_casekind is CaseTitle, maps characters found on a word boundary to
     152             :  * titlecase (or uppercase if full is false) and other characters to
     153             :  * lowercase. NB: does not currently implement the Unicode behavior in which
     154             :  * the word boundary is adjusted to the next Cased character. That behavior
     155             :  * could be implemented as an option, but it doesn't match the default
     156             :  * behavior of ICU, nor does it match the documented behavior of INITCAP().
     157             :  *
     158             :  * If full is true, use special mappings for relevant characters, which can
     159             :  * map a single codepoint to multiple codepoints, or depend on conditions.
     160             :  */
     161             : static size_t
     162      328950 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     163             :              CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
     164             :              void *wbstate)
     165             : {
     166             :     /* character CaseKind varies while titlecasing */
     167      328950 :     CaseKind    chr_casekind = str_casekind;
     168      328950 :     size_t      srcoff = 0;
     169      328950 :     size_t      result_len = 0;
     170      328950 :     size_t      boundary = 0;
     171             : 
     172             :     Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
     173             :            (str_casekind != CaseTitle && !wbnext && !wbstate));
     174             : 
     175      328950 :     if (str_casekind == CaseTitle)
     176             :     {
     177         170 :         boundary = wbnext(wbstate);
     178             :         Assert(boundary == 0);  /* start of text is always a boundary */
     179             :     }
     180             : 
     181      978330 :     while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
     182             :     {
     183      649380 :         pg_wchar    u1 = utf8_to_unicode((unsigned char *) src + srcoff);
     184      649380 :         int         u1len = unicode_utf8len(u1);
     185      649380 :         const pg_case_map *casemap = find_case_map(u1);
     186      649380 :         const pg_special_case *special = NULL;
     187             : 
     188      649380 :         if (str_casekind == CaseTitle)
     189             :         {
     190        1314 :             if (srcoff == boundary)
     191             :             {
     192         510 :                 chr_casekind = full ? CaseTitle : CaseUpper;
     193         510 :                 boundary = wbnext(wbstate);
     194             :             }
     195             :             else
     196         804 :                 chr_casekind = CaseLower;
     197             :         }
     198             : 
     199             :         /*
     200             :          * Find special case that matches the conditions, if any.
     201             :          *
     202             :          * Note: only a single special mapping per codepoint is currently
     203             :          * supported, though Unicode allows for multiple special mappings for
     204             :          * a single codepoint.
     205             :          */
     206      649380 :         if (full && casemap && casemap->special_case)
     207             :         {
     208         126 :             int16       conditions = casemap->special_case->conditions;
     209             : 
     210             :             Assert(casemap->special_case->codepoint == u1);
     211         126 :             if (check_special_conditions(conditions, src, srclen, srcoff))
     212          96 :                 special = casemap->special_case;
     213             :         }
     214             : 
     215             :         /* perform mapping, update result_len, and write to dst */
     216      649380 :         if (special)
     217             :         {
     218         228 :             for (int i = 0; i < MAX_CASE_EXPANSION; i++)
     219             :             {
     220         228 :                 pg_wchar    u2 = special->map[chr_casekind][i];
     221         228 :                 size_t      u2len = unicode_utf8len(u2);
     222             : 
     223         228 :                 if (u2 == '\0')
     224          96 :                     break;
     225             : 
     226         132 :                 if (result_len + u2len <= dstsize)
     227         132 :                     unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     228             : 
     229         132 :                 result_len += u2len;
     230             :             }
     231             :         }
     232      649284 :         else if (casemap)
     233             :         {
     234      649236 :             pg_wchar    u2 = casemap->simplemap[chr_casekind];
     235      649236 :             pg_wchar    u2len = unicode_utf8len(u2);
     236             : 
     237      649236 :             if (result_len + u2len <= dstsize)
     238      649188 :                 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     239             : 
     240      649236 :             result_len += u2len;
     241             :         }
     242             :         else
     243             :         {
     244             :             /* no mapping; copy bytes from src */
     245          48 :             if (result_len + u1len <= dstsize)
     246          48 :                 memcpy(dst + result_len, src + srcoff, u1len);
     247             : 
     248          48 :             result_len += u1len;
     249             :         }
     250             : 
     251      649380 :         srcoff += u1len;
     252             :     }
     253             : 
     254      328950 :     if (result_len < dstsize)
     255      328878 :         dst[result_len] = '\0';
     256             : 
     257      328950 :     return result_len;
     258             : }
     259             : 
     260             : /*
     261             :  * Check that the condition matches Final_Sigma, described in Unicode Table
     262             :  * 3-17. The character at the given offset must be directly preceded by a
     263             :  * Cased character, and must not be directly followed by a Cased character.
     264             :  *
     265             :  * Case_Ignorable characters are ignored. NB: some characters may be both
     266             :  * Cased and Case_Ignorable, in which case they are ignored.
     267             :  */
     268             : static bool
     269          54 : check_final_sigma(const unsigned char *str, size_t len, size_t offset)
     270             : {
     271             :     /* the start of the string is not preceded by a Cased character */
     272          54 :     if (offset == 0)
     273           6 :         return false;
     274             : 
     275             :     /* iterate backwards, looking for Cased character */
     276         138 :     for (int i = offset - 1; i >= 0; i--)
     277             :     {
     278         138 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     279             :         {
     280          72 :             pg_wchar    curr = utf8_to_unicode(str + i);
     281             : 
     282          72 :             if (pg_u_prop_case_ignorable(curr))
     283          24 :                 continue;
     284          48 :             else if (pg_u_prop_cased(curr))
     285          42 :                 break;
     286             :             else
     287           6 :                 return false;
     288             :         }
     289          66 :         else if ((str[i] & 0xC0) == 0x80)
     290          66 :             continue;
     291             : 
     292             :         Assert(false);          /* invalid UTF-8 */
     293             :     }
     294             : 
     295             :     /* end of string is not followed by a Cased character */
     296          42 :     if (offset == len)
     297           0 :         return true;
     298             : 
     299             :     /* iterate forwards, looking for Cased character */
     300         132 :     for (int i = offset + 1; i < len && str[i] != '\0'; i++)
     301             :     {
     302         114 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     303             :         {
     304          48 :             pg_wchar    curr = utf8_to_unicode(str + i);
     305             : 
     306          48 :             if (pg_u_prop_case_ignorable(curr))
     307          24 :                 continue;
     308          24 :             else if (pg_u_prop_cased(curr))
     309          18 :                 return false;
     310             :             else
     311           6 :                 break;
     312             :         }
     313          66 :         else if ((str[i] & 0xC0) == 0x80)
     314          66 :             continue;
     315             : 
     316             :         Assert(false);          /* invalid UTF-8 */
     317             :     }
     318             : 
     319          24 :     return true;
     320             : }
     321             : 
     322             : static bool
     323         126 : check_special_conditions(int conditions, const char *str, size_t len,
     324             :                          size_t offset)
     325             : {
     326         126 :     if (conditions == 0)
     327          72 :         return true;
     328          54 :     else if (conditions == PG_U_FINAL_SIGMA)
     329          54 :         return check_final_sigma((unsigned char *) str, len, offset);
     330             : 
     331             :     /* no other conditions supported */
     332             :     Assert(false);
     333           0 :     return false;
     334             : }
     335             : 
     336             : /* find entry in simple case map, if any */
     337             : static const pg_case_map *
     338      650436 : find_case_map(pg_wchar ucs)
     339             : {
     340             :     int         min;
     341             :     int         mid;
     342             :     int         max;
     343             : 
     344             :     /* all chars <= 0x80 are stored in array for fast lookup */
     345             :     Assert(lengthof(case_map) >= 0x80);
     346      650436 :     if (ucs < 0x80)
     347             :     {
     348      648294 :         const pg_case_map *map = &case_map[ucs];
     349             : 
     350             :         Assert(map->codepoint == ucs);
     351      648294 :         return map;
     352             :     }
     353             : 
     354             :     /* otherwise, binary search */
     355        2142 :     min = 0x80;
     356        2142 :     max = lengthof(case_map) - 1;
     357       19404 :     while (max >= min)
     358             :     {
     359       19356 :         mid = (min + max) / 2;
     360       19356 :         if (ucs > case_map[mid].codepoint)
     361        6330 :             min = mid + 1;
     362       13026 :         else if (ucs < case_map[mid].codepoint)
     363       10932 :             max = mid - 1;
     364             :         else
     365        2094 :             return &case_map[mid];
     366             :     }
     367             : 
     368          48 :     return NULL;
     369             : }

Generated by: LCOV version 1.14