LCOV - code coverage report
Current view: top level - src/common - unicode_case.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 101 109 92.7 %
Date: 2025-02-22 07:14:56 Functions: 10 12 83.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  * unicode_case.c
       3             :  *      Unicode case mapping and case conversion.
       4             :  *
       5             :  * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
       6             :  *
       7             :  * IDENTIFICATION
       8             :  *    src/common/unicode_case.c
       9             :  *
      10             :  *-------------------------------------------------------------------------
      11             :  */
      12             : #ifndef FRONTEND
      13             : #include "postgres.h"
      14             : #else
      15             : #include "postgres_fe.h"
      16             : #endif
      17             : 
      18             : #include "common/unicode_case.h"
      19             : #include "common/unicode_case_table.h"
      20             : #include "common/unicode_category.h"
      21             : #include "mb/pg_wchar.h"
      22             : 
      23             : static const pg_case_map *find_case_map(pg_wchar ucs);
      24             : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
      25             :                            CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
      26             :                            void *wbstate);
      27             : static bool check_special_conditions(int conditions, const char *str,
      28             :                                      size_t len, size_t offset);
      29             : 
      30             : pg_wchar
      31         528 : unicode_lowercase_simple(pg_wchar code)
      32             : {
      33         528 :     const pg_case_map *map = find_case_map(code);
      34             : 
      35         528 :     return map ? map->simplemap[CaseLower] : code;
      36             : }
      37             : 
      38             : pg_wchar
      39           0 : unicode_titlecase_simple(pg_wchar code)
      40             : {
      41           0 :     const pg_case_map *map = find_case_map(code);
      42             : 
      43           0 :     return map ? map->simplemap[CaseTitle] : code;
      44             : }
      45             : 
      46             : pg_wchar
      47         528 : unicode_uppercase_simple(pg_wchar code)
      48             : {
      49         528 :     const pg_case_map *map = find_case_map(code);
      50             : 
      51         528 :     return map ? map->simplemap[CaseUpper] : code;
      52             : }
      53             : 
      54             : pg_wchar
      55           0 : unicode_casefold_simple(pg_wchar code)
      56             : {
      57           0 :     const pg_case_map *map = find_case_map(code);
      58             : 
      59           0 :     return map ? map->simplemap[CaseFold] : code;
      60             : }
      61             : 
      62             : /*
      63             :  * unicode_strlower()
      64             :  *
      65             :  * Convert src to lowercase, and return the result length (not including
      66             :  * terminating NUL).
      67             :  *
      68             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      69             :  * NUL-terminated.
      70             :  *
      71             :  * Result string is stored in dst, truncating if larger than dstsize. If
      72             :  * dstsize is greater than the result length, dst will be NUL-terminated;
      73             :  * otherwise not.
      74             :  *
      75             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      76             :  * required buffer size before allocating.
      77             :  *
      78             :  * If full is true, use special case mappings if available and if the
      79             :  * conditions are satisfied.
      80             :  */
      81             : size_t
      82       11946 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
      83             :                  bool full)
      84             : {
      85       11946 :     return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
      86             :                         NULL);
      87             : }
      88             : 
      89             : /*
      90             :  * unicode_strtitle()
      91             :  *
      92             :  * Convert src to titlecase, and return the result length (not including
      93             :  * terminating NUL).
      94             :  *
      95             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      96             :  * NUL-terminated.
      97             :  *
      98             :  * Result string is stored in dst, truncating if larger than dstsize. If
      99             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     100             :  * otherwise not.
     101             :  *
     102             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     103             :  * required buffer size before allocating.
     104             :  *
     105             :  * If full is true, use special case mappings if available and if the
     106             :  * conditions are satisfied. Otherwise, use only simple mappings and use
     107             :  * uppercase instead of titlecase.
     108             :  *
     109             :  * Titlecasing requires knowledge about word boundaries, which is provided by
     110             :  * the callback wbnext. A word boundary is the offset of the start of a word
     111             :  * or the offset of the character immediately following a word.
     112             :  *
     113             :  * The caller is expected to initialize and free the callback state
     114             :  * wbstate. The callback should first return offset 0 for the first boundary;
     115             :  * then the offset of each subsequent word boundary; then the total length of
     116             :  * the string to indicate the final boundary.
     117             :  */
     118             : size_t
     119         170 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     120             :                  bool full, WordBoundaryNext wbnext, void *wbstate)
     121             : {
     122         170 :     return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
     123             :                         wbstate);
     124             : }
     125             : 
     126             : /*
     127             :  * unicode_strupper()
     128             :  *
     129             :  * Convert src to uppercase, and return the result length (not including
     130             :  * terminating NUL).
     131             :  *
     132             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     133             :  * NUL-terminated.
     134             :  *
     135             :  * Result string is stored in dst, truncating if larger than dstsize. If
     136             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     137             :  * otherwise not.
     138             :  *
     139             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     140             :  * required buffer size before allocating.
     141             :  *
     142             :  * If full is true, use special case mappings if available and if the
     143             :  * conditions are satisfied.
     144             :  */
     145             : size_t
     146      316858 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     147             :                  bool full)
     148             : {
     149      316858 :     return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
     150             :                         NULL);
     151             : }
     152             : 
     153             : /*
     154             :  * unicode_strfold()
     155             :  *
     156             :  * Case fold src, and return the result length (not including terminating
     157             :  * NUL).
     158             :  *
     159             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     160             :  * NUL-terminated.
     161             :  *
     162             :  * Result string is stored in dst, truncating if larger than dstsize. If
     163             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     164             :  * otherwise not.
     165             :  *
     166             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     167             :  * required buffer size before allocating.
     168             :  */
     169             : size_t
     170          12 : unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     171             :                 bool full)
     172             : {
     173          12 :     return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
     174             :                         NULL);
     175             : }
     176             : 
     177             : /*
     178             :  * Implement Unicode Default Case Conversion algorithm.
     179             :  *
     180             :  * If str_casekind is CaseLower or CaseUpper, map each character in the string
     181             :  * for which a mapping is available.
     182             :  *
     183             :  * If str_casekind is CaseTitle, maps characters found on a word boundary to
     184             :  * titlecase (or uppercase if full is false) and other characters to
     185             :  * lowercase. NB: does not currently implement the Unicode behavior in which
     186             :  * the word boundary is adjusted to the next Cased character. That behavior
     187             :  * could be implemented as an option, but it doesn't match the default
     188             :  * behavior of ICU, nor does it match the documented behavior of INITCAP().
     189             :  *
     190             :  * If full is true, use special mappings for relevant characters, which can
     191             :  * map a single codepoint to multiple codepoints, or depend on conditions.
     192             :  */
     193             : static size_t
     194      328986 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     195             :              CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
     196             :              void *wbstate)
     197             : {
     198             :     /* character CaseKind varies while titlecasing */
     199      328986 :     CaseKind    chr_casekind = str_casekind;
     200      328986 :     size_t      srcoff = 0;
     201      328986 :     size_t      result_len = 0;
     202      328986 :     size_t      boundary = 0;
     203             : 
     204             :     Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
     205             :            (str_casekind != CaseTitle && !wbnext && !wbstate));
     206             : 
     207      328986 :     if (str_casekind == CaseTitle)
     208             :     {
     209         170 :         boundary = wbnext(wbstate);
     210             :         Assert(boundary == 0);  /* start of text is always a boundary */
     211             :     }
     212             : 
     213      979402 :     while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
     214             :     {
     215      650416 :         pg_wchar    u1 = utf8_to_unicode((unsigned char *) src + srcoff);
     216      650416 :         int         u1len = unicode_utf8len(u1);
     217      650416 :         const pg_case_map *casemap = find_case_map(u1);
     218      650416 :         const pg_special_case *special = NULL;
     219             : 
     220      650416 :         if (str_casekind == CaseTitle)
     221             :         {
     222        1314 :             if (srcoff == boundary)
     223             :             {
     224         510 :                 chr_casekind = full ? CaseTitle : CaseUpper;
     225         510 :                 boundary = wbnext(wbstate);
     226             :             }
     227             :             else
     228         804 :                 chr_casekind = CaseLower;
     229             :         }
     230             : 
     231             :         /*
     232             :          * Find special case that matches the conditions, if any.
     233             :          *
     234             :          * Note: only a single special mapping per codepoint is currently
     235             :          * supported, though Unicode allows for multiple special mappings for
     236             :          * a single codepoint.
     237             :          */
     238      650416 :         if (full && casemap && casemap->special_case)
     239             :         {
     240         150 :             int16       conditions = casemap->special_case->conditions;
     241             : 
     242             :             Assert(casemap->special_case->codepoint == u1);
     243         150 :             if (check_special_conditions(conditions, src, srclen, srcoff))
     244         114 :                 special = casemap->special_case;
     245             :         }
     246             : 
     247             :         /* perform mapping, update result_len, and write to dst */
     248      650416 :         if (special)
     249             :         {
     250         282 :             for (int i = 0; i < MAX_CASE_EXPANSION; i++)
     251             :             {
     252         282 :                 pg_wchar    u2 = special->map[chr_casekind][i];
     253         282 :                 size_t      u2len = unicode_utf8len(u2);
     254             : 
     255         282 :                 if (u2 == '\0')
     256         114 :                     break;
     257             : 
     258         168 :                 if (result_len + u2len <= dstsize)
     259         168 :                     unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     260             : 
     261         168 :                 result_len += u2len;
     262             :             }
     263             :         }
     264      650302 :         else if (casemap)
     265             :         {
     266      650254 :             pg_wchar    u2 = casemap->simplemap[chr_casekind];
     267      650254 :             pg_wchar    u2len = unicode_utf8len(u2);
     268             : 
     269      650254 :             if (result_len + u2len <= dstsize)
     270      650206 :                 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     271             : 
     272      650254 :             result_len += u2len;
     273             :         }
     274             :         else
     275             :         {
     276             :             /* no mapping; copy bytes from src */
     277          48 :             if (result_len + u1len <= dstsize)
     278          48 :                 memcpy(dst + result_len, src + srcoff, u1len);
     279             : 
     280          48 :             result_len += u1len;
     281             :         }
     282             : 
     283      650416 :         srcoff += u1len;
     284             :     }
     285             : 
     286      328986 :     if (result_len < dstsize)
     287      328914 :         dst[result_len] = '\0';
     288             : 
     289      328986 :     return result_len;
     290             : }
     291             : 
     292             : /*
     293             :  * Check that the condition matches Final_Sigma, described in Unicode Table
     294             :  * 3-17. The character at the given offset must be directly preceded by a
     295             :  * Cased character, and must not be directly followed by a Cased character.
     296             :  *
     297             :  * Case_Ignorable characters are ignored. NB: some characters may be both
     298             :  * Cased and Case_Ignorable, in which case they are ignored.
     299             :  */
     300             : static bool
     301          60 : check_final_sigma(const unsigned char *str, size_t len, size_t offset)
     302             : {
     303             :     /* the start of the string is not preceded by a Cased character */
     304          60 :     if (offset == 0)
     305           6 :         return false;
     306             : 
     307             :     /* iterate backwards, looking for Cased character */
     308         144 :     for (int i = offset - 1; i >= 0; i--)
     309             :     {
     310         144 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     311             :         {
     312          78 :             pg_wchar    curr = utf8_to_unicode(str + i);
     313             : 
     314          78 :             if (pg_u_prop_case_ignorable(curr))
     315          24 :                 continue;
     316          54 :             else if (pg_u_prop_cased(curr))
     317          42 :                 break;
     318             :             else
     319          12 :                 return false;
     320             :         }
     321          66 :         else if ((str[i] & 0xC0) == 0x80)
     322          66 :             continue;
     323             : 
     324             :         Assert(false);          /* invalid UTF-8 */
     325             :     }
     326             : 
     327             :     /* end of string is not followed by a Cased character */
     328          42 :     if (offset == len)
     329           0 :         return true;
     330             : 
     331             :     /* iterate forwards, looking for Cased character */
     332         132 :     for (int i = offset + 1; i < len && str[i] != '\0'; i++)
     333             :     {
     334         114 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     335             :         {
     336          48 :             pg_wchar    curr = utf8_to_unicode(str + i);
     337             : 
     338          48 :             if (pg_u_prop_case_ignorable(curr))
     339          24 :                 continue;
     340          24 :             else if (pg_u_prop_cased(curr))
     341          18 :                 return false;
     342             :             else
     343           6 :                 break;
     344             :         }
     345          66 :         else if ((str[i] & 0xC0) == 0x80)
     346          66 :             continue;
     347             : 
     348             :         Assert(false);          /* invalid UTF-8 */
     349             :     }
     350             : 
     351          24 :     return true;
     352             : }
     353             : 
     354             : static bool
     355         150 : check_special_conditions(int conditions, const char *str, size_t len,
     356             :                          size_t offset)
     357             : {
     358         150 :     if (conditions == 0)
     359          90 :         return true;
     360          60 :     else if (conditions == PG_U_FINAL_SIGMA)
     361          60 :         return check_final_sigma((unsigned char *) str, len, offset);
     362             : 
     363             :     /* no other conditions supported */
     364             :     Assert(false);
     365           0 :     return false;
     366             : }
     367             : 
     368             : /* find entry in simple case map, if any */
     369             : static const pg_case_map *
     370      651472 : find_case_map(pg_wchar ucs)
     371             : {
     372             :     int         min;
     373             :     int         mid;
     374             :     int         max;
     375             : 
     376             :     /* all chars <= 0x80 are stored in array for fast lookup */
     377             :     Assert(lengthof(case_map) >= 0x80);
     378      651472 :     if (ucs < 0x80)
     379             :     {
     380      649210 :         const pg_case_map *map = &case_map[ucs];
     381             : 
     382             :         Assert(map->codepoint == ucs);
     383      649210 :         return map;
     384             :     }
     385             : 
     386             :     /* otherwise, binary search */
     387        2262 :     min = 0x80;
     388        2262 :     max = lengthof(case_map) - 1;
     389       20700 :     while (max >= min)
     390             :     {
     391       20652 :         mid = (min + max) / 2;
     392       20652 :         if (ucs > case_map[mid].codepoint)
     393        6774 :             min = mid + 1;
     394       13878 :         else if (ucs < case_map[mid].codepoint)
     395       11664 :             max = mid - 1;
     396             :         else
     397        2214 :             return &case_map[mid];
     398             :     }
     399             : 
     400          48 :     return NULL;
     401             : }

Generated by: LCOV version 1.14