LCOV - PostgreSQL 19devel - src/common/unicode

LCOV - code coverage report

Current view:	top level - src/common - unicode_case.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 19devel	Lines:	104	112	92.9 %
Date:	2025-08-15 00:17:59	Functions:	11	13	84.6 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  * unicode_case.c
       3             :  *      Unicode case mapping and case conversion.
       4             :  *
       5             :  * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
       6             :  *
       7             :  * IDENTIFICATION
       8             :  *    src/common/unicode_case.c
       9             :  *
      10             :  *-------------------------------------------------------------------------
      11             :  */
      12             : #ifndef FRONTEND
      13             : #include "postgres.h"
      14             : #else
      15             : #include "postgres_fe.h"
      16             : #endif
      17             : 
      18             : #include "common/unicode_case.h"
      19             : #include "common/unicode_case_table.h"
      20             : #include "common/unicode_category.h"
      21             : #include "mb/pg_wchar.h"
      22             : 
      23             : enum CaseMapResult
      24             : {
      25             :     CASEMAP_SELF,
      26             :     CASEMAP_SIMPLE,
      27             :     CASEMAP_SPECIAL,
      28             : };
      29             : 
      30             : /*
      31             :  * Map for each case kind.
      32             :  */
      33             : static const pg_wchar *const casekind_map[NCaseKind] =
      34             : {
      35             :     [CaseLower] = case_map_lower,
      36             :     [CaseTitle] = case_map_title,
      37             :     [CaseUpper] = case_map_upper,
      38             :     [CaseFold] = case_map_fold,
      39             : };
      40             : 
      41             : static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
      42             : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
      43             :                            CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
      44             :                            void *wbstate);
      45             : static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
      46             :                                   const char *src, size_t srclen, size_t srcoff,
      47             :                                   pg_wchar *simple, const pg_wchar **special);
      48             : 
      49             : pg_wchar
      50         528 : unicode_lowercase_simple(pg_wchar code)
      51             : {
      52         528 :     pg_wchar    cp = find_case_map(code, case_map_lower);
      53             : 
      54         528 :     return cp != 0 ? cp : code;
      55             : }
      56             : 
      57             : pg_wchar
      58           0 : unicode_titlecase_simple(pg_wchar code)
      59             : {
      60           0 :     pg_wchar    cp = find_case_map(code, case_map_title);
      61             : 
      62           0 :     return cp != 0 ? cp : code;
      63             : }
      64             : 
      65             : pg_wchar
      66         528 : unicode_uppercase_simple(pg_wchar code)
      67             : {
      68         528 :     pg_wchar    cp = find_case_map(code, case_map_upper);
      69             : 
      70         528 :     return cp != 0 ? cp : code;
      71             : }
      72             : 
      73             : pg_wchar
      74           0 : unicode_casefold_simple(pg_wchar code)
      75             : {
      76           0 :     pg_wchar    cp = find_case_map(code, case_map_fold);
      77             : 
      78           0 :     return cp != 0 ? cp : code;
      79             : }
      80             : 
      81             : /*
      82             :  * unicode_strlower()
      83             :  *
      84             :  * Convert src to lowercase, and return the result length (not including
      85             :  * terminating NUL).
      86             :  *
      87             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      88             :  * NUL-terminated.
      89             :  *
      90             :  * Result string is stored in dst, truncating if larger than dstsize. If
      91             :  * dstsize is greater than the result length, dst will be NUL-terminated;
      92             :  * otherwise not.
      93             :  *
      94             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      95             :  * required buffer size before allocating.
      96             :  *
      97             :  * If full is true, use special case mappings if available and if the
      98             :  * conditions are satisfied.
      99             :  */
     100             : size_t
     101       12006 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     102             :                  bool full)
     103             : {
     104       12006 :     return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
     105             :                         NULL);
     106             : }
     107             : 
     108             : /*
     109             :  * unicode_strtitle()
     110             :  *
     111             :  * Convert src to titlecase, and return the result length (not including
     112             :  * terminating NUL).
     113             :  *
     114             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     115             :  * NUL-terminated.
     116             :  *
     117             :  * Result string is stored in dst, truncating if larger than dstsize. If
     118             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     119             :  * otherwise not.
     120             :  *
     121             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     122             :  * required buffer size before allocating.
     123             :  *
     124             :  * If full is true, use special case mappings if available and if the
     125             :  * conditions are satisfied. Otherwise, use only simple mappings and use
     126             :  * uppercase instead of titlecase.
     127             :  *
     128             :  * Titlecasing requires knowledge about word boundaries, which is provided by
     129             :  * the callback wbnext. A word boundary is the offset of the start of a word
     130             :  * or the offset of the character immediately following a word.
     131             :  *
     132             :  * The caller is expected to initialize and free the callback state
     133             :  * wbstate. The callback should first return offset 0 for the first boundary;
     134             :  * then the offset of each subsequent word boundary; then the total length of
     135             :  * the string to indicate the final boundary.
     136             :  */
     137             : size_t
     138         194 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     139             :                  bool full, WordBoundaryNext wbnext, void *wbstate)
     140             : {
     141         194 :     return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
     142             :                         wbstate);
     143             : }
     144             : 
     145             : /*
     146             :  * unicode_strupper()
     147             :  *
     148             :  * Convert src to uppercase, and return the result length (not including
     149             :  * terminating NUL).
     150             :  *
     151             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     152             :  * NUL-terminated.
     153             :  *
     154             :  * Result string is stored in dst, truncating if larger than dstsize. If
     155             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     156             :  * otherwise not.
     157             :  *
     158             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     159             :  * required buffer size before allocating.
     160             :  *
     161             :  * If full is true, use special case mappings if available and if the
     162             :  * conditions are satisfied.
     163             :  */
     164             : size_t
     165      316882 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     166             :                  bool full)
     167             : {
     168      316882 :     return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
     169             :                         NULL);
     170             : }
     171             : 
     172             : /*
     173             :  * unicode_strfold()
     174             :  *
     175             :  * Case fold src, and return the result length (not including terminating
     176             :  * NUL).
     177             :  *
     178             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     179             :  * NUL-terminated.
     180             :  *
     181             :  * Result string is stored in dst, truncating if larger than dstsize. If
     182             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     183             :  * otherwise not.
     184             :  *
     185             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     186             :  * required buffer size before allocating.
     187             :  */
     188             : size_t
     189          12 : unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     190             :                 bool full)
     191             : {
     192          12 :     return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
     193             :                         NULL);
     194             : }
     195             : 
     196             : /*
     197             :  * Implement Unicode Default Case Conversion algorithm.
     198             :  *
     199             :  * If str_casekind is CaseLower or CaseUpper, map each character in the string
     200             :  * for which a mapping is available.
     201             :  *
     202             :  * If str_casekind is CaseTitle, maps characters found on a word boundary to
     203             :  * titlecase (or uppercase if full is false) and other characters to
     204             :  * lowercase. NB: does not currently implement the Unicode behavior in which
     205             :  * the word boundary is adjusted to the next Cased character. That behavior
     206             :  * could be implemented as an option, but it doesn't match the default
     207             :  * behavior of ICU, nor does it match the documented behavior of INITCAP().
     208             :  *
     209             :  * If full is true, use special mappings for relevant characters, which can
     210             :  * map a single codepoint to multiple codepoints, or depend on conditions.
     211             :  */
     212             : static size_t
     213      329094 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     214             :              CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
     215             :              void *wbstate)
     216             : {
     217             :     /* character CaseKind varies while titlecasing */
     218      329094 :     CaseKind    chr_casekind = str_casekind;
     219      329094 :     size_t      srcoff = 0;
     220      329094 :     size_t      result_len = 0;
     221      329094 :     size_t      boundary = 0;
     222             : 
     223             :     Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
     224             :            (str_casekind != CaseTitle && !wbnext && !wbstate));
     225             : 
     226      329094 :     if (str_casekind == CaseTitle)
     227             :     {
     228         194 :         boundary = wbnext(wbstate);
     229             :         Assert(boundary == 0);  /* start of text is always a boundary */
     230             :     }
     231             : 
     232      980714 :     while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
     233             :     {
     234      651620 :         pg_wchar    u1 = utf8_to_unicode((unsigned char *) src + srcoff);
     235      651620 :         int         u1len = unicode_utf8len(u1);
     236      651620 :         pg_wchar    simple = 0;
     237      651620 :         const pg_wchar *special = NULL;
     238             :         enum CaseMapResult casemap_result;
     239             : 
     240      651620 :         if (str_casekind == CaseTitle)
     241             :         {
     242        1506 :             if (srcoff == boundary)
     243             :             {
     244         630 :                 chr_casekind = full ? CaseTitle : CaseUpper;
     245         630 :                 boundary = wbnext(wbstate);
     246             :             }
     247             :             else
     248         876 :                 chr_casekind = CaseLower;
     249             :         }
     250             : 
     251      651620 :         casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
     252             :                                  &simple, &special);
     253             : 
     254      651620 :         switch (casemap_result)
     255             :         {
     256         120 :             case CASEMAP_SELF:
     257             :                 /* no mapping; copy bytes from src */
     258             :                 Assert(simple == 0);
     259             :                 Assert(special == NULL);
     260         120 :                 if (result_len + u1len <= dstsize)
     261         120 :                     memcpy(dst + result_len, src + srcoff, u1len);
     262             : 
     263         120 :                 result_len += u1len;
     264         120 :                 break;
     265      651386 :             case CASEMAP_SIMPLE:
     266             :                 {
     267             :                     /* replace with single character */
     268      651386 :                     pg_wchar    u2 = simple;
     269      651386 :                     pg_wchar    u2len = unicode_utf8len(u2);
     270             : 
     271             :                     Assert(special == NULL);
     272      651386 :                     if (result_len + u2len <= dstsize)
     273      651338 :                         unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     274             : 
     275      651386 :                     result_len += u2len;
     276             :                 }
     277      651386 :                 break;
     278         114 :             case CASEMAP_SPECIAL:
     279             :                 /* replace with up to MAX_CASE_EXPANSION characters */
     280             :                 Assert(simple == 0);
     281         282 :                 for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
     282             :                 {
     283         168 :                     pg_wchar    u2 = special[i];
     284         168 :                     size_t      u2len = unicode_utf8len(u2);
     285             : 
     286         168 :                     if (result_len + u2len <= dstsize)
     287         168 :                         unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     288             : 
     289         168 :                     result_len += u2len;
     290             :                 }
     291         114 :                 break;
     292             :         }
     293             : 
     294      651620 :         srcoff += u1len;
     295             :     }
     296             : 
     297      329094 :     if (result_len < dstsize)
     298      329022 :         dst[result_len] = '\0';
     299             : 
     300      329094 :     return result_len;
     301             : }
     302             : 
     303             : /*
     304             :  * Check that the condition matches Final_Sigma, described in Unicode Table
     305             :  * 3-17. The character at the given offset must be directly preceded by a
     306             :  * Cased character, and must not be directly followed by a Cased character.
     307             :  *
     308             :  * Case_Ignorable characters are ignored. NB: some characters may be both
     309             :  * Cased and Case_Ignorable, in which case they are ignored.
     310             :  */
     311             : static bool
     312          60 : check_final_sigma(const unsigned char *str, size_t len, size_t offset)
     313             : {
     314             :     /* the start of the string is not preceded by a Cased character */
     315          60 :     if (offset == 0)
     316           6 :         return false;
     317             : 
     318             :     /* iterate backwards, looking for Cased character */
     319         144 :     for (int i = offset - 1; i >= 0; i--)
     320             :     {
     321         144 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     322             :         {
     323          78 :             pg_wchar    curr = utf8_to_unicode(str + i);
     324             : 
     325          78 :             if (pg_u_prop_case_ignorable(curr))
     326          24 :                 continue;
     327          54 :             else if (pg_u_prop_cased(curr))
     328          42 :                 break;
     329             :             else
     330          12 :                 return false;
     331             :         }
     332          66 :         else if ((str[i] & 0xC0) == 0x80)
     333          66 :             continue;
     334             : 
     335             :         Assert(false);          /* invalid UTF-8 */
     336             :     }
     337             : 
     338             :     /* end of string is not followed by a Cased character */
     339          42 :     if (offset == len)
     340           0 :         return true;
     341             : 
     342             :     /* iterate forwards, looking for Cased character */
     343         132 :     for (int i = offset + 1; i < len && str[i] != '\0'; i++)
     344             :     {
     345         114 :         if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
     346             :         {
     347          48 :             pg_wchar    curr = utf8_to_unicode(str + i);
     348             : 
     349          48 :             if (pg_u_prop_case_ignorable(curr))
     350          24 :                 continue;
     351          24 :             else if (pg_u_prop_cased(curr))
     352          18 :                 return false;
     353             :             else
     354           6 :                 break;
     355             :         }
     356          66 :         else if ((str[i] & 0xC0) == 0x80)
     357          66 :             continue;
     358             : 
     359             :         Assert(false);          /* invalid UTF-8 */
     360             :     }
     361             : 
     362          24 :     return true;
     363             : }
     364             : 
     365             : /*
     366             :  * Unicode allows for special casing to be applied only under certain
     367             :  * circumstances. The only currently-supported condition is Final_Sigma.
     368             :  */
     369             : static bool
     370         150 : check_special_conditions(int conditions, const char *str, size_t len,
     371             :                          size_t offset)
     372             : {
     373         150 :     if (conditions == 0)
     374          90 :         return true;
     375          60 :     else if (conditions == PG_U_FINAL_SIGMA)
     376          60 :         return check_final_sigma((unsigned char *) str, len, offset);
     377             : 
     378             :     /* no other conditions supported */
     379             :     Assert(false);
     380           0 :     return false;
     381             : }
     382             : 
     383             : /*
     384             :  * Map the given character to the requested case.
     385             :  *
     386             :  * If full is true, and a special case mapping is found and the conditions are
     387             :  * met, 'special' is set to the mapping result (which is an array of up to
     388             :  * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
     389             :  *
     390             :  * Otherwise, search for a simple mapping, and if found, set 'simple' to the
     391             :  * result and return CASEMAP_SIMPLE.
     392             :  *
     393             :  * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
     394             :  * character without modification.
     395             :  */
     396             : static enum CaseMapResult
     397      651620 : casemap(pg_wchar u1, CaseKind casekind, bool full,
     398             :         const char *src, size_t srclen, size_t srcoff,
     399             :         pg_wchar *simple, const pg_wchar **special)
     400             : {
     401             :     uint16      idx;
     402             : 
     403             :     /* Fast path for codepoints < 0x80 */
     404      651620 :     if (u1 < 0x80)
     405             :     {
     406             :         /*
     407             :          * The first elements in all tables are reserved as 0 (as NULL). The
     408             :          * data starts at index 1, not 0.
     409             :          */
     410      649574 :         *simple = casekind_map[casekind][u1 + 1];
     411             : 
     412      649574 :         return CASEMAP_SIMPLE;
     413             :     }
     414             : 
     415        2046 :     idx = case_index(u1);
     416             : 
     417        2046 :     if (idx == 0)
     418         120 :         return CASEMAP_SELF;
     419             : 
     420        2076 :     if (full && case_map_special[idx] &&
     421         150 :         check_special_conditions(special_case[case_map_special[idx]].conditions,
     422             :                                  src, srclen, srcoff))
     423             :     {
     424         114 :         *special = special_case[case_map_special[idx]].map[casekind];
     425         114 :         return CASEMAP_SPECIAL;
     426             :     }
     427             : 
     428        1812 :     *simple = casekind_map[casekind][idx];
     429             : 
     430        1812 :     return CASEMAP_SIMPLE;
     431             : }
     432             : 
     433             : /*
     434             :  * Find entry in simple case map.
     435             :  * If the entry does not exist, 0 will be returned.
     436             :  */
     437             : static pg_wchar
     438        1056 : find_case_map(pg_wchar ucs, const pg_wchar *map)
     439             : {
     440             :     /* Fast path for codepoints < 0x80 */
     441        1056 :     if (ucs < 0x80)
     442             :         /* The first elements in all tables are reserved as 0 (as NULL). */
     443         624 :         return map[ucs + 1];
     444         432 :     return map[case_index(ucs)];
     445             : }

Generated by: LCOV version 1.16