LCOV - code coverage report
Current view: top level - src/common - unicode_case.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 55 58 94.8 %
Date: 2024-11-21 08:14:44 Functions: 7 8 87.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  * unicode_case.c
       3             :  *      Unicode case mapping and case conversion.
       4             :  *
       5             :  * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
       6             :  *
       7             :  * IDENTIFICATION
       8             :  *    src/common/unicode_case.c
       9             :  *
      10             :  *-------------------------------------------------------------------------
      11             :  */
      12             : #ifndef FRONTEND
      13             : #include "postgres.h"
      14             : #else
      15             : #include "postgres_fe.h"
      16             : #endif
      17             : 
      18             : #include "common/unicode_case.h"
      19             : #include "common/unicode_case_table.h"
      20             : #include "mb/pg_wchar.h"
      21             : 
      22             : static const pg_case_map *find_case_map(pg_wchar ucs);
      23             : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
      24             :                            CaseKind str_casekind, WordBoundaryNext wbnext,
      25             :                            void *wbstate);
      26             : 
      27             : pg_wchar
      28         372 : unicode_lowercase_simple(pg_wchar code)
      29             : {
      30         372 :     const pg_case_map *map = find_case_map(code);
      31             : 
      32         372 :     return map ? map->simplemap[CaseLower] : code;
      33             : }
      34             : 
      35             : pg_wchar
      36           0 : unicode_titlecase_simple(pg_wchar code)
      37             : {
      38           0 :     const pg_case_map *map = find_case_map(code);
      39             : 
      40           0 :     return map ? map->simplemap[CaseTitle] : code;
      41             : }
      42             : 
      43             : pg_wchar
      44         372 : unicode_uppercase_simple(pg_wchar code)
      45             : {
      46         372 :     const pg_case_map *map = find_case_map(code);
      47             : 
      48         372 :     return map ? map->simplemap[CaseUpper] : code;
      49             : }
      50             : 
      51             : /*
      52             :  * unicode_strlower()
      53             :  *
      54             :  * Convert src to lowercase, and return the result length (not including
      55             :  * terminating NUL).
      56             :  *
      57             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      58             :  * NUL-terminated.
      59             :  *
      60             :  * Result string is stored in dst, truncating if larger than dstsize. If
      61             :  * dstsize is greater than the result length, dst will be NUL-terminated;
      62             :  * otherwise not.
      63             :  *
      64             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      65             :  * required buffer size before allocating.
      66             :  */
      67             : size_t
      68        2046 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
      69             : {
      70        2046 :     return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
      71             : }
      72             : 
      73             : /*
      74             :  * unicode_strtitle()
      75             :  *
      76             :  * Convert src to titlecase, and return the result length (not including
      77             :  * terminating NUL).
      78             :  *
      79             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      80             :  * NUL-terminated.
      81             :  *
      82             :  * Result string is stored in dst, truncating if larger than dstsize. If
      83             :  * dstsize is greater than the result length, dst will be NUL-terminated;
      84             :  * otherwise not.
      85             :  *
      86             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      87             :  * required buffer size before allocating.
      88             :  *
      89             :  * Titlecasing requires knowledge about word boundaries, which is provided by
      90             :  * the callback wbnext. A word boundary is the offset of the start of a word
      91             :  * or the offset of the character immediately following a word.
      92             :  *
      93             :  * The caller is expected to initialize and free the callback state
      94             :  * wbstate. The callback should first return offset 0 for the first boundary;
      95             :  * then the offset of each subsequent word boundary; then the total length of
      96             :  * the string to indicate the final boundary.
      97             :  */
      98             : size_t
      99          86 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     100             :                  WordBoundaryNext wbnext, void *wbstate)
     101             : {
     102          86 :     return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
     103             :                         wbstate);
     104             : }
     105             : 
     106             : /*
     107             :  * unicode_strupper()
     108             :  *
     109             :  * Convert src to uppercase, and return the result length (not including
     110             :  * terminating NUL).
     111             :  *
     112             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     113             :  * NUL-terminated.
     114             :  *
     115             :  * Result string is stored in dst, truncating if larger than dstsize. If
     116             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     117             :  * otherwise not.
     118             :  *
     119             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     120             :  * required buffer size before allocating.
     121             :  */
     122             : size_t
     123      316786 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
     124             : {
     125      316786 :     return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
     126             : }
     127             : 
     128             : /*
     129             :  * If str_casekind is CaseLower or CaseUpper, map each character in the string
     130             :  * for which a mapping is available.
     131             :  *
     132             :  * If str_casekind is CaseTitle, map characters found on a word boundary to
     133             :  * uppercase and other characters to lowercase.
     134             :  */
     135             : static size_t
     136      318918 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     137             :              CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
     138             : {
     139             :     /* character CaseKind varies while titlecasing */
     140      318918 :     CaseKind    chr_casekind = str_casekind;
     141      318918 :     size_t      srcoff = 0;
     142      318918 :     size_t      result_len = 0;
     143      318918 :     size_t      boundary = 0;
     144             : 
     145             :     Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
     146             :            (str_casekind != CaseTitle && !wbnext && !wbstate));
     147             : 
     148      318918 :     if (str_casekind == CaseTitle)
     149             :     {
     150          86 :         boundary = wbnext(wbstate);
     151             :         Assert(boundary == 0);  /* start of text is always a boundary */
     152             :     }
     153             : 
     154      919206 :     while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
     155             :     {
     156      600288 :         pg_wchar    u1 = utf8_to_unicode((unsigned char *) src + srcoff);
     157      600288 :         int         u1len = unicode_utf8len(u1);
     158      600288 :         const pg_case_map *casemap = find_case_map(u1);
     159             : 
     160      600288 :         if (str_casekind == CaseTitle)
     161             :         {
     162         666 :             if (srcoff == boundary)
     163             :             {
     164         258 :                 chr_casekind = CaseUpper;
     165         258 :                 boundary = wbnext(wbstate);
     166             :             }
     167             :             else
     168         408 :                 chr_casekind = CaseLower;
     169             :         }
     170             : 
     171             :         /* perform mapping, update result_len, and write to dst */
     172      600288 :         if (casemap)
     173             :         {
     174      600192 :             pg_wchar    u2 = casemap->simplemap[chr_casekind];
     175      600192 :             pg_wchar    u2len = unicode_utf8len(u2);
     176             : 
     177      600192 :             if (result_len + u2len <= dstsize)
     178      600168 :                 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     179             : 
     180      600192 :             result_len += u2len;
     181             :         }
     182             :         else
     183             :         {
     184             :             /* no mapping; copy bytes from src */
     185          96 :             if (result_len + u1len <= dstsize)
     186          96 :                 memcpy(dst + result_len, src + srcoff, u1len);
     187             : 
     188          96 :             result_len += u1len;
     189             :         }
     190             : 
     191      600288 :         srcoff += u1len;
     192             :     }
     193             : 
     194      318918 :     if (result_len < dstsize)
     195      318882 :         dst[result_len] = '\0';
     196             : 
     197      318918 :     return result_len;
     198             : }
     199             : 
     200             : /* find entry in simple case map, if any */
     201             : static const pg_case_map *
     202      601032 : find_case_map(pg_wchar ucs)
     203             : {
     204             :     int         min;
     205             :     int         mid;
     206             :     int         max;
     207             : 
     208             :     /* all chars < 0x80 are stored in array for fast lookup */
     209             :     Assert(lengthof(case_map) >= 0x80);
     210      601032 :     if (ucs < 0x80)
     211             :     {
     212      600012 :         const pg_case_map *map = &case_map[ucs];
     213             : 
     214             :         Assert(map->codepoint == ucs);
     215      600012 :         return map;
     216             :     }
     217             : 
     218             :     /* otherwise, binary search */
     219        1020 :     min = 0x80;
     220        1020 :     max = lengthof(case_map) - 1;
     221       10536 :     while (max >= min)
     222             :     {
     223       10440 :         mid = (min + max) / 2;
     224       10440 :         if (ucs > case_map[mid].codepoint)
     225        3150 :             min = mid + 1;
     226        7290 :         else if (ucs < case_map[mid].codepoint)
     227        6366 :             max = mid - 1;
     228             :         else
     229         924 :             return &case_map[mid];
     230             :     }
     231             : 
     232          96 :     return NULL;
     233             : }

Generated by: LCOV version 1.14