LCOV - code coverage report
Current view: top level - src/common - unicode_case.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 55 58 94.8 %
Date: 2024-04-29 19:11:25 Functions: 7 8 87.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  * unicode_case.c
       3             :  *      Unicode case mapping and case conversion.
       4             :  *
       5             :  * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
       6             :  *
       7             :  * IDENTIFICATION
       8             :  *    src/common/unicode_case.c
       9             :  *
      10             :  *-------------------------------------------------------------------------
      11             :  */
      12             : #ifndef FRONTEND
      13             : #include "postgres.h"
      14             : #else
      15             : #include "postgres_fe.h"
      16             : #endif
      17             : 
      18             : #include "common/unicode_case.h"
      19             : #include "common/unicode_case_table.h"
      20             : #include "common/unicode_category.h"
      21             : #include "mb/pg_wchar.h"
      22             : 
      23             : static const pg_case_map *find_case_map(pg_wchar ucs);
      24             : static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
      25             :                            CaseKind str_casekind, WordBoundaryNext wbnext,
      26             :                            void *wbstate);
      27             : 
      28             : pg_wchar
      29         372 : unicode_lowercase_simple(pg_wchar code)
      30             : {
      31         372 :     const       pg_case_map *map = find_case_map(code);
      32             : 
      33         372 :     return map ? map->simplemap[CaseLower] : code;
      34             : }
      35             : 
      36             : pg_wchar
      37           0 : unicode_titlecase_simple(pg_wchar code)
      38             : {
      39           0 :     const       pg_case_map *map = find_case_map(code);
      40             : 
      41           0 :     return map ? map->simplemap[CaseTitle] : code;
      42             : }
      43             : 
      44             : pg_wchar
      45         372 : unicode_uppercase_simple(pg_wchar code)
      46             : {
      47         372 :     const       pg_case_map *map = find_case_map(code);
      48             : 
      49         372 :     return map ? map->simplemap[CaseUpper] : code;
      50             : }
      51             : 
      52             : /*
      53             :  * unicode_strlower()
      54             :  *
      55             :  * Convert src to lowercase, and return the result length (not including
      56             :  * terminating NUL).
      57             :  *
      58             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      59             :  * NUL-terminated.
      60             :  *
      61             :  * Result string is stored in dst, truncating if larger than dstsize. If
      62             :  * dstsize is greater than the result length, dst will be NUL-terminated;
      63             :  * otherwise not.
      64             :  *
      65             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      66             :  * required buffer size before allocating.
      67             :  */
      68             : size_t
      69        2014 : unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
      70             : {
      71        2014 :     return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
      72             : }
      73             : 
      74             : /*
      75             :  * unicode_strtitle()
      76             :  *
      77             :  * Convert src to titlecase, and return the result length (not including
      78             :  * terminating NUL).
      79             :  *
      80             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
      81             :  * NUL-terminated.
      82             :  *
      83             :  * Result string is stored in dst, truncating if larger than dstsize. If
      84             :  * dstsize is greater than the result length, dst will be NUL-terminated;
      85             :  * otherwise not.
      86             :  *
      87             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
      88             :  * required buffer size before allocating.
      89             :  *
      90             :  * Titlecasing requires knowledge about word boundaries, which is provided by
      91             :  * the callback wbnext. A word boundary is the offset of the start of a word
      92             :  * or the offset of the character immediately following a word.
      93             :  *
      94             :  * The caller is expected to initialize and free the callback state
      95             :  * wbstate. The callback should first return offset 0 for the first boundary;
      96             :  * then the offset of each subsequent word boundary; then the total length of
      97             :  * the string to indicate the final boundary.
      98             :  */
      99             : size_t
     100          86 : unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     101             :                  WordBoundaryNext wbnext, void *wbstate)
     102             : {
     103          86 :     return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
     104             :                         wbstate);
     105             : }
     106             : 
     107             : /*
     108             :  * unicode_strupper()
     109             :  *
     110             :  * Convert src to uppercase, and return the result length (not including
     111             :  * terminating NUL).
     112             :  *
     113             :  * String src must be encoded in UTF-8. If srclen < 0, src must be
     114             :  * NUL-terminated.
     115             :  *
     116             :  * Result string is stored in dst, truncating if larger than dstsize. If
     117             :  * dstsize is greater than the result length, dst will be NUL-terminated;
     118             :  * otherwise not.
     119             :  *
     120             :  * If dstsize is zero, dst may be NULL. This is useful for calculating the
     121             :  * required buffer size before allocating.
     122             :  */
     123             : size_t
     124      316786 : unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
     125             : {
     126      316786 :     return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
     127             : }
     128             : 
     129             : /*
     130             :  * If str_casekind is CaseLower or CaseUpper, map each character in the string
     131             :  * for which a mapping is available.
     132             :  *
     133             :  * If str_casekind is CaseTitle, map characters found on a word boundary to
     134             :  * uppercase and other characters to lowercase.
     135             :  */
     136             : static size_t
     137      318886 : convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
     138             :              CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
     139             : {
     140             :     /* character CaseKind varies while titlecasing */
     141      318886 :     CaseKind    chr_casekind = str_casekind;
     142      318886 :     size_t      srcoff = 0;
     143      318886 :     size_t      result_len = 0;
     144      318886 :     size_t      boundary = 0;
     145             : 
     146             :     Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
     147             :            (str_casekind != CaseTitle && !wbnext && !wbstate));
     148             : 
     149      318886 :     if (str_casekind == CaseTitle)
     150             :     {
     151          86 :         boundary = wbnext(wbstate);
     152             :         Assert(boundary == 0);  /* start of text is always a boundary */
     153             :     }
     154             : 
     155      918522 :     while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
     156             :     {
     157      599636 :         pg_wchar    u1 = utf8_to_unicode((unsigned char *) src + srcoff);
     158      599636 :         int         u1len = unicode_utf8len(u1);
     159      599636 :         const       pg_case_map *casemap = find_case_map(u1);
     160             : 
     161      599636 :         if (str_casekind == CaseTitle)
     162             :         {
     163         666 :             if (srcoff == boundary)
     164             :             {
     165         258 :                 chr_casekind = CaseUpper;
     166         258 :                 boundary = wbnext(wbstate);
     167             :             }
     168             :             else
     169         408 :                 chr_casekind = CaseLower;
     170             :         }
     171             : 
     172             :         /* perform mapping, update result_len, and write to dst */
     173      599636 :         if (casemap)
     174             :         {
     175      599540 :             pg_wchar    u2 = casemap->simplemap[chr_casekind];
     176      599540 :             pg_wchar    u2len = unicode_utf8len(u2);
     177             : 
     178      599540 :             if (result_len + u2len <= dstsize)
     179      599516 :                 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
     180             : 
     181      599540 :             result_len += u2len;
     182             :         }
     183             :         else
     184             :         {
     185             :             /* no mapping; copy bytes from src */
     186          96 :             if (result_len + u1len <= dstsize)
     187          96 :                 memcpy(dst + result_len, src + srcoff, u1len);
     188             : 
     189          96 :             result_len += u1len;
     190             :         }
     191             : 
     192      599636 :         srcoff += u1len;
     193             :     }
     194             : 
     195      318886 :     if (result_len < dstsize)
     196      318850 :         dst[result_len] = '\0';
     197             : 
     198      318886 :     return result_len;
     199             : }
     200             : 
     201             : /* find entry in simple case map, if any */
     202             : static const pg_case_map *
     203      600380 : find_case_map(pg_wchar ucs)
     204             : {
     205             :     int         min;
     206             :     int         mid;
     207             :     int         max;
     208             : 
     209             :     /* all chars < 0x80 are stored in array for fast lookup */
     210             :     Assert(lengthof(case_map) >= 0x80);
     211      600380 :     if (ucs < 0x80)
     212             :     {
     213      599360 :         const       pg_case_map *map = &case_map[ucs];
     214             : 
     215             :         Assert(map->codepoint == ucs);
     216      599360 :         return map;
     217             :     }
     218             : 
     219             :     /* otherwise, binary search */
     220        1020 :     min = 0x80;
     221        1020 :     max = lengthof(case_map) - 1;
     222       10536 :     while (max >= min)
     223             :     {
     224       10440 :         mid = (min + max) / 2;
     225       10440 :         if (ucs > case_map[mid].codepoint)
     226        3150 :             min = mid + 1;
     227        7290 :         else if (ucs < case_map[mid].codepoint)
     228        6366 :             max = mid - 1;
     229             :         else
     230         924 :             return &case_map[mid];
     231             :     }
     232             : 
     233          96 :     return NULL;
     234             : }

Generated by: LCOV version 1.14