LCOV - PostgreSQL 19devel - src/common/unicode

LCOV - code coverage report

Current view:	top level - src/common - unicode_category.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 19devel	Lines:	75	235	31.9 %
Date:	2025-12-24 10:18:01	Functions:	15	25	60.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  * unicode_category.c
       3             :  *      Determine general category and character properties of Unicode
       4             :  *      characters. Encoding must be UTF8, where we assume that the char32_t
       5             :  *      representation is a code point.
       6             :  *
       7             :  * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/common/unicode_category.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : #ifndef FRONTEND
      15             : #include "postgres.h"
      16             : #else
      17             : #include "postgres_fe.h"
      18             : #endif
      19             : 
      20             : #include "common/unicode_category.h"
      21             : #include "common/unicode_category_table.h"
      22             : 
      23             : /*
      24             :  * Create bitmasks from pg_unicode_category values for efficient comparison of
      25             :  * multiple categories. For instance, PG_U_MN_MASK is a bitmask representing
      26             :  * the general category Mn; and PG_U_M_MASK represents general categories Mn,
      27             :  * Me, and Mc.
      28             :  *
      29             :  * The number of Unicode General Categories should never grow, so a 32-bit
      30             :  * mask is fine.
                     :  *
                     :  * PG_U_CATEGORY_MASK(X) yields a uint32 with only bit number X set. Each
                     :  * mask built directly from it (PG_U_LU_MASK, PG_U_MN_MASK, ...) covers a
                     :  * single category. The one-letter masks (PG_U_L_MASK, PG_U_M_MASK,
                     :  * PG_U_N_MASK, PG_U_P_MASK, PG_U_S_MASK, PG_U_Z_MASK, PG_U_C_MASK) are the
                     :  * OR of the single-category masks sharing that first letter. PG_U_LC_MASK
                     :  * is also a composite: it combines Lu, Ll and Lt only.
                     :  *
                     :  * NOTE(review): the shift is evaluated on a signed int before the cast to
                     :  * uint32, so this presumably relies on every category value being below
                     :  * 31 -- confirm against the enum in unicode_category.h.
      31             :  */
      32             : #define PG_U_CATEGORY_MASK(X) ((uint32)(1 << (X)))
      33             : 
      34             : #define PG_U_LU_MASK PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
      35             : #define PG_U_LL_MASK PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
      36             : #define PG_U_LT_MASK PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
      37             : #define PG_U_LC_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK)
      38             : #define PG_U_LM_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
      39             : #define PG_U_LO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
      40             : #define PG_U_L_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK|PG_U_LM_MASK|\
      41             :                      PG_U_LO_MASK)
      42             : #define PG_U_MN_MASK PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
      43             : #define PG_U_ME_MASK PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
      44             : #define PG_U_MC_MASK PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
      45             : #define PG_U_M_MASK (PG_U_MN_MASK|PG_U_MC_MASK|PG_U_ME_MASK)
      46             : #define PG_U_ND_MASK PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
      47             : #define PG_U_NL_MASK PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
      48             : #define PG_U_NO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
      49             : #define PG_U_N_MASK (PG_U_ND_MASK|PG_U_NL_MASK|PG_U_NO_MASK)
      50             : #define PG_U_PC_MASK PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
      51             : #define PG_U_PD_MASK PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
      52             : #define PG_U_PS_MASK PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
      53             : #define PG_U_PE_MASK PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
      54             : #define PG_U_PI_MASK PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
      55             : #define PG_U_PF_MASK PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
      56             : #define PG_U_PO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
      57             : #define PG_U_P_MASK (PG_U_PC_MASK|PG_U_PD_MASK|PG_U_PS_MASK|PG_U_PE_MASK|\
      58             :                      PG_U_PI_MASK|PG_U_PF_MASK|PG_U_PO_MASK)
      59             : #define PG_U_SM_MASK PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
      60             : #define PG_U_SC_MASK PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
      61             : #define PG_U_SK_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
      62             : #define PG_U_SO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
      63             : #define PG_U_S_MASK (PG_U_SM_MASK|PG_U_SC_MASK|PG_U_SK_MASK|PG_U_SO_MASK)
      64             : #define PG_U_ZS_MASK PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
      65             : #define PG_U_ZL_MASK PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
      66             : #define PG_U_ZP_MASK PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
      67             : #define PG_U_Z_MASK (PG_U_ZS_MASK|PG_U_ZL_MASK|PG_U_ZP_MASK)
      68             : #define PG_U_CC_MASK PG_U_CATEGORY_MASK(PG_U_CONTROL)
      69             : #define PG_U_CF_MASK PG_U_CATEGORY_MASK(PG_U_FORMAT)
      70             : #define PG_U_CS_MASK PG_U_CATEGORY_MASK(PG_U_SURROGATE)
      71             : #define PG_U_CO_MASK PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
      72             : #define PG_U_CN_MASK PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
      73             : #define PG_U_C_MASK (PG_U_CC_MASK|PG_U_CF_MASK|PG_U_CS_MASK|PG_U_CO_MASK|\
      74             :                      PG_U_CN_MASK)
      75             : 
      76             : #define PG_U_CHARACTER_TAB  0x09	/* U+0009; compared against in pg_u_isblank() */
      77             : /* Forward declaration; range_search() is defined later in this file. */
      78             : static bool range_search(const pg_unicode_range *tbl, size_t size,
      79             :                          char32_t code);
      80             : 
      81             : /*
      82             :  * Unicode general category for the given codepoint.
                     :  *
                     :  * Codepoints below 0x80 are answered by a direct lookup in
                     :  * unicode_opt_ascii[]. Any other codepoint is located by binary search
                     :  * over unicode_categories[], whose entries each describe an inclusive
                     :  * [first, last] range; the search relies on those entries being sorted
                     :  * and non-overlapping. A codepoint covered by no entry is reported as
                     :  * PG_U_UNASSIGNED.
                     :  *
                     :  * The argument must not exceed 0x10ffff; this is checked by Assert only.
      83             :  */
      84             : pg_unicode_category
      85       30948 : unicode_category(char32_t code)
      86             : {
      87       30948 :     int         min = 0;
      88             :     int         mid;
      89       30948 :     int         max = lengthof(unicode_categories) - 1;
      90             : 
      91             :     Assert(code <= 0x10ffff);
      92             : 
      93       30948 :     if (code < 0x80)	/* ASCII: table indexed directly by codepoint */
      94        2640 :         return unicode_opt_ascii[code].category;
      95             : 
      96      305046 :     while (max >= min)	/* [min, max] is the inclusive search window */
      97             :     {
      98      303672 :         mid = (min + max) / 2;
      99      303672 :         if (code > unicode_categories[mid].last)
     100      123942 :             min = mid + 1;
     101      179730 :         else if (code < unicode_categories[mid].first)
     102      152796 :             max = mid - 1;
     103             :         else
     104       26934 :             return unicode_categories[mid].category;
     105             :     }
     106             : 
     107        1374 :     return PG_U_UNASSIGNED;	/* no range contains the codepoint */
     108             : }
     109             : /*
                     :  * Test whether the codepoint is listed as Alphabetic.
                     :  *
                     :  * Codepoints below 0x80 are answered from the PG_U_PROP_ALPHABETIC bit of
                     :  * unicode_opt_ascii[]; all others by range_search() over
                     :  * unicode_alphabetic[].
                     :  */
     110             : bool
     111       94666 : pg_u_prop_alphabetic(char32_t code)
     112             : {
     113       94666 :     if (code < 0x80)
     114       44170 :         return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
     115             : 
     116       50496 :     return range_search(unicode_alphabetic,
     117             :                         lengthof(unicode_alphabetic),
     118             :                         code);
     119             : }
     120             : /*
                     :  * Test whether the codepoint is listed as Lowercase.
                     :  *
                     :  * Codepoints below 0x80 are answered from the PG_U_PROP_LOWERCASE bit of
                     :  * unicode_opt_ascii[]; all others by range_search() over
                     :  * unicode_lowercase[].
                     :  */
     121             : bool
     122          60 : pg_u_prop_lowercase(char32_t code)
     123             : {
     124          60 :     if (code < 0x80)
     125           0 :         return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
     126             : 
     127          60 :     return range_search(unicode_lowercase,
     128             :                         lengthof(unicode_lowercase),
     129             :                         code);
     130             : }
     131             : /*
                     :  * Test whether the codepoint is listed as Uppercase.
                     :  *
                     :  * Codepoints below 0x80 are answered from the PG_U_PROP_UPPERCASE bit of
                     :  * unicode_opt_ascii[]; all others by range_search() over
                     :  * unicode_uppercase[].
                     :  */
     132             : bool
     133       24636 : pg_u_prop_uppercase(char32_t code)
     134             : {
     135       24636 :     if (code < 0x80)
     136        1536 :         return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
     137             : 
     138       23100 :     return range_search(unicode_uppercase,
     139             :                         lengthof(unicode_uppercase),
     140             :                         code);
     141             : }
     142             : /*
                     :  * Test whether the codepoint is cased.
                     :  *
                     :  * Codepoints below 0x80 are answered from the PG_U_PROP_CASED bit of
                     :  * unicode_opt_ascii[]. Any other codepoint counts as cased when its
                     :  * general category is Lt (titlecase letter), or when
                     :  * pg_u_prop_lowercase() or pg_u_prop_uppercase() accepts it.
                     :  */
     143             : bool
     144          78 : pg_u_prop_cased(char32_t code)
     145             : {
     146             :     uint32      category_mask;
     147             : 
     148          78 :     if (code < 0x80)
     149          18 :         return unicode_opt_ascii[code].properties & PG_U_PROP_CASED;
     150             : 
     151          60 :     category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     152             : 
     153          60 :     return category_mask & PG_U_LT_MASK ||
     154         120 :         pg_u_prop_lowercase(code) ||
     155          60 :         pg_u_prop_uppercase(code);
     156             : }
     157             : /*
                     :  * Test whether the codepoint is listed as Case_Ignorable.
                     :  *
                     :  * Codepoints below 0x80 are answered from the PG_U_PROP_CASE_IGNORABLE bit
                     :  * of unicode_opt_ascii[]; all others by range_search() over
                     :  * unicode_case_ignorable[].
                     :  */
     158             : bool
     159         126 : pg_u_prop_case_ignorable(char32_t code)
     160             : {
     161         126 :     if (code < 0x80)
     162          18 :         return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
     163             : 
     164         108 :     return range_search(unicode_case_ignorable,
     165             :                         lengthof(unicode_case_ignorable),
     166             :                         code);
     167             : }
     168             : /*
                     :  * Test whether the codepoint is listed as White_Space.
                     :  *
                     :  * Codepoints below 0x80 are answered from the PG_U_PROP_WHITE_SPACE bit of
                     :  * unicode_opt_ascii[]; all others by range_search() over
                     :  * unicode_white_space[].
                     :  */
     169             : bool
     170       16624 : pg_u_prop_white_space(char32_t code)
     171             : {
     172       16624 :     if (code < 0x80)
     173        1264 :         return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
     174             : 
     175       15360 :     return range_search(unicode_white_space,
     176             :                         lengthof(unicode_white_space),
     177             :                         code);
     178             : }
     179             : /*
                     :  * Test whether the codepoint is listed as Hex_Digit.
                     :  *
                     :  * Codepoints below 0x80 are answered from the PG_U_PROP_HEX_DIGIT bit of
                     :  * unicode_opt_ascii[]; all others by range_search() over
                     :  * unicode_hex_digit[].
                     :  */
     180             : bool
     181           0 : pg_u_prop_hex_digit(char32_t code)
     182             : {
     183           0 :     if (code < 0x80)
     184           0 :         return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
     185             : 
     186           0 :     return range_search(unicode_hex_digit,
     187             :                         lengthof(unicode_hex_digit),
     188             :                         code);
     189             : }
     190             : /*
                     :  * Test whether the codepoint is listed as Join_Control.
                     :  *
                     :  * Codepoints below 0x80 are answered from the PG_U_PROP_JOIN_CONTROL bit
                     :  * of unicode_opt_ascii[]; all others by range_search() over
                     :  * unicode_join_control[].
                     :  */
     191             : bool
     192           0 : pg_u_prop_join_control(char32_t code)
     193             : {
     194           0 :     if (code < 0x80)
     195           0 :         return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
     196             : 
     197           0 :     return range_search(unicode_join_control,
     198             :                         lengthof(unicode_join_control),
     199             :                         code);
     200             : }
     201             : 
     202             : /*
     203             :  * The following functions implement the Compatibility Properties described
     204             :  * at: https://www.unicode.org/reports/tr18/#Compatibility_Properties
     205             :  *
     206             :  * For the functions that take a 'posix' argument: if 'posix' is true, they
     207             :  * implement the "POSIX Compatible" variant, otherwise the "Standard" variant.
     208             :  */
     209             : /*
                     :  * Digit test. With 'posix' true only the ASCII characters '0' through '9'
                     :  * qualify; otherwise any codepoint whose general category is
                     :  * PG_U_DECIMAL_NUMBER qualifies.
                     :  */
     210             : bool
     211       88880 : pg_u_isdigit(char32_t code, bool posix)
     212             : {
     213       88880 :     if (posix)
     214       73370 :         return ('0' <= code && code <= '9');
     215             :     else
     216       15510 :         return unicode_category(code) == PG_U_DECIMAL_NUMBER;
     217             : }
     218             : /*
                     :  * Alphabetic test; returns exactly what pg_u_prop_alphabetic() returns.
                     :  */
     219             : bool
     220       94666 : pg_u_isalpha(char32_t code)
     221             : {
     222       94666 :     return pg_u_prop_alphabetic(code);
     223             : }
     224             : /*
                     :  * Alphanumeric test: true when pg_u_isalpha() accepts the codepoint or
                     :  * pg_u_isdigit() accepts it under the given 'posix' setting.
                     :  */
     225             : bool
     226       42724 : pg_u_isalnum(char32_t code, bool posix)
     227             : {
     228       42724 :     return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
     229             : }
     230             : /*
                     :  * Word-character test: true when the general category is a mark (Mn, Mc,
                     :  * Me), a decimal number (Nd) or connector punctuation (Pc), or when the
                     :  * codepoint passes pg_u_isalpha() or pg_u_prop_join_control().
                     :  */
     231             : bool
     232           0 : pg_u_isword(char32_t code)
     233             : {
     234           0 :     uint32      category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     235             : 
     236             :     return
     237           0 :         category_mask & (PG_U_M_MASK | PG_U_ND_MASK | PG_U_PC_MASK) ||
     238           0 :         pg_u_isalpha(code) ||
     239           0 :         pg_u_prop_join_control(code);
     240             : }
     241             : /*
                     :  * Uppercase test; returns exactly what pg_u_prop_uppercase() returns.
                     :  */
     242             : bool
     243       24576 : pg_u_isupper(char32_t code)
     244             : {
     245       24576 :     return pg_u_prop_uppercase(code);
     246             : }
     247             : /*
                     :  * Lowercase test; returns exactly what pg_u_prop_lowercase() returns.
                     :  */
     248             : bool
     249           0 : pg_u_islower(char32_t code)
     250             : {
     251           0 :     return pg_u_prop_lowercase(code);
     252             : }
     253             : /*
                     :  * Blank test: true for PG_U_CHARACTER_TAB (0x09) and for any codepoint
                     :  * whose general category is PG_U_SPACE_SEPARATOR.
                     :  */
     254             : bool
     255           0 : pg_u_isblank(char32_t code)
     256             : {
     257           0 :     return code == PG_U_CHARACTER_TAB ||
     258           0 :         unicode_category(code) == PG_U_SPACE_SEPARATOR;
     259             : }
     260             : /*
                     :  * Control-character test: true when the general category is
                     :  * PG_U_CONTROL.
                     :  */
     261             : bool
     262           0 : pg_u_iscntrl(char32_t code)
     263             : {
     264           0 :     return unicode_category(code) == PG_U_CONTROL;
     265             : }
     266             : /*
                     :  * Graphic-character test: false when the general category is Control,
                     :  * Surrogate or Unassigned, or when pg_u_isspace() accepts the codepoint;
                     :  * true for everything else.
                     :  */
     267             : bool
     268           0 : pg_u_isgraph(char32_t code)
     269             : {
     270           0 :     uint32      category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     271             : 
     272           0 :     if (category_mask & (PG_U_CC_MASK | PG_U_CS_MASK | PG_U_CN_MASK) ||
     273           0 :         pg_u_isspace(code))
     274           0 :         return false;
     275           0 :     return true;
     276             : }
     277             : /*
                     :  * Printable-character test: false for any codepoint whose general
                     :  * category is PG_U_CONTROL; otherwise true when either pg_u_isgraph() or
                     :  * pg_u_isblank() accepts it.
                     :  */
     278             : bool
     279           0 : pg_u_isprint(char32_t code)
     280             : {
     281           0 :     pg_unicode_category category = unicode_category(code);
     282             : 
     283           0 :     if (category == PG_U_CONTROL)	/* excludes tab, which pg_u_isblank() would accept */
     284           0 :         return false;
     285             : 
     286           0 :     return pg_u_isgraph(code) || pg_u_isblank(code);
     287             : }
     288             : /*
                     :  * Punctuation test.
                     :  *
                     :  * With 'posix' true, a codepoint qualifies when pg_u_isalpha() rejects it
                     :  * and its general category is any punctuation (P*) or symbol (S*)
                     :  * category. With 'posix' false, only the punctuation categories qualify
                     :  * and no alphabetic check is made.
                     :  */
     289             : bool
     290       24576 : pg_u_ispunct(char32_t code, bool posix)
     291             : {
     292             :     uint32      category_mask;
     293             : 
     294       24576 :     if (posix)
     295             :     {
     296       12288 :         if (pg_u_isalpha(code))
     297        9240 :             return false;
     298             : 
     299        3048 :         category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     300        3048 :         return category_mask & (PG_U_P_MASK | PG_U_S_MASK);
     301             :     }
     302             :     else
     303             :     {
     304       12288 :         category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     305             : 
     306       12288 :         return category_mask & PG_U_P_MASK;
     307             :     }
     308             : }
     309             : /*
                     :  * Whitespace test; returns exactly what pg_u_prop_white_space() returns.
                     :  */
     310             : bool
     311       16624 : pg_u_isspace(char32_t code)
     312             : {
     313       16624 :     return pg_u_prop_white_space(code);
     314             : }
     315             : /*
                     :  * Hexadecimal-digit test. With 'posix' true only the ASCII characters
                     :  * '0'-'9', 'A'-'F' and 'a'-'f' qualify; otherwise a codepoint qualifies
                     :  * when its general category is PG_U_DECIMAL_NUMBER or when
                     :  * pg_u_prop_hex_digit() accepts it.
                     :  */
     316             : bool
     317           6 : pg_u_isxdigit(char32_t code, bool posix)
     318             : {
     319           6 :     if (posix)
     320           6 :         return (('0' <= code && code <= '9') ||
     321          14 :                 ('A' <= code && code <= 'F') ||
     322           2 :                 ('a' <= code && code <= 'f'));
     323             :     else
     324           0 :         return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
     325           0 :             pg_u_prop_hex_digit(code);
     326             : }
     327             : 
     328             : /*
     329             :  * Description of Unicode general category.
                     :  *
                     :  * Returns a pointer to a string literal holding the long name of the
                     :  * category, with words joined by underscores (for example
                     :  * "Uppercase_Letter"). The switch has no default case: a value that
                     :  * matches none of the listed categories falls out of the switch, reaches
                     :  * Assert(false), and otherwise yields "Unrecognized".
     330             :  */
     331             : const char *
     332           0 : unicode_category_string(pg_unicode_category category)
     333             : {
     334           0 :     switch (category)
     335             :     {
     336           0 :         case PG_U_UNASSIGNED:
     337           0 :             return "Unassigned";
     338           0 :         case PG_U_UPPERCASE_LETTER:
     339           0 :             return "Uppercase_Letter";
     340           0 :         case PG_U_LOWERCASE_LETTER:
     341           0 :             return "Lowercase_Letter";
     342           0 :         case PG_U_TITLECASE_LETTER:
     343           0 :             return "Titlecase_Letter";
     344           0 :         case PG_U_MODIFIER_LETTER:
     345           0 :             return "Modifier_Letter";
     346           0 :         case PG_U_OTHER_LETTER:
     347           0 :             return "Other_Letter";
     348           0 :         case PG_U_NONSPACING_MARK:
     349           0 :             return "Nonspacing_Mark";
     350           0 :         case PG_U_ENCLOSING_MARK:
     351           0 :             return "Enclosing_Mark";
     352           0 :         case PG_U_SPACING_MARK:
     353           0 :             return "Spacing_Mark";
     354           0 :         case PG_U_DECIMAL_NUMBER:
     355           0 :             return "Decimal_Number";
     356           0 :         case PG_U_LETTER_NUMBER:
     357           0 :             return "Letter_Number";
     358           0 :         case PG_U_OTHER_NUMBER:
     359           0 :             return "Other_Number";
     360           0 :         case PG_U_SPACE_SEPARATOR:
     361           0 :             return "Space_Separator";
     362           0 :         case PG_U_LINE_SEPARATOR:
     363           0 :             return "Line_Separator";
     364           0 :         case PG_U_PARAGRAPH_SEPARATOR:
     365           0 :             return "Paragraph_Separator";
     366           0 :         case PG_U_CONTROL:
     367           0 :             return "Control";
     368           0 :         case PG_U_FORMAT:
     369           0 :             return "Format";
     370           0 :         case PG_U_PRIVATE_USE:
     371           0 :             return "Private_Use";
     372           0 :         case PG_U_SURROGATE:
     373           0 :             return "Surrogate";
     374           0 :         case PG_U_DASH_PUNCTUATION:
     375           0 :             return "Dash_Punctuation";
     376           0 :         case PG_U_OPEN_PUNCTUATION:
     377           0 :             return "Open_Punctuation";
     378           0 :         case PG_U_CLOSE_PUNCTUATION:
     379           0 :             return "Close_Punctuation";
     380           0 :         case PG_U_CONNECTOR_PUNCTUATION:
     381           0 :             return "Connector_Punctuation";
     382           0 :         case PG_U_OTHER_PUNCTUATION:
     383           0 :             return "Other_Punctuation";
     384           0 :         case PG_U_MATH_SYMBOL:
     385           0 :             return "Math_Symbol";
     386           0 :         case PG_U_CURRENCY_SYMBOL:
     387           0 :             return "Currency_Symbol";
     388           0 :         case PG_U_MODIFIER_SYMBOL:
     389           0 :             return "Modifier_Symbol";
     390           0 :         case PG_U_OTHER_SYMBOL:
     391           0 :             return "Other_Symbol";
     392           0 :         case PG_U_INITIAL_PUNCTUATION:
     393           0 :             return "Initial_Punctuation";
     394           0 :         case PG_U_FINAL_PUNCTUATION:
     395           0 :             return "Final_Punctuation";
     396             :     }
     397             : 
     398             :     Assert(false);
     399           0 :     return "Unrecognized";        /* keep compiler quiet */
     400             : }
     401             : 
     402             : /*
     403             :  * Short code for Unicode general category.
                     :  *
                     :  * Returns a pointer to a string literal holding the two-letter
                     :  * abbreviation of the category (for example "Lu" for
                     :  * PG_U_UPPERCASE_LETTER). The switch has no default case: a value that
                     :  * matches none of the listed categories falls out of the switch, reaches
                     :  * Assert(false), and otherwise yields "??".
     404             :  */
     405             : const char *
     406           0 : unicode_category_abbrev(pg_unicode_category category)
     407             : {
     408           0 :     switch (category)
     409             :     {
     410           0 :         case PG_U_UNASSIGNED:
     411           0 :             return "Cn";
     412           0 :         case PG_U_UPPERCASE_LETTER:
     413           0 :             return "Lu";
     414           0 :         case PG_U_LOWERCASE_LETTER:
     415           0 :             return "Ll";
     416           0 :         case PG_U_TITLECASE_LETTER:
     417           0 :             return "Lt";
     418           0 :         case PG_U_MODIFIER_LETTER:
     419           0 :             return "Lm";
     420           0 :         case PG_U_OTHER_LETTER:
     421           0 :             return "Lo";
     422           0 :         case PG_U_NONSPACING_MARK:
     423           0 :             return "Mn";
     424           0 :         case PG_U_ENCLOSING_MARK:
     425           0 :             return "Me";
     426           0 :         case PG_U_SPACING_MARK:
     427           0 :             return "Mc";
     428           0 :         case PG_U_DECIMAL_NUMBER:
     429           0 :             return "Nd";
     430           0 :         case PG_U_LETTER_NUMBER:
     431           0 :             return "Nl";
     432           0 :         case PG_U_OTHER_NUMBER:
     433           0 :             return "No";
     434           0 :         case PG_U_SPACE_SEPARATOR:
     435           0 :             return "Zs";
     436           0 :         case PG_U_LINE_SEPARATOR:
     437           0 :             return "Zl";
     438           0 :         case PG_U_PARAGRAPH_SEPARATOR:
     439           0 :             return "Zp";
     440           0 :         case PG_U_CONTROL:
     441           0 :             return "Cc";
     442           0 :         case PG_U_FORMAT:
     443           0 :             return "Cf";
     444           0 :         case PG_U_PRIVATE_USE:
     445           0 :             return "Co";
     446           0 :         case PG_U_SURROGATE:
     447           0 :             return "Cs";
     448           0 :         case PG_U_DASH_PUNCTUATION:
     449           0 :             return "Pd";
     450           0 :         case PG_U_OPEN_PUNCTUATION:
     451           0 :             return "Ps";
     452           0 :         case PG_U_CLOSE_PUNCTUATION:
     453           0 :             return "Pe";
     454           0 :         case PG_U_CONNECTOR_PUNCTUATION:
     455           0 :             return "Pc";
     456           0 :         case PG_U_OTHER_PUNCTUATION:
     457           0 :             return "Po";
     458           0 :         case PG_U_MATH_SYMBOL:
     459           0 :             return "Sm";
     460           0 :         case PG_U_CURRENCY_SYMBOL:
     461           0 :             return "Sc";
     462           0 :         case PG_U_MODIFIER_SYMBOL:
     463           0 :             return "Sk";
     464           0 :         case PG_U_OTHER_SYMBOL:
     465           0 :             return "So";
     466           0 :         case PG_U_INITIAL_PUNCTUATION:
     467           0 :             return "Pi";
     468           0 :         case PG_U_FINAL_PUNCTUATION:
     469           0 :             return "Pf";
     470             :     }
     471             : 
     472             :     Assert(false);
     473           0 :     return "??";              /* keep compiler quiet */
     474             : }
     475             : 
     476             : /*
     477             :  * Binary search to test if given codepoint exists in one of the ranges in the
     478             :  * given table.
                     :  *
                     :  * 'tbl' points to 'size' entries, each describing an inclusive
                     :  * [first, last] range; the search relies on the entries being sorted and
                     :  * non-overlapping. Returns true when some entry contains 'code', false
                     :  * otherwise. 'code' must not exceed 0x10ffff (checked by Assert only).
                     :  *
                     :  * NOTE(review): 'size' is narrowed from size_t to int when computing
                     :  * 'max'; this assumes every table passed in has far fewer than INT_MAX
                     :  * entries -- confirm against the generated tables.
     479             :  */
     480             : static bool
     481       89124 : range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
     482             : {
     483       89124 :     int         min = 0;
     484             :     int         mid;
     485       89124 :     int         max = size - 1;
     486             : 
     487             :     Assert(code <= 0x10ffff);
     488             : 
     489      822122 :     while (max >= min)	/* [min, max] is the inclusive search window */
     490             :     {
     491      777666 :         mid = (min + max) / 2;
     492      777666 :         if (code > tbl[mid].last)
     493      278542 :             min = mid + 1;
     494      499124 :         else if (code < tbl[mid].first)
     495      454456 :             max = mid - 1;
     496             :         else
     497       44668 :             return true;
     498             :     }
     499             : 
     500       44456 :     return false;	/* window emptied without a containing range */
     501             : }

Generated by: LCOV version 1.16