LCOV - code coverage report
Current view: top level - src/common - unicode_category.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 31.9 % 235 75
Test Date: 2026-03-12 06:14:44 Functions: 60.0 % 25 15
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  * unicode_category.c
       3              :  *      Determine general category and character properties of Unicode
       4              :  *      characters. Encoding must be UTF8, where we assume that the char32_t
       5              :  *      representation is a code point.
       6              :  *
       7              :  * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
       8              :  *
       9              :  * IDENTIFICATION
      10              :  *    src/common/unicode_category.c
      11              :  *
      12              :  *-------------------------------------------------------------------------
      13              :  */
      14              : #ifndef FRONTEND
      15              : #include "postgres.h"
      16              : #else
      17              : #include "postgres_fe.h"
      18              : #endif
      19              : 
      20              : #include "common/unicode_category.h"
      21              : #include "common/unicode_category_table.h"
      22              : 
      23              : /*
      24              :  * Create bitmasks from pg_unicode_category values for efficient comparison of
      25              :  * multiple categories. For instance, PG_U_MN_MASK is a bitmask representing
      26              :  * the general category Mn; and PG_U_M_MASK represents general categories Mn,
      27              :  * Me, and Mc.
      28              :  *
      29              :  * The number of Unicode General Categories should never grow, so a 32-bit
      30              :  * mask is fine.
      31              :  */
      32              : #define PG_U_CATEGORY_MASK(X) ((uint32)(1 << (X)))
      33              : 
      34              : #define PG_U_LU_MASK PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
      35              : #define PG_U_LL_MASK PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
      36              : #define PG_U_LT_MASK PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
      37              : #define PG_U_LC_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK)
      38              : #define PG_U_LM_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
      39              : #define PG_U_LO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
      40              : #define PG_U_L_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK|PG_U_LM_MASK|\
      41              :                      PG_U_LO_MASK)
      42              : #define PG_U_MN_MASK PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
      43              : #define PG_U_ME_MASK PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
      44              : #define PG_U_MC_MASK PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
      45              : #define PG_U_M_MASK (PG_U_MN_MASK|PG_U_MC_MASK|PG_U_ME_MASK)
      46              : #define PG_U_ND_MASK PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
      47              : #define PG_U_NL_MASK PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
      48              : #define PG_U_NO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
      49              : #define PG_U_N_MASK (PG_U_ND_MASK|PG_U_NL_MASK|PG_U_NO_MASK)
      50              : #define PG_U_PC_MASK PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
      51              : #define PG_U_PD_MASK PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
      52              : #define PG_U_PS_MASK PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
      53              : #define PG_U_PE_MASK PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
      54              : #define PG_U_PI_MASK PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
      55              : #define PG_U_PF_MASK PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
      56              : #define PG_U_PO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
      57              : #define PG_U_P_MASK (PG_U_PC_MASK|PG_U_PD_MASK|PG_U_PS_MASK|PG_U_PE_MASK|\
      58              :                      PG_U_PI_MASK|PG_U_PF_MASK|PG_U_PO_MASK)
      59              : #define PG_U_SM_MASK PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
      60              : #define PG_U_SC_MASK PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
      61              : #define PG_U_SK_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
      62              : #define PG_U_SO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
      63              : #define PG_U_S_MASK (PG_U_SM_MASK|PG_U_SC_MASK|PG_U_SK_MASK|PG_U_SO_MASK)
      64              : #define PG_U_ZS_MASK PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
      65              : #define PG_U_ZL_MASK PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
      66              : #define PG_U_ZP_MASK PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
      67              : #define PG_U_Z_MASK (PG_U_ZS_MASK|PG_U_ZL_MASK|PG_U_ZP_MASK)
      68              : #define PG_U_CC_MASK PG_U_CATEGORY_MASK(PG_U_CONTROL)
      69              : #define PG_U_CF_MASK PG_U_CATEGORY_MASK(PG_U_FORMAT)
      70              : #define PG_U_CS_MASK PG_U_CATEGORY_MASK(PG_U_SURROGATE)
      71              : #define PG_U_CO_MASK PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
      72              : #define PG_U_CN_MASK PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
      73              : #define PG_U_C_MASK (PG_U_CC_MASK|PG_U_CF_MASK|PG_U_CS_MASK|PG_U_CO_MASK|\
      74              :                      PG_U_CN_MASK)
      75              : 
      76              : #define PG_U_CHARACTER_TAB  0x09
      77              : 
      78              : static bool range_search(const pg_unicode_range *tbl, size_t size,
      79              :                          char32_t code);
      80              : 
      81              : /*
      82              :  * Unicode general category for the given codepoint.
      83              :  */
      84              : pg_unicode_category
      85        15474 : unicode_category(char32_t code)
      86              : {
      87        15474 :     int         min = 0;
      88              :     int         mid;
      89        15474 :     int         max = lengthof(unicode_categories) - 1;
      90              : 
      91              :     Assert(code <= 0x10ffff);
      92              : 
      93        15474 :     if (code < 0x80)
      94         1320 :         return unicode_opt_ascii[code].category;
      95              : 
      96       152523 :     while (max >= min)
      97              :     {
      98       151836 :         mid = (min + max) / 2;
      99       151836 :         if (code > unicode_categories[mid].last)
     100        61971 :             min = mid + 1;
     101        89865 :         else if (code < unicode_categories[mid].first)
     102        76398 :             max = mid - 1;
     103              :         else
     104        13467 :             return unicode_categories[mid].category;
     105              :     }
     106              : 
     107          687 :     return PG_U_UNASSIGNED;
     108              : }
     109              : 
     110              : bool
     111        47359 : pg_u_prop_alphabetic(char32_t code)
     112              : {
     113        47359 :     if (code < 0x80)
     114        22111 :         return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
     115              : 
     116        25248 :     return range_search(unicode_alphabetic,
     117              :                         lengthof(unicode_alphabetic),
     118              :                         code);
     119              : }
     120              : 
     121              : bool
     122           30 : pg_u_prop_lowercase(char32_t code)
     123              : {
     124           30 :     if (code < 0x80)
     125            0 :         return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
     126              : 
     127           30 :     return range_search(unicode_lowercase,
     128              :                         lengthof(unicode_lowercase),
     129              :                         code);
     130              : }
     131              : 
     132              : bool
     133        12318 : pg_u_prop_uppercase(char32_t code)
     134              : {
     135        12318 :     if (code < 0x80)
     136          768 :         return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
     137              : 
     138        11550 :     return range_search(unicode_uppercase,
     139              :                         lengthof(unicode_uppercase),
     140              :                         code);
     141              : }
     142              : 
     143              : bool
     144           39 : pg_u_prop_cased(char32_t code)
     145              : {
     146              :     uint32      category_mask;
     147              : 
     148           39 :     if (code < 0x80)
     149            9 :         return unicode_opt_ascii[code].properties & PG_U_PROP_CASED;
     150              : 
     151           30 :     category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     152              : 
     153           30 :     return category_mask & PG_U_LT_MASK ||
     154           60 :         pg_u_prop_lowercase(code) ||
     155           30 :         pg_u_prop_uppercase(code);
     156              : }
     157              : 
     158              : bool
     159           63 : pg_u_prop_case_ignorable(char32_t code)
     160              : {
     161           63 :     if (code < 0x80)
     162            9 :         return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
     163              : 
     164           54 :     return range_search(unicode_case_ignorable,
     165              :                         lengthof(unicode_case_ignorable),
     166              :                         code);
     167              : }
     168              : 
     169              : bool
     170         8312 : pg_u_prop_white_space(char32_t code)
     171              : {
     172         8312 :     if (code < 0x80)
     173          632 :         return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
     174              : 
     175         7680 :     return range_search(unicode_white_space,
     176              :                         lengthof(unicode_white_space),
     177              :                         code);
     178              : }
     179              : 
     180              : bool
     181            0 : pg_u_prop_hex_digit(char32_t code)
     182              : {
     183            0 :     if (code < 0x80)
     184            0 :         return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
     185              : 
     186            0 :     return range_search(unicode_hex_digit,
     187              :                         lengthof(unicode_hex_digit),
     188              :                         code);
     189              : }
     190              : 
     191              : bool
     192            0 : pg_u_prop_join_control(char32_t code)
     193              : {
     194            0 :     if (code < 0x80)
     195            0 :         return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
     196              : 
     197            0 :     return range_search(unicode_join_control,
     198              :                         lengthof(unicode_join_control),
     199              :                         code);
     200              : }
     201              : 
     202              : /*
     203              :  * The following functions implement the Compatibility Properties described
     204              :  * at: http://www.unicode.org/reports/tr18/#Compatibility_Properties
     205              :  *
     206              :  * If 'posix' is true, implements the "POSIX Compatible" variant, otherwise
     207              :  * the "Standard" variant.
     208              :  */
     209              : 
     210              : bool
     211        44440 : pg_u_isdigit(char32_t code, bool posix)
     212              : {
     213        44440 :     if (posix)
     214        36685 :         return ('0' <= code && code <= '9');
     215              :     else
     216         7755 :         return unicode_category(code) == PG_U_DECIMAL_NUMBER;
     217              : }
     218              : 
     219              : bool
     220        47359 : pg_u_isalpha(char32_t code)
     221              : {
     222        47359 :     return pg_u_prop_alphabetic(code);
     223              : }
     224              : 
     225              : bool
     226        21362 : pg_u_isalnum(char32_t code, bool posix)
     227              : {
     228        21362 :     return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
     229              : }
     230              : 
     231              : bool
     232            0 : pg_u_isword(char32_t code)
     233              : {
     234            0 :     uint32      category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     235              : 
     236              :     return
     237            0 :         category_mask & (PG_U_M_MASK | PG_U_ND_MASK | PG_U_PC_MASK) ||
     238            0 :         pg_u_isalpha(code) ||
     239            0 :         pg_u_prop_join_control(code);
     240              : }
     241              : 
     242              : bool
     243        12288 : pg_u_isupper(char32_t code)
     244              : {
     245        12288 :     return pg_u_prop_uppercase(code);
     246              : }
     247              : 
     248              : bool
     249            0 : pg_u_islower(char32_t code)
     250              : {
     251            0 :     return pg_u_prop_lowercase(code);
     252              : }
     253              : 
     254              : bool
     255            0 : pg_u_isblank(char32_t code)
     256              : {
     257            0 :     return code == PG_U_CHARACTER_TAB ||
     258            0 :         unicode_category(code) == PG_U_SPACE_SEPARATOR;
     259              : }
     260              : 
     261              : bool
     262            0 : pg_u_iscntrl(char32_t code)
     263              : {
     264            0 :     return unicode_category(code) == PG_U_CONTROL;
     265              : }
     266              : 
     267              : bool
     268            0 : pg_u_isgraph(char32_t code)
     269              : {
     270            0 :     uint32      category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     271              : 
     272            0 :     if (category_mask & (PG_U_CC_MASK | PG_U_CS_MASK | PG_U_CN_MASK) ||
     273            0 :         pg_u_isspace(code))
     274            0 :         return false;
     275            0 :     return true;
     276              : }
     277              : 
     278              : bool
     279            0 : pg_u_isprint(char32_t code)
     280              : {
     281            0 :     pg_unicode_category category = unicode_category(code);
     282              : 
     283            0 :     if (category == PG_U_CONTROL)
     284            0 :         return false;
     285              : 
     286            0 :     return pg_u_isgraph(code) || pg_u_isblank(code);
     287              : }
     288              : 
     289              : bool
     290        12288 : pg_u_ispunct(char32_t code, bool posix)
     291              : {
     292              :     uint32      category_mask;
     293              : 
     294        12288 :     if (posix)
     295              :     {
     296         6144 :         if (pg_u_isalpha(code))
     297         4620 :             return false;
     298              : 
     299         1524 :         category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     300         1524 :         return category_mask & (PG_U_P_MASK | PG_U_S_MASK);
     301              :     }
     302              :     else
     303              :     {
     304         6144 :         category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
     305              : 
     306         6144 :         return category_mask & PG_U_P_MASK;
     307              :     }
     308              : }
     309              : 
     310              : bool
     311         8312 : pg_u_isspace(char32_t code)
     312              : {
     313         8312 :     return pg_u_prop_white_space(code);
     314              : }
     315              : 
     316              : bool
     317            3 : pg_u_isxdigit(char32_t code, bool posix)
     318              : {
     319            3 :     if (posix)
     320            3 :         return (('0' <= code && code <= '9') ||
     321            7 :                 ('A' <= code && code <= 'F') ||
     322            1 :                 ('a' <= code && code <= 'f'));
     323              :     else
     324            0 :         return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
     325            0 :             pg_u_prop_hex_digit(code);
     326              : }
     327              : 
     328              : /*
     329              :  * Description of Unicode general category.
     330              :  */
     331              : const char *
     332            0 : unicode_category_string(pg_unicode_category category)
     333              : {
     334            0 :     switch (category)
     335              :     {
     336            0 :         case PG_U_UNASSIGNED:
     337            0 :             return "Unassigned";
     338            0 :         case PG_U_UPPERCASE_LETTER:
     339            0 :             return "Uppercase_Letter";
     340            0 :         case PG_U_LOWERCASE_LETTER:
     341            0 :             return "Lowercase_Letter";
     342            0 :         case PG_U_TITLECASE_LETTER:
     343            0 :             return "Titlecase_Letter";
     344            0 :         case PG_U_MODIFIER_LETTER:
     345            0 :             return "Modifier_Letter";
     346            0 :         case PG_U_OTHER_LETTER:
     347            0 :             return "Other_Letter";
     348            0 :         case PG_U_NONSPACING_MARK:
     349            0 :             return "Nonspacing_Mark";
     350            0 :         case PG_U_ENCLOSING_MARK:
     351            0 :             return "Enclosing_Mark";
     352            0 :         case PG_U_SPACING_MARK:
     353            0 :             return "Spacing_Mark";
     354            0 :         case PG_U_DECIMAL_NUMBER:
     355            0 :             return "Decimal_Number";
     356            0 :         case PG_U_LETTER_NUMBER:
     357            0 :             return "Letter_Number";
     358            0 :         case PG_U_OTHER_NUMBER:
     359            0 :             return "Other_Number";
     360            0 :         case PG_U_SPACE_SEPARATOR:
     361            0 :             return "Space_Separator";
     362            0 :         case PG_U_LINE_SEPARATOR:
     363            0 :             return "Line_Separator";
     364            0 :         case PG_U_PARAGRAPH_SEPARATOR:
     365            0 :             return "Paragraph_Separator";
     366            0 :         case PG_U_CONTROL:
     367            0 :             return "Control";
     368            0 :         case PG_U_FORMAT:
     369            0 :             return "Format";
     370            0 :         case PG_U_PRIVATE_USE:
     371            0 :             return "Private_Use";
     372            0 :         case PG_U_SURROGATE:
     373            0 :             return "Surrogate";
     374            0 :         case PG_U_DASH_PUNCTUATION:
     375            0 :             return "Dash_Punctuation";
     376            0 :         case PG_U_OPEN_PUNCTUATION:
     377            0 :             return "Open_Punctuation";
     378            0 :         case PG_U_CLOSE_PUNCTUATION:
     379            0 :             return "Close_Punctuation";
     380            0 :         case PG_U_CONNECTOR_PUNCTUATION:
     381            0 :             return "Connector_Punctuation";
     382            0 :         case PG_U_OTHER_PUNCTUATION:
     383            0 :             return "Other_Punctuation";
     384            0 :         case PG_U_MATH_SYMBOL:
     385            0 :             return "Math_Symbol";
     386            0 :         case PG_U_CURRENCY_SYMBOL:
     387            0 :             return "Currency_Symbol";
     388            0 :         case PG_U_MODIFIER_SYMBOL:
     389            0 :             return "Modifier_Symbol";
     390            0 :         case PG_U_OTHER_SYMBOL:
     391            0 :             return "Other_Symbol";
     392            0 :         case PG_U_INITIAL_PUNCTUATION:
     393            0 :             return "Initial_Punctuation";
     394            0 :         case PG_U_FINAL_PUNCTUATION:
     395            0 :             return "Final_Punctuation";
     396              :     }
     397              : 
     398              :     Assert(false);
     399            0 :     return "Unrecognized";        /* keep compiler quiet */
     400              : }
     401              : 
     402              : /*
     403              :  * Short code for Unicode general category.
     404              :  */
     405              : const char *
     406            0 : unicode_category_abbrev(pg_unicode_category category)
     407              : {
     408            0 :     switch (category)
     409              :     {
     410            0 :         case PG_U_UNASSIGNED:
     411            0 :             return "Cn";
     412            0 :         case PG_U_UPPERCASE_LETTER:
     413            0 :             return "Lu";
     414            0 :         case PG_U_LOWERCASE_LETTER:
     415            0 :             return "Ll";
     416            0 :         case PG_U_TITLECASE_LETTER:
     417            0 :             return "Lt";
     418            0 :         case PG_U_MODIFIER_LETTER:
     419            0 :             return "Lm";
     420            0 :         case PG_U_OTHER_LETTER:
     421            0 :             return "Lo";
     422            0 :         case PG_U_NONSPACING_MARK:
     423            0 :             return "Mn";
     424            0 :         case PG_U_ENCLOSING_MARK:
     425            0 :             return "Me";
     426            0 :         case PG_U_SPACING_MARK:
     427            0 :             return "Mc";
     428            0 :         case PG_U_DECIMAL_NUMBER:
     429            0 :             return "Nd";
     430            0 :         case PG_U_LETTER_NUMBER:
     431            0 :             return "Nl";
     432            0 :         case PG_U_OTHER_NUMBER:
     433            0 :             return "No";
     434            0 :         case PG_U_SPACE_SEPARATOR:
     435            0 :             return "Zs";
     436            0 :         case PG_U_LINE_SEPARATOR:
     437            0 :             return "Zl";
     438            0 :         case PG_U_PARAGRAPH_SEPARATOR:
     439            0 :             return "Zp";
     440            0 :         case PG_U_CONTROL:
     441            0 :             return "Cc";
     442            0 :         case PG_U_FORMAT:
     443            0 :             return "Cf";
     444            0 :         case PG_U_PRIVATE_USE:
     445            0 :             return "Co";
     446            0 :         case PG_U_SURROGATE:
     447            0 :             return "Cs";
     448            0 :         case PG_U_DASH_PUNCTUATION:
     449            0 :             return "Pd";
     450            0 :         case PG_U_OPEN_PUNCTUATION:
     451            0 :             return "Ps";
     452            0 :         case PG_U_CLOSE_PUNCTUATION:
     453            0 :             return "Pe";
     454            0 :         case PG_U_CONNECTOR_PUNCTUATION:
     455            0 :             return "Pc";
     456            0 :         case PG_U_OTHER_PUNCTUATION:
     457            0 :             return "Po";
     458            0 :         case PG_U_MATH_SYMBOL:
     459            0 :             return "Sm";
     460            0 :         case PG_U_CURRENCY_SYMBOL:
     461            0 :             return "Sc";
     462            0 :         case PG_U_MODIFIER_SYMBOL:
     463            0 :             return "Sk";
     464            0 :         case PG_U_OTHER_SYMBOL:
     465            0 :             return "So";
     466            0 :         case PG_U_INITIAL_PUNCTUATION:
     467            0 :             return "Pi";
     468            0 :         case PG_U_FINAL_PUNCTUATION:
     469            0 :             return "Pf";
     470              :     }
     471              : 
     472              :     Assert(false);
     473            0 :     return "??";              /* keep compiler quiet */
     474              : }
     475              : 
     476              : /*
     477              :  * Binary search to test if given codepoint exists in one of the ranges in the
     478              :  * given table.
     479              :  */
     480              : static bool
     481        44562 : range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
     482              : {
     483        44562 :     int         min = 0;
     484              :     int         mid;
     485        44562 :     int         max = size - 1;
     486              : 
     487              :     Assert(code <= 0x10ffff);
     488              : 
     489       411061 :     while (max >= min)
     490              :     {
     491       388833 :         mid = (min + max) / 2;
     492       388833 :         if (code > tbl[mid].last)
     493       139271 :             min = mid + 1;
     494       249562 :         else if (code < tbl[mid].first)
     495       227228 :             max = mid - 1;
     496              :         else
     497        22334 :             return true;
     498              :     }
     499              : 
     500        22228 :     return false;
     501              : }
        

Generated by: LCOV version 2.0-1