Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * unicode_category.c 3 : * Determine general category and character properties of Unicode 4 : * characters. Encoding must be UTF8, where we assume that the pg_wchar 5 : * representation is a code point. 6 : * 7 : * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group 8 : * 9 : * IDENTIFICATION 10 : * src/common/unicode_category.c 11 : * 12 : *------------------------------------------------------------------------- 13 : */ 14 : #ifndef FRONTEND 15 : #include "postgres.h" 16 : #else 17 : #include "postgres_fe.h" 18 : #endif 19 : 20 : #include "common/unicode_category.h" 21 : #include "common/unicode_category_table.h" 22 : 23 : /* 24 : * Create bitmasks from pg_unicode_category values for efficient comparison of 25 : * multiple categories. For instance, PG_U_MN_MASK is a bitmask representing 26 : * the general category Mn; and PG_U_M_MASK represents general categories Mn, 27 : * Me, and Mc. 28 : * 29 : * The number of Unicode General Categories should never grow, so a 32-bit 30 : * mask is fine. 31 : */ 32 : #define PG_U_CATEGORY_MASK(X) ((uint32)(1 << (X))) 33 : 34 : #define PG_U_LU_MASK PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER) 35 : #define PG_U_LL_MASK PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER) 36 : #define PG_U_LT_MASK PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER) 37 : #define PG_U_LC_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK) 38 : #define PG_U_LM_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER) 39 : #define PG_U_LO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER) 40 : #define PG_U_L_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK|PG_U_LM_MASK|\ 41 : PG_U_LO_MASK) 42 : #define PG_U_MN_MASK PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK) 43 : #define PG_U_ME_MASK PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK) 44 : #define PG_U_MC_MASK PG_U_CATEGORY_MASK(PG_U_SPACING_MARK) 45 : #define PG_U_M_MASK (PG_U_MN_MASK|PG_U_MC_MASK|PG_U_ME_MASK) 46 : #define PG_U_ND_MASK PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER) 47 : #define PG_U_NL_MASK PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER) 48 : #define PG_U_NO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER) 49 : #define PG_U_N_MASK (PG_U_ND_MASK|PG_U_NL_MASK|PG_U_NO_MASK) 50 : #define PG_U_PC_MASK PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION) 51 : #define PG_U_PD_MASK PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION) 52 : #define PG_U_PS_MASK PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION) 53 : #define PG_U_PE_MASK PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION) 54 : #define PG_U_PI_MASK PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION) 55 : #define PG_U_PF_MASK PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION) 56 : #define PG_U_PO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION) 57 : #define PG_U_P_MASK (PG_U_PC_MASK|PG_U_PD_MASK|PG_U_PS_MASK|PG_U_PE_MASK|\ 58 : PG_U_PI_MASK|PG_U_PF_MASK|PG_U_PO_MASK) 59 : #define PG_U_SM_MASK PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL) 60 : #define PG_U_SC_MASK PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL) 61 : #define PG_U_SK_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL) 62 : #define PG_U_SO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL) 63 : #define PG_U_S_MASK (PG_U_SM_MASK|PG_U_SC_MASK|PG_U_SK_MASK|PG_U_SO_MASK) 64 : #define PG_U_ZS_MASK PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR) 65 : #define PG_U_ZL_MASK PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR) 66 : #define PG_U_ZP_MASK PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR) 67 : #define PG_U_Z_MASK (PG_U_ZS_MASK|PG_U_ZL_MASK|PG_U_ZP_MASK) 68 : #define PG_U_CC_MASK PG_U_CATEGORY_MASK(PG_U_CONTROL) 69 : #define PG_U_CF_MASK PG_U_CATEGORY_MASK(PG_U_FORMAT) 70 : #define PG_U_CS_MASK PG_U_CATEGORY_MASK(PG_U_SURROGATE) 71 : #define PG_U_CO_MASK PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE) 72 : #define PG_U_CN_MASK PG_U_CATEGORY_MASK(PG_U_UNASSIGNED) 73 : #define PG_U_C_MASK (PG_U_CC_MASK|PG_U_CF_MASK|PG_U_CS_MASK|PG_U_CO_MASK|\ 74 : PG_U_CN_MASK) 75 : 76 : #define PG_U_CHARACTER_TAB 0x09 77 : 78 : static bool range_search(const pg_unicode_range *tbl, size_t size, 79 : pg_wchar code); 80 : 81 : /* 82 : * Unicode general category for the given codepoint. 83 : */ 84 : pg_unicode_category 85 3168 : unicode_category(pg_wchar code) 86 : { 87 3168 : int min = 0; 88 : int mid; 89 3168 : int max = lengthof(unicode_categories) - 1; 90 : 91 : Assert(code <= 0x10ffff); 92 : 93 3168 : if (code < 0x80) 94 492 : return unicode_opt_ascii[code].category; 95 : 96 28734 : while (max >= min) 97 : { 98 28386 : mid = (min + max) / 2; 99 28386 : if (code > unicode_categories[mid].last) 100 9792 : min = mid + 1; 101 18594 : else if (code < unicode_categories[mid].first) 102 16266 : max = mid - 1; 103 : else 104 2328 : return unicode_categories[mid].category; 105 : } 106 : 107 348 : return PG_U_UNASSIGNED; 108 : } 109 : 110 : bool 111 33452 : pg_u_prop_alphabetic(pg_wchar code) 112 : { 113 33452 : if (code < 0x80) 114 2480 : return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC; 115 : 116 30972 : return range_search(unicode_alphabetic, 117 : lengthof(unicode_alphabetic), 118 : code); 119 : } 120 : 121 : bool 122 0 : pg_u_prop_lowercase(pg_wchar code) 123 : { 124 0 : if (code < 0x80) 125 0 : return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE; 126 : 127 0 : return range_search(unicode_lowercase, 128 : lengthof(unicode_lowercase), 129 : code); 130 : } 131 : 132 : bool 133 12288 : pg_u_prop_uppercase(pg_wchar code) 134 : { 135 12288 : if (code < 0x80) 136 768 : return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE; 137 : 138 11520 : return range_search(unicode_uppercase, 139 : lengthof(unicode_uppercase), 140 : code); 141 : } 142 : 143 : bool 144 0 : pg_u_prop_cased(pg_wchar code) 145 : { 146 : uint32 category_mask; 147 : 148 0 : if (code < 0x80) 149 0 : return unicode_opt_ascii[code].properties & PG_U_PROP_CASED; 150 : 151 0 : category_mask = PG_U_CATEGORY_MASK(unicode_category(code)); 152 : 153 0 : return category_mask & PG_U_LT_MASK || 154 0 : pg_u_prop_lowercase(code) || 155 0 : pg_u_prop_uppercase(code); 156 : } 157 : 158 : bool 159 0 : pg_u_prop_case_ignorable(pg_wchar code) 160 : { 161 0 : if (code < 0x80) 162 0 : return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE; 163 : 164 0 : return range_search(unicode_case_ignorable, 165 : lengthof(unicode_case_ignorable), 166 : code); 167 : } 168 : 169 : bool 170 16398 : pg_u_prop_white_space(pg_wchar code) 171 : { 172 16398 : if (code < 0x80) 173 1038 : return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE; 174 : 175 15360 : return range_search(unicode_white_space, 176 : lengthof(unicode_white_space), 177 : code); 178 : } 179 : 180 : bool 181 0 : pg_u_prop_hex_digit(pg_wchar code) 182 : { 183 0 : if (code < 0x80) 184 0 : return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT; 185 : 186 0 : return range_search(unicode_hex_digit, 187 : lengthof(unicode_hex_digit), 188 : code); 189 : } 190 : 191 : bool 192 0 : pg_u_prop_join_control(pg_wchar code) 193 : { 194 0 : if (code < 0x80) 195 0 : return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL; 196 : 197 0 : return range_search(unicode_join_control, 198 : lengthof(unicode_join_control), 199 : code); 200 : } 201 : 202 : /* 203 : * The following functions implement the Compatibility Properties described 204 : * at: http://www.unicode.org/reports/tr18/#Compatibility_Properties 205 : * 206 : * If 'posix' is true, implements the "POSIX Compatible" variant, otherwise 207 : * the "Standard" variant. 208 : */ 209 : 210 : bool 211 54590 : pg_u_isdigit(pg_wchar code, bool posix) 212 : { 213 54590 : if (posix) 214 54590 : return ('0' <= code && code <= '9'); 215 : else 216 0 : return unicode_category(code) == PG_U_DECIMAL_NUMBER; 217 : } 218 : 219 : bool 220 33452 : pg_u_isalpha(pg_wchar code) 221 : { 222 33452 : return pg_u_prop_alphabetic(code); 223 : } 224 : 225 : bool 226 21142 : pg_u_isalnum(pg_wchar code, bool posix) 227 : { 228 21142 : return pg_u_isalpha(code) || pg_u_isdigit(code, posix); 229 : } 230 : 231 : bool 232 0 : pg_u_isword(pg_wchar code) 233 : { 234 0 : uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code)); 235 : 236 : return 237 0 : category_mask & (PG_U_M_MASK | PG_U_ND_MASK | PG_U_PC_MASK) || 238 0 : pg_u_isalpha(code) || 239 0 : pg_u_prop_join_control(code); 240 : } 241 : 242 : bool 243 12288 : pg_u_isupper(pg_wchar code) 244 : { 245 12288 : return pg_u_prop_uppercase(code); 246 : } 247 : 248 : bool 249 0 : pg_u_islower(pg_wchar code) 250 : { 251 0 : return pg_u_prop_lowercase(code); 252 : } 253 : 254 : bool 255 0 : pg_u_isblank(pg_wchar code) 256 : { 257 0 : return code == PG_U_CHARACTER_TAB || 258 0 : unicode_category(code) == PG_U_SPACE_SEPARATOR; 259 : } 260 : 261 : bool 262 0 : pg_u_iscntrl(pg_wchar code) 263 : { 264 0 : return unicode_category(code) == PG_U_CONTROL; 265 : } 266 : 267 : bool 268 0 : pg_u_isgraph(pg_wchar code) 269 : { 270 0 : uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code)); 271 : 272 0 : if (category_mask & (PG_U_CC_MASK | PG_U_CS_MASK | PG_U_CN_MASK) || 273 0 : pg_u_isspace(code)) 274 0 : return false; 275 0 : return true; 276 : } 277 : 278 : bool 279 0 : pg_u_isprint(pg_wchar code) 280 : { 281 0 : pg_unicode_category category = unicode_category(code); 282 : 283 0 : if (category == PG_U_CONTROL) 284 0 : return false; 285 : 286 0 : return pg_u_isgraph(code) || pg_u_isblank(code); 287 : } 288 : 289 : bool 290 12288 : pg_u_ispunct(pg_wchar code, bool posix) 291 : { 292 : uint32 category_mask; 293 : 294 12288 : if (posix) 295 : { 296 12288 : if (pg_u_isalpha(code)) 297 9162 : return false; 298 : 299 3126 : category_mask = PG_U_CATEGORY_MASK(unicode_category(code)); 300 3126 : return category_mask & (PG_U_P_MASK | PG_U_S_MASK); 301 : } 302 : else 303 : { 304 0 : category_mask = PG_U_CATEGORY_MASK(unicode_category(code)); 305 : 306 0 : return category_mask & PG_U_P_MASK; 307 : } 308 : } 309 : 310 : bool 311 16398 : pg_u_isspace(pg_wchar code) 312 : { 313 16398 : return pg_u_prop_white_space(code); 314 : } 315 : 316 : bool 317 0 : pg_u_isxdigit(pg_wchar code, bool posix) 318 : { 319 0 : if (posix) 320 0 : return (('0' <= code && code <= '9') || 321 0 : ('A' <= code && code <= 'F') || 322 0 : ('a' <= code && code <= 'f')); 323 : else 324 0 : return unicode_category(code) == PG_U_DECIMAL_NUMBER || 325 0 : pg_u_prop_hex_digit(code); 326 : } 327 : 328 : /* 329 : * Description of Unicode general category. 330 : */ 331 : const char * 332 0 : unicode_category_string(pg_unicode_category category) 333 : { 334 0 : switch (category) 335 : { 336 0 : case PG_U_UNASSIGNED: 337 0 : return "Unassigned"; 338 0 : case PG_U_UPPERCASE_LETTER: 339 0 : return "Uppercase_Letter"; 340 0 : case PG_U_LOWERCASE_LETTER: 341 0 : return "Lowercase_Letter"; 342 0 : case PG_U_TITLECASE_LETTER: 343 0 : return "Titlecase_Letter"; 344 0 : case PG_U_MODIFIER_LETTER: 345 0 : return "Modifier_Letter"; 346 0 : case PG_U_OTHER_LETTER: 347 0 : return "Other_Letter"; 348 0 : case PG_U_NONSPACING_MARK: 349 0 : return "Nonspacing_Mark"; 350 0 : case PG_U_ENCLOSING_MARK: 351 0 : return "Enclosing_Mark"; 352 0 : case PG_U_SPACING_MARK: 353 0 : return "Spacing_Mark"; 354 0 : case PG_U_DECIMAL_NUMBER: 355 0 : return "Decimal_Number"; 356 0 : case PG_U_LETTER_NUMBER: 357 0 : return "Letter_Number"; 358 0 : case PG_U_OTHER_NUMBER: 359 0 : return "Other_Number"; 360 0 : case PG_U_SPACE_SEPARATOR: 361 0 : return "Space_Separator"; 362 0 : case PG_U_LINE_SEPARATOR: 363 0 : return "Line_Separator"; 364 0 : case PG_U_PARAGRAPH_SEPARATOR: 365 0 : return "Paragraph_Separator"; 366 0 : case PG_U_CONTROL: 367 0 : return "Control"; 368 0 : case PG_U_FORMAT: 369 0 : return "Format"; 370 0 : case PG_U_PRIVATE_USE: 371 0 : return "Private_Use"; 372 0 : case PG_U_SURROGATE: 373 0 : return "Surrogate"; 374 0 : case PG_U_DASH_PUNCTUATION: 375 0 : return "Dash_Punctuation"; 376 0 : case PG_U_OPEN_PUNCTUATION: 377 0 : return "Open_Punctuation"; 378 0 : case PG_U_CLOSE_PUNCTUATION: 379 0 : return "Close_Punctuation"; 380 0 : case PG_U_CONNECTOR_PUNCTUATION: 381 0 : return "Connector_Punctuation"; 382 0 : case PG_U_OTHER_PUNCTUATION: 383 0 : return "Other_Punctuation"; 384 0 : case PG_U_MATH_SYMBOL: 385 0 : return "Math_Symbol"; 386 0 : case PG_U_CURRENCY_SYMBOL: 387 0 : return "Currency_Symbol"; 388 0 : case PG_U_MODIFIER_SYMBOL: 389 0 : return "Modifier_Symbol"; 390 0 : case PG_U_OTHER_SYMBOL: 391 0 : return "Other_Symbol"; 392 0 : case PG_U_INITIAL_PUNCTUATION: 393 0 : return "Initial_Punctuation"; 394 0 : case PG_U_FINAL_PUNCTUATION: 395 0 : return "Final_Punctuation"; 396 : } 397 : 398 : Assert(false); 399 0 : return "Unrecognized"; /* keep compiler quiet */ 400 : } 401 : 402 : /* 403 : * Short code for Unicode general category. 404 : */ 405 : const char * 406 0 : unicode_category_abbrev(pg_unicode_category category) 407 : { 408 0 : switch (category) 409 : { 410 0 : case PG_U_UNASSIGNED: 411 0 : return "Cn"; 412 0 : case PG_U_UPPERCASE_LETTER: 413 0 : return "Lu"; 414 0 : case PG_U_LOWERCASE_LETTER: 415 0 : return "Ll"; 416 0 : case PG_U_TITLECASE_LETTER: 417 0 : return "Lt"; 418 0 : case PG_U_MODIFIER_LETTER: 419 0 : return "Lm"; 420 0 : case PG_U_OTHER_LETTER: 421 0 : return "Lo"; 422 0 : case PG_U_NONSPACING_MARK: 423 0 : return "Mn"; 424 0 : case PG_U_ENCLOSING_MARK: 425 0 : return "Me"; 426 0 : case PG_U_SPACING_MARK: 427 0 : return "Mc"; 428 0 : case PG_U_DECIMAL_NUMBER: 429 0 : return "Nd"; 430 0 : case PG_U_LETTER_NUMBER: 431 0 : return "Nl"; 432 0 : case PG_U_OTHER_NUMBER: 433 0 : return "No"; 434 0 : case PG_U_SPACE_SEPARATOR: 435 0 : return "Zs"; 436 0 : case PG_U_LINE_SEPARATOR: 437 0 : return "Zl"; 438 0 : case PG_U_PARAGRAPH_SEPARATOR: 439 0 : return "Zp"; 440 0 : case PG_U_CONTROL: 441 0 : return "Cc"; 442 0 : case PG_U_FORMAT: 443 0 : return "Cf"; 444 0 : case PG_U_PRIVATE_USE: 445 0 : return "Co"; 446 0 : case PG_U_SURROGATE: 447 0 : return "Cs"; 448 0 : case PG_U_DASH_PUNCTUATION: 449 0 : return "Pd"; 450 0 : case PG_U_OPEN_PUNCTUATION: 451 0 : return "Ps"; 452 0 : case PG_U_CLOSE_PUNCTUATION: 453 0 : return "Pe"; 454 0 : case PG_U_CONNECTOR_PUNCTUATION: 455 0 : return "Pc"; 456 0 : case PG_U_OTHER_PUNCTUATION: 457 0 : return "Po"; 458 0 : case PG_U_MATH_SYMBOL: 459 0 : return "Sm"; 460 0 : case PG_U_CURRENCY_SYMBOL: 461 0 : return "Sc"; 462 0 : case PG_U_MODIFIER_SYMBOL: 463 0 : return "Sk"; 464 0 : case PG_U_OTHER_SYMBOL: 465 0 : return "So"; 466 0 : case PG_U_INITIAL_PUNCTUATION: 467 0 : return "Pi"; 468 0 : case PG_U_FINAL_PUNCTUATION: 469 0 : return "Pf"; 470 : } 471 : 472 : Assert(false); 473 0 : return "??"; /* keep compiler quiet */ 474 : } 475 : 476 : /* 477 : * Binary search to test if given codepoint exists in one of the ranges in the 478 : * given table. 479 : */ 480 : static bool 481 57852 : range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code) 482 : { 483 57852 : int min = 0; 484 : int mid; 485 57852 : int max = size - 1; 486 : 487 : Assert(code <= 0x10ffff); 488 : 489 488566 : while (max >= min) 490 : { 491 457234 : mid = (min + max) / 2; 492 457234 : if (code > tbl[mid].last) 493 162048 : min = mid + 1; 494 295186 : else if (code < tbl[mid].first) 495 268666 : max = mid - 1; 496 : else 497 26520 : return true; 498 : } 499 : 500 31332 : return false; 501 : }