Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * unicode_category.c
3 : * Determine general category and character properties of Unicode
4 : * characters. Encoding must be UTF8, where we assume that the char32_t
5 : * representation is a code point.
6 : *
7 : * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
8 : *
9 : * IDENTIFICATION
10 : * src/common/unicode_category.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #ifndef FRONTEND
15 : #include "postgres.h"
16 : #else
17 : #include "postgres_fe.h"
18 : #endif
19 :
20 : #include "common/unicode_category.h"
21 : #include "common/unicode_category_table.h"
22 :
/*
 * Create bitmasks from pg_unicode_category values for efficient comparison of
 * multiple categories. For instance, PG_U_MN_MASK is a bitmask representing
 * the general category Mn; and PG_U_M_MASK represents general categories Mn,
 * Me, and Mc.
 *
 * The number of Unicode General Categories should never grow, so a 32-bit
 * mask is fine.
 *
 * The constant 1 is converted to uint32 *before* shifting, so that every bit
 * position 0..31 is well defined.  Shifting a plain (signed) int 1 into the
 * sign bit would be undefined behavior.
 */
#define PG_U_CATEGORY_MASK(X) (((uint32) 1) << (X))
33 :
/* Letters: Lu, Ll, Lt (together "LC", cased letters), Lm, Lo. */
#define PG_U_LU_MASK	PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
#define PG_U_LL_MASK	PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
#define PG_U_LT_MASK	PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
#define PG_U_LM_MASK	PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
#define PG_U_LO_MASK	PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
#define PG_U_LC_MASK	(PG_U_LU_MASK | PG_U_LL_MASK | PG_U_LT_MASK)
#define PG_U_L_MASK		(PG_U_LC_MASK | PG_U_LM_MASK | PG_U_LO_MASK)

/* Marks: Mn, Me, Mc. */
#define PG_U_MN_MASK	PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
#define PG_U_ME_MASK	PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
#define PG_U_MC_MASK	PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
#define PG_U_M_MASK		(PG_U_MN_MASK | PG_U_ME_MASK | PG_U_MC_MASK)

/* Numbers: Nd, Nl, No. */
#define PG_U_ND_MASK	PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
#define PG_U_NL_MASK	PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
#define PG_U_NO_MASK	PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
#define PG_U_N_MASK		(PG_U_ND_MASK | PG_U_NL_MASK | PG_U_NO_MASK)

/* Punctuation: Pc, Pd, Ps, Pe, Pi, Pf, Po. */
#define PG_U_PC_MASK	PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
#define PG_U_PD_MASK	PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
#define PG_U_PS_MASK	PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
#define PG_U_PE_MASK	PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
#define PG_U_PI_MASK	PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
#define PG_U_PF_MASK	PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
#define PG_U_PO_MASK	PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
#define PG_U_P_MASK		(PG_U_PC_MASK | PG_U_PD_MASK | PG_U_PS_MASK | \
						 PG_U_PE_MASK | PG_U_PI_MASK | PG_U_PF_MASK | \
						 PG_U_PO_MASK)

/* Symbols: Sm, Sc, Sk, So. */
#define PG_U_SM_MASK	PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
#define PG_U_SC_MASK	PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
#define PG_U_SK_MASK	PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
#define PG_U_SO_MASK	PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
#define PG_U_S_MASK		(PG_U_SM_MASK | PG_U_SC_MASK | PG_U_SK_MASK | \
						 PG_U_SO_MASK)

/* Separators: Zs, Zl, Zp. */
#define PG_U_ZS_MASK	PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
#define PG_U_ZL_MASK	PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
#define PG_U_ZP_MASK	PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
#define PG_U_Z_MASK		(PG_U_ZS_MASK | PG_U_ZL_MASK | PG_U_ZP_MASK)

/* Other: Cc, Cf, Cs, Co, Cn. */
#define PG_U_CC_MASK	PG_U_CATEGORY_MASK(PG_U_CONTROL)
#define PG_U_CF_MASK	PG_U_CATEGORY_MASK(PG_U_FORMAT)
#define PG_U_CS_MASK	PG_U_CATEGORY_MASK(PG_U_SURROGATE)
#define PG_U_CO_MASK	PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
#define PG_U_CN_MASK	PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
#define PG_U_C_MASK		(PG_U_CC_MASK | PG_U_CF_MASK | PG_U_CS_MASK | \
						 PG_U_CO_MASK | PG_U_CN_MASK)

/* Code point U+0009, CHARACTER TABULATION. */
#define PG_U_CHARACTER_TAB	0x09
77 :
78 : static bool range_search(const pg_unicode_range *tbl, size_t size,
79 : char32_t code);
80 :
81 : /*
82 : * Unicode general category for the given codepoint.
83 : */
84 : pg_unicode_category
85 15474 : unicode_category(char32_t code)
86 : {
87 15474 : int min = 0;
88 : int mid;
89 15474 : int max = lengthof(unicode_categories) - 1;
90 :
91 : Assert(code <= 0x10ffff);
92 :
93 15474 : if (code < 0x80)
94 1320 : return unicode_opt_ascii[code].category;
95 :
96 152523 : while (max >= min)
97 : {
98 151836 : mid = (min + max) / 2;
99 151836 : if (code > unicode_categories[mid].last)
100 61971 : min = mid + 1;
101 89865 : else if (code < unicode_categories[mid].first)
102 76398 : max = mid - 1;
103 : else
104 13467 : return unicode_categories[mid].category;
105 : }
106 :
107 687 : return PG_U_UNASSIGNED;
108 : }
109 :
110 : bool
111 47359 : pg_u_prop_alphabetic(char32_t code)
112 : {
113 47359 : if (code < 0x80)
114 22111 : return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
115 :
116 25248 : return range_search(unicode_alphabetic,
117 : lengthof(unicode_alphabetic),
118 : code);
119 : }
120 :
121 : bool
122 30 : pg_u_prop_lowercase(char32_t code)
123 : {
124 30 : if (code < 0x80)
125 0 : return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
126 :
127 30 : return range_search(unicode_lowercase,
128 : lengthof(unicode_lowercase),
129 : code);
130 : }
131 :
132 : bool
133 12318 : pg_u_prop_uppercase(char32_t code)
134 : {
135 12318 : if (code < 0x80)
136 768 : return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
137 :
138 11550 : return range_search(unicode_uppercase,
139 : lengthof(unicode_uppercase),
140 : code);
141 : }
142 :
143 : bool
144 39 : pg_u_prop_cased(char32_t code)
145 : {
146 : uint32 category_mask;
147 :
148 39 : if (code < 0x80)
149 9 : return unicode_opt_ascii[code].properties & PG_U_PROP_CASED;
150 :
151 30 : category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
152 :
153 30 : return category_mask & PG_U_LT_MASK ||
154 60 : pg_u_prop_lowercase(code) ||
155 30 : pg_u_prop_uppercase(code);
156 : }
157 :
158 : bool
159 63 : pg_u_prop_case_ignorable(char32_t code)
160 : {
161 63 : if (code < 0x80)
162 9 : return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
163 :
164 54 : return range_search(unicode_case_ignorable,
165 : lengthof(unicode_case_ignorable),
166 : code);
167 : }
168 :
169 : bool
170 8312 : pg_u_prop_white_space(char32_t code)
171 : {
172 8312 : if (code < 0x80)
173 632 : return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
174 :
175 7680 : return range_search(unicode_white_space,
176 : lengthof(unicode_white_space),
177 : code);
178 : }
179 :
180 : bool
181 0 : pg_u_prop_hex_digit(char32_t code)
182 : {
183 0 : if (code < 0x80)
184 0 : return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
185 :
186 0 : return range_search(unicode_hex_digit,
187 : lengthof(unicode_hex_digit),
188 : code);
189 : }
190 :
191 : bool
192 0 : pg_u_prop_join_control(char32_t code)
193 : {
194 0 : if (code < 0x80)
195 0 : return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
196 :
197 0 : return range_search(unicode_join_control,
198 : lengthof(unicode_join_control),
199 : code);
200 : }
201 :
202 : /*
203 : * The following functions implement the Compatibility Properties described
204 : * at: http://www.unicode.org/reports/tr18/#Compatibility_Properties
205 : *
206 : * If 'posix' is true, implements the "POSIX Compatible" variant, otherwise
207 : * the "Standard" variant.
208 : */
209 :
210 : bool
211 44440 : pg_u_isdigit(char32_t code, bool posix)
212 : {
213 44440 : if (posix)
214 36685 : return ('0' <= code && code <= '9');
215 : else
216 7755 : return unicode_category(code) == PG_U_DECIMAL_NUMBER;
217 : }
218 :
/*
 * alpha: code points with the Alphabetic property.
 */
bool
pg_u_isalpha(char32_t code)
{
	bool		alphabetic = pg_u_prop_alphabetic(code);

	return alphabetic;
}
224 :
/*
 * alnum: anything accepted by pg_u_isalpha() or by pg_u_isdigit() (the
 * latter honoring 'posix').  The digit test is skipped for alphabetics.
 */
bool
pg_u_isalnum(char32_t code, bool posix)
{
	if (pg_u_isalpha(code))
		return true;

	return pg_u_isdigit(code, posix);
}
230 :
231 : bool
232 0 : pg_u_isword(char32_t code)
233 : {
234 0 : uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
235 :
236 : return
237 0 : category_mask & (PG_U_M_MASK | PG_U_ND_MASK | PG_U_PC_MASK) ||
238 0 : pg_u_isalpha(code) ||
239 0 : pg_u_prop_join_control(code);
240 : }
241 :
/*
 * upper: code points with the Uppercase property.
 */
bool
pg_u_isupper(char32_t code)
{
	bool		uppercase = pg_u_prop_uppercase(code);

	return uppercase;
}
247 :
/*
 * lower: code points with the Lowercase property.
 */
bool
pg_u_islower(char32_t code)
{
	bool		lowercase = pg_u_prop_lowercase(code);

	return lowercase;
}
253 :
254 : bool
255 0 : pg_u_isblank(char32_t code)
256 : {
257 0 : return code == PG_U_CHARACTER_TAB ||
258 0 : unicode_category(code) == PG_U_SPACE_SEPARATOR;
259 : }
260 :
261 : bool
262 0 : pg_u_iscntrl(char32_t code)
263 : {
264 0 : return unicode_category(code) == PG_U_CONTROL;
265 : }
266 :
267 : bool
268 0 : pg_u_isgraph(char32_t code)
269 : {
270 0 : uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
271 :
272 0 : if (category_mask & (PG_U_CC_MASK | PG_U_CS_MASK | PG_U_CN_MASK) ||
273 0 : pg_u_isspace(code))
274 0 : return false;
275 0 : return true;
276 : }
277 :
278 : bool
279 0 : pg_u_isprint(char32_t code)
280 : {
281 0 : pg_unicode_category category = unicode_category(code);
282 :
283 0 : if (category == PG_U_CONTROL)
284 0 : return false;
285 :
286 0 : return pg_u_isgraph(code) || pg_u_isblank(code);
287 : }
288 :
289 : bool
290 12288 : pg_u_ispunct(char32_t code, bool posix)
291 : {
292 : uint32 category_mask;
293 :
294 12288 : if (posix)
295 : {
296 6144 : if (pg_u_isalpha(code))
297 4620 : return false;
298 :
299 1524 : category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
300 1524 : return category_mask & (PG_U_P_MASK | PG_U_S_MASK);
301 : }
302 : else
303 : {
304 6144 : category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
305 :
306 6144 : return category_mask & PG_U_P_MASK;
307 : }
308 : }
309 :
/*
 * space: code points with the White_Space property.
 */
bool
pg_u_isspace(char32_t code)
{
	bool		white_space = pg_u_prop_white_space(code);

	return white_space;
}
315 :
316 : bool
317 3 : pg_u_isxdigit(char32_t code, bool posix)
318 : {
319 3 : if (posix)
320 3 : return (('0' <= code && code <= '9') ||
321 7 : ('A' <= code && code <= 'F') ||
322 1 : ('a' <= code && code <= 'f'));
323 : else
324 0 : return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
325 0 : pg_u_prop_hex_digit(code);
326 : }
327 :
328 : /*
329 : * Description of Unicode general category.
330 : */
331 : const char *
332 0 : unicode_category_string(pg_unicode_category category)
333 : {
334 0 : switch (category)
335 : {
336 0 : case PG_U_UNASSIGNED:
337 0 : return "Unassigned";
338 0 : case PG_U_UPPERCASE_LETTER:
339 0 : return "Uppercase_Letter";
340 0 : case PG_U_LOWERCASE_LETTER:
341 0 : return "Lowercase_Letter";
342 0 : case PG_U_TITLECASE_LETTER:
343 0 : return "Titlecase_Letter";
344 0 : case PG_U_MODIFIER_LETTER:
345 0 : return "Modifier_Letter";
346 0 : case PG_U_OTHER_LETTER:
347 0 : return "Other_Letter";
348 0 : case PG_U_NONSPACING_MARK:
349 0 : return "Nonspacing_Mark";
350 0 : case PG_U_ENCLOSING_MARK:
351 0 : return "Enclosing_Mark";
352 0 : case PG_U_SPACING_MARK:
353 0 : return "Spacing_Mark";
354 0 : case PG_U_DECIMAL_NUMBER:
355 0 : return "Decimal_Number";
356 0 : case PG_U_LETTER_NUMBER:
357 0 : return "Letter_Number";
358 0 : case PG_U_OTHER_NUMBER:
359 0 : return "Other_Number";
360 0 : case PG_U_SPACE_SEPARATOR:
361 0 : return "Space_Separator";
362 0 : case PG_U_LINE_SEPARATOR:
363 0 : return "Line_Separator";
364 0 : case PG_U_PARAGRAPH_SEPARATOR:
365 0 : return "Paragraph_Separator";
366 0 : case PG_U_CONTROL:
367 0 : return "Control";
368 0 : case PG_U_FORMAT:
369 0 : return "Format";
370 0 : case PG_U_PRIVATE_USE:
371 0 : return "Private_Use";
372 0 : case PG_U_SURROGATE:
373 0 : return "Surrogate";
374 0 : case PG_U_DASH_PUNCTUATION:
375 0 : return "Dash_Punctuation";
376 0 : case PG_U_OPEN_PUNCTUATION:
377 0 : return "Open_Punctuation";
378 0 : case PG_U_CLOSE_PUNCTUATION:
379 0 : return "Close_Punctuation";
380 0 : case PG_U_CONNECTOR_PUNCTUATION:
381 0 : return "Connector_Punctuation";
382 0 : case PG_U_OTHER_PUNCTUATION:
383 0 : return "Other_Punctuation";
384 0 : case PG_U_MATH_SYMBOL:
385 0 : return "Math_Symbol";
386 0 : case PG_U_CURRENCY_SYMBOL:
387 0 : return "Currency_Symbol";
388 0 : case PG_U_MODIFIER_SYMBOL:
389 0 : return "Modifier_Symbol";
390 0 : case PG_U_OTHER_SYMBOL:
391 0 : return "Other_Symbol";
392 0 : case PG_U_INITIAL_PUNCTUATION:
393 0 : return "Initial_Punctuation";
394 0 : case PG_U_FINAL_PUNCTUATION:
395 0 : return "Final_Punctuation";
396 : }
397 :
398 : Assert(false);
399 0 : return "Unrecognized"; /* keep compiler quiet */
400 : }
401 :
402 : /*
403 : * Short code for Unicode general category.
404 : */
405 : const char *
406 0 : unicode_category_abbrev(pg_unicode_category category)
407 : {
408 0 : switch (category)
409 : {
410 0 : case PG_U_UNASSIGNED:
411 0 : return "Cn";
412 0 : case PG_U_UPPERCASE_LETTER:
413 0 : return "Lu";
414 0 : case PG_U_LOWERCASE_LETTER:
415 0 : return "Ll";
416 0 : case PG_U_TITLECASE_LETTER:
417 0 : return "Lt";
418 0 : case PG_U_MODIFIER_LETTER:
419 0 : return "Lm";
420 0 : case PG_U_OTHER_LETTER:
421 0 : return "Lo";
422 0 : case PG_U_NONSPACING_MARK:
423 0 : return "Mn";
424 0 : case PG_U_ENCLOSING_MARK:
425 0 : return "Me";
426 0 : case PG_U_SPACING_MARK:
427 0 : return "Mc";
428 0 : case PG_U_DECIMAL_NUMBER:
429 0 : return "Nd";
430 0 : case PG_U_LETTER_NUMBER:
431 0 : return "Nl";
432 0 : case PG_U_OTHER_NUMBER:
433 0 : return "No";
434 0 : case PG_U_SPACE_SEPARATOR:
435 0 : return "Zs";
436 0 : case PG_U_LINE_SEPARATOR:
437 0 : return "Zl";
438 0 : case PG_U_PARAGRAPH_SEPARATOR:
439 0 : return "Zp";
440 0 : case PG_U_CONTROL:
441 0 : return "Cc";
442 0 : case PG_U_FORMAT:
443 0 : return "Cf";
444 0 : case PG_U_PRIVATE_USE:
445 0 : return "Co";
446 0 : case PG_U_SURROGATE:
447 0 : return "Cs";
448 0 : case PG_U_DASH_PUNCTUATION:
449 0 : return "Pd";
450 0 : case PG_U_OPEN_PUNCTUATION:
451 0 : return "Ps";
452 0 : case PG_U_CLOSE_PUNCTUATION:
453 0 : return "Pe";
454 0 : case PG_U_CONNECTOR_PUNCTUATION:
455 0 : return "Pc";
456 0 : case PG_U_OTHER_PUNCTUATION:
457 0 : return "Po";
458 0 : case PG_U_MATH_SYMBOL:
459 0 : return "Sm";
460 0 : case PG_U_CURRENCY_SYMBOL:
461 0 : return "Sc";
462 0 : case PG_U_MODIFIER_SYMBOL:
463 0 : return "Sk";
464 0 : case PG_U_OTHER_SYMBOL:
465 0 : return "So";
466 0 : case PG_U_INITIAL_PUNCTUATION:
467 0 : return "Pi";
468 0 : case PG_U_FINAL_PUNCTUATION:
469 0 : return "Pf";
470 : }
471 :
472 : Assert(false);
473 0 : return "??"; /* keep compiler quiet */
474 : }
475 :
476 : /*
477 : * Binary search to test if given codepoint exists in one of the ranges in the
478 : * given table.
479 : */
480 : static bool
481 44562 : range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
482 : {
483 44562 : int min = 0;
484 : int mid;
485 44562 : int max = size - 1;
486 :
487 : Assert(code <= 0x10ffff);
488 :
489 411061 : while (max >= min)
490 : {
491 388833 : mid = (min + max) / 2;
492 388833 : if (code > tbl[mid].last)
493 139271 : min = mid + 1;
494 249562 : else if (code < tbl[mid].first)
495 227228 : max = mid - 1;
496 : else
497 22334 : return true;
498 : }
499 :
500 22228 : return false;
501 : }
|