Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * unicode_norm.c
3 : * Normalize a Unicode string
4 : *
5 : * This implements Unicode normalization, per the documentation at
6 : * https://www.unicode.org/reports/tr15/.
7 : *
8 : * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
9 : *
10 : * IDENTIFICATION
11 : * src/common/unicode_norm.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #ifndef FRONTEND
16 : #include "postgres.h"
17 : #else
18 : #include "postgres_fe.h"
19 : #endif
20 :
21 : #include "common/unicode_norm.h"
22 : #ifndef FRONTEND
23 : #include "common/unicode_norm_hashfunc.h"
24 : #include "common/unicode_normprops_table.h"
25 : #include "port/pg_bswap.h"
26 : #include "utils/memutils.h"
27 : #else
28 : #include "common/unicode_norm_table.h"
29 : #endif
30 :
31 : #ifndef FRONTEND
32 : #define ALLOC(size) palloc(size)
33 : #define FREE(size) pfree(size)
34 : #else
35 : #define ALLOC(size) malloc(size)
36 : #define FREE(size) free(size)
37 : #endif
38 :
39 : /* Constants for calculations with Hangul characters */
40 : #define SBASE 0xAC00 /* U+AC00 */
41 : #define LBASE 0x1100 /* U+1100 */
42 : #define VBASE 0x1161 /* U+1161 */
43 : #define TBASE 0x11A7 /* U+11A7 */
44 : #define LCOUNT 19
45 : #define VCOUNT 21
46 : #define TCOUNT 28
47 : #define NCOUNT VCOUNT * TCOUNT
48 : #define SCOUNT LCOUNT * NCOUNT
49 :
50 : #ifdef FRONTEND
51 : /* comparison routine for bsearch() of decomposition lookup table. */
52 : static int
53 14699 : conv_compare(const void *p1, const void *p2)
54 : {
55 : uint32 v1,
56 : v2;
57 :
58 14699 : v1 = *(const uint32 *) p1;
59 14699 : v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
60 14699 : return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
61 : }
62 :
63 : #endif
64 :
65 : /*
66 : * get_code_entry
67 : *
68 : * Get the entry corresponding to code in the decomposition lookup table.
69 : * The backend version of this code uses a perfect hash function for the
70 : * lookup, while the frontend version uses a binary search.
71 : */
72 : static const pg_unicode_decomposition *
73 14871 : get_code_entry(char32_t code)
74 : {
75 : #ifndef FRONTEND
76 : int h;
77 : uint32 hashkey;
78 13740 : pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
79 :
80 : /*
81 : * Compute the hash function. The hash key is the codepoint with the bytes
82 : * in network order.
83 : */
84 13740 : hashkey = pg_hton32(code);
85 13740 : h = decompinfo.hash(&hashkey);
86 :
87 : /* An out-of-range result implies no match */
88 13740 : if (h < 0 || h >= decompinfo.num_decomps)
89 3505 : return NULL;
90 :
91 : /*
92 : * Since it's a perfect hash, we need only match to the specific codepoint
93 : * it identifies.
94 : */
95 10235 : if (code != decompinfo.decomps[h].codepoint)
96 9529 : return NULL;
97 :
98 : /* Success! */
99 706 : return &decompinfo.decomps[h];
100 : #else
101 1131 : return bsearch(&(code),
102 : UnicodeDecompMain,
103 : lengthof(UnicodeDecompMain),
104 : sizeof(pg_unicode_decomposition),
105 : conv_compare);
106 : #endif
107 : }
108 :
109 : /*
110 : * Get the combining class of the given codepoint.
111 : */
112 : static uint8
113 8403 : get_canonical_class(char32_t code)
114 : {
115 8403 : const pg_unicode_decomposition *entry = get_code_entry(code);
116 :
117 : /*
118 : * If no entries are found, the character used is either a Hangul
119 : * character or a character with a class of 0 and no decompositions.
120 : */
121 8403 : if (!entry)
122 8063 : return 0;
123 : else
124 340 : return entry->comb_class;
125 : }
126 :
127 : /*
128 : * Given a decomposition entry looked up earlier, get the decomposed
129 : * characters.
130 : *
131 : * Note: the returned pointer can point to statically allocated buffer, and
132 : * is only valid until next call to this function!
133 : */
134 : static const char32_t *
135 138 : get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
136 : {
137 : static char32_t x;
138 :
139 138 : if (DECOMPOSITION_IS_INLINE(entry))
140 : {
141 : Assert(DECOMPOSITION_SIZE(entry) == 1);
142 42 : x = (char32_t) entry->dec_index;
143 42 : *dec_size = 1;
144 42 : return &x;
145 : }
146 : else
147 : {
148 96 : *dec_size = DECOMPOSITION_SIZE(entry);
149 96 : return &UnicodeDecomp_codepoints[entry->dec_index];
150 : }
151 : }
152 :
153 : /*
154 : * Calculate how many characters a given character will decompose to.
155 : *
156 : * This needs to recurse, if the character decomposes into characters that
157 : * are, in turn, decomposable.
158 : */
159 : static int
160 3269 : get_decomposed_size(char32_t code, bool compat)
161 : {
162 : const pg_unicode_decomposition *entry;
163 3269 : int size = 0;
164 : int i;
165 : const uint32 *decomp;
166 : int dec_size;
167 :
168 : /*
169 : * Fast path for Hangul characters not stored in tables to save memory as
170 : * decomposition is algorithmic. See
171 : * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
172 : * on the matter.
173 : */
174 3269 : if (code >= SBASE && code < SBASE + SCOUNT)
175 : {
176 : uint32 tindex,
177 : sindex;
178 :
179 35 : sindex = code - SBASE;
180 35 : tindex = sindex % TCOUNT;
181 :
182 35 : if (tindex != 0)
183 10 : return 3;
184 25 : return 2;
185 : }
186 :
187 3234 : entry = get_code_entry(code);
188 :
189 : /*
190 : * Just count current code if no other decompositions. A NULL entry is
191 : * equivalent to a character with class 0 and no decompositions.
192 : */
193 3234 : if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
194 96 : (!compat && DECOMPOSITION_IS_COMPAT(entry)))
195 3165 : return 1;
196 :
197 : /*
198 : * If this entry has other decomposition codes look at them as well. First
199 : * get its decomposition in the list of tables available.
200 : */
201 69 : decomp = get_code_decomposition(entry, &dec_size);
202 186 : for (i = 0; i < dec_size; i++)
203 : {
204 117 : uint32 lcode = decomp[i];
205 :
206 117 : size += get_decomposed_size(lcode, compat);
207 : }
208 :
209 69 : return size;
210 : }
211 :
212 : /*
213 : * Recompose a set of characters. For hangul characters, the calculation
214 : * is algorithmic. For others, an inverse lookup at the decomposition
215 : * table is necessary. Returns true if a recomposition can be done, and
216 : * false otherwise.
217 : */
218 : static bool
219 2613 : recompose_code(uint32 start, uint32 code, uint32 *result)
220 : {
221 : /*
222 : * Handle Hangul characters algorithmically, per the Unicode spec.
223 : *
224 : * Check if two current characters are L and V.
225 : */
226 2613 : if (start >= LBASE && start < LBASE + LCOUNT &&
227 45 : code >= VBASE && code < VBASE + VCOUNT)
228 : {
229 : /* make syllable of form LV */
230 45 : uint32 lindex = start - LBASE;
231 45 : uint32 vindex = code - VBASE;
232 :
233 45 : *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
234 45 : return true;
235 : }
236 : /* Check if two current characters are LV and T */
237 2568 : else if (start >= SBASE && start < (SBASE + SCOUNT) &&
238 35 : ((start - SBASE) % TCOUNT) == 0 &&
239 25 : code > TBASE && code < (TBASE + TCOUNT))
240 : {
241 : /* make syllable of form LVT */
242 25 : uint32 tindex = code - TBASE;
243 :
244 25 : *result = start + tindex;
245 25 : return true;
246 : }
247 : else
248 : {
249 : const pg_unicode_decomposition *entry;
250 :
251 : /*
252 : * Do an inverse lookup of the decomposition tables to see if anything
253 : * matches. The comparison just needs to be a perfect match on the
254 : * sub-table of size two, because the start character has already been
255 : * recomposed partially. This lookup uses a perfect hash function for
256 : * the backend code.
257 : */
258 : #ifndef FRONTEND
259 :
260 : int h,
261 : inv_lookup_index;
262 : uint64 hashkey;
263 2344 : pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
264 :
265 : /*
266 : * Compute the hash function. The hash key is formed by concatenating
267 : * bytes of the two codepoints in network order. See also
268 : * src/common/unicode/generate-unicode_norm_table.pl.
269 : */
270 2344 : hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
271 2344 : h = recompinfo.hash(&hashkey);
272 :
273 : /* An out-of-range result implies no match */
274 2344 : if (h < 0 || h >= recompinfo.num_recomps)
275 1937 : return false;
276 :
277 439 : inv_lookup_index = recompinfo.inverse_lookup[h];
278 439 : entry = &UnicodeDecompMain[inv_lookup_index];
279 :
280 439 : if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
281 36 : code == UnicodeDecomp_codepoints[entry->dec_index + 1])
282 : {
283 32 : *result = entry->codepoint;
284 32 : return true;
285 : }
286 :
287 : #else
288 :
289 : int i;
290 :
291 1368921 : for (i = 0; i < lengthof(UnicodeDecompMain); i++)
292 : {
293 1368722 : entry = &UnicodeDecompMain[i];
294 :
295 1368722 : if (DECOMPOSITION_SIZE(entry) != 2)
296 1031616 : continue;
297 :
298 337106 : if (DECOMPOSITION_NO_COMPOSE(entry))
299 145867 : continue;
300 :
301 191239 : if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
302 1718 : code == UnicodeDecomp_codepoints[entry->dec_index + 1])
303 : {
304 0 : *result = entry->codepoint;
305 0 : return true;
306 : }
307 : }
308 : #endif /* !FRONTEND */
309 : }
310 :
311 606 : return false;
312 : }
313 :
314 : /*
315 : * Decompose the given code into the array given by caller. The
316 : * decomposition begins at the position given by caller, saving one
317 : * lookup on the decomposition table. The current position needs to be
318 : * updated here to let the caller know from where to continue filling
319 : * in the array result.
320 : */
321 : static void
322 3269 : decompose_code(char32_t code, bool compat, char32_t **result, int *current)
323 : {
324 : const pg_unicode_decomposition *entry;
325 : int i;
326 : const uint32 *decomp;
327 : int dec_size;
328 :
329 : /*
330 : * Fast path for Hangul characters not stored in tables to save memory as
331 : * decomposition is algorithmic. See
332 : * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
333 : * on the matter.
334 : */
335 3269 : if (code >= SBASE && code < SBASE + SCOUNT)
336 : {
337 : uint32 l,
338 : v,
339 : tindex,
340 : sindex;
341 35 : char32_t *res = *result;
342 :
343 35 : sindex = code - SBASE;
344 35 : l = LBASE + sindex / (VCOUNT * TCOUNT);
345 35 : v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
346 35 : tindex = sindex % TCOUNT;
347 :
348 35 : res[*current] = l;
349 35 : (*current)++;
350 35 : res[*current] = v;
351 35 : (*current)++;
352 :
353 35 : if (tindex != 0)
354 : {
355 10 : res[*current] = TBASE + tindex;
356 10 : (*current)++;
357 : }
358 :
359 3200 : return;
360 : }
361 :
362 3234 : entry = get_code_entry(code);
363 :
364 : /*
365 : * Just fill in with the current decomposition if there are no
366 : * decomposition codes to recurse to. A NULL entry is equivalent to a
367 : * character with class 0 and no decompositions, so just leave also in
368 : * this case.
369 : */
370 3234 : if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
371 96 : (!compat && DECOMPOSITION_IS_COMPAT(entry)))
372 : {
373 3165 : char32_t *res = *result;
374 :
375 3165 : res[*current] = code;
376 3165 : (*current)++;
377 3165 : return;
378 : }
379 :
380 : /*
381 : * If this entry has other decomposition codes look at them as well.
382 : */
383 69 : decomp = get_code_decomposition(entry, &dec_size);
384 186 : for (i = 0; i < dec_size; i++)
385 : {
386 117 : char32_t lcode = (char32_t) decomp[i];
387 :
388 : /* Leave if no more decompositions */
389 117 : decompose_code(lcode, compat, result, current);
390 : }
391 : }
392 :
393 : /*
394 : * unicode_normalize - Normalize a Unicode string to the specified form.
395 : *
396 : * The input is a 0-terminated array of codepoints.
397 : *
398 : * In frontend, returns a 0-terminated array of codepoints, allocated with
399 : * malloc. Or NULL if we run out of memory. In backend, the returned
400 : * string is palloc'd instead, and OOM is reported with ereport().
401 : */
402 : char32_t *
403 430 : unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
404 : {
405 430 : bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
406 430 : bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
407 : char32_t *decomp_chars;
408 : char32_t *recomp_chars;
409 : int decomp_size,
410 : current_size;
411 : int count;
412 : const char32_t *p;
413 :
414 : /* variables for recomposition */
415 : int last_class;
416 : int starter_pos;
417 : int target_pos;
418 : uint32 starter_ch;
419 :
420 : /* First, do character decomposition */
421 :
422 : /*
423 : * Calculate how many characters long the decomposed version will be.
424 : *
425 : * Some characters decompose to quite a few code points, so that the
426 : * decomposed version's size could overrun MaxAllocSize, and even 32-bit
427 : * size_t, even though the input string presumably fits in that. In
428 : * frontend we want to just return NULL in that case, so monitor the sum
429 : * and exit early once we'd need more than MaxAllocSize bytes.
430 : */
431 430 : decomp_size = 0;
432 3582 : for (p = input; *p; p++)
433 : {
434 3152 : decomp_size += get_decomposed_size(*p, compat);
435 3152 : if (unlikely(decomp_size > MaxAllocSize / sizeof(char32_t)))
436 : {
437 : #ifndef FRONTEND
438 : /* Exit loop and let palloc() throw error below */
439 0 : break;
440 : #else
441 : /* Just return NULL with no explicit error */
442 0 : return NULL;
443 : #endif
444 : }
445 : }
446 :
447 430 : decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
448 430 : if (decomp_chars == NULL)
449 0 : return NULL;
450 :
451 : /*
452 : * Now fill in each entry recursively. This needs a second pass on the
453 : * decomposition table.
454 : */
455 430 : current_size = 0;
456 3582 : for (p = input; *p; p++)
457 3152 : decompose_code(*p, compat, &decomp_chars, ¤t_size);
458 430 : decomp_chars[decomp_size] = '\0';
459 : Assert(decomp_size == current_size);
460 :
461 : /* Leave if there is nothing to decompose */
462 430 : if (decomp_size == 0)
463 13 : return decomp_chars;
464 :
465 : /*
466 : * Now apply canonical ordering.
467 : */
468 3245 : for (count = 1; count < decomp_size; count++)
469 : {
470 2828 : char32_t prev = decomp_chars[count - 1];
471 2828 : char32_t next = decomp_chars[count];
472 : char32_t tmp;
473 2828 : const uint8 prevClass = get_canonical_class(prev);
474 2828 : const uint8 nextClass = get_canonical_class(next);
475 :
476 : /*
477 : * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
478 : * annex 4, a sequence of two adjacent characters in a string is an
479 : * exchangeable pair if the combining class (from the Unicode
480 : * Character Database) for the first character is greater than the
481 : * combining class for the second, and the second is not a starter. A
482 : * character is a starter if its combining class is 0.
483 : */
484 2828 : if (prevClass == 0 || nextClass == 0)
485 2828 : continue;
486 :
487 0 : if (prevClass <= nextClass)
488 0 : continue;
489 :
490 : /* exchange can happen */
491 0 : tmp = decomp_chars[count - 1];
492 0 : decomp_chars[count - 1] = decomp_chars[count];
493 0 : decomp_chars[count] = tmp;
494 :
495 : /* backtrack to check again */
496 0 : if (count > 1)
497 0 : count -= 2;
498 : }
499 :
500 417 : if (!recompose)
501 73 : return decomp_chars;
502 :
503 : /*
504 : * The last phase of NFC and NFKC is the recomposition of the reordered
505 : * Unicode string using combining classes. The recomposed string cannot be
506 : * longer than the decomposed one, so make the allocation of the output
507 : * string based on that assumption.
508 : */
509 344 : recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
510 344 : if (!recomp_chars)
511 : {
512 0 : FREE(decomp_chars);
513 0 : return NULL;
514 : }
515 :
516 344 : last_class = -1; /* this eliminates a special check */
517 344 : starter_pos = 0;
518 344 : target_pos = 1;
519 344 : starter_ch = recomp_chars[0] = decomp_chars[0];
520 :
521 2957 : for (count = 1; count < decomp_size; count++)
522 : {
523 2613 : char32_t ch = decomp_chars[count];
524 2613 : int ch_class = get_canonical_class(ch);
525 : char32_t composite;
526 :
527 5226 : if (last_class < ch_class &&
528 2613 : recompose_code(starter_ch, ch, &composite))
529 : {
530 102 : recomp_chars[starter_pos] = composite;
531 102 : starter_ch = composite;
532 : }
533 2511 : else if (ch_class == 0)
534 : {
535 2511 : starter_pos = target_pos;
536 2511 : starter_ch = ch;
537 2511 : last_class = -1;
538 2511 : recomp_chars[target_pos++] = ch;
539 : }
540 : else
541 : {
542 0 : last_class = ch_class;
543 0 : recomp_chars[target_pos++] = ch;
544 : }
545 : }
546 344 : recomp_chars[target_pos] = (char32_t) '\0';
547 :
548 344 : FREE(decomp_chars);
549 :
550 344 : return recomp_chars;
551 : }
552 :
553 : /*
554 : * Normalization "quick check" algorithm; see
555 : * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
556 : */
557 :
558 : /* We only need this in the backend. */
559 : #ifndef FRONTEND
560 :
561 : static const pg_unicode_normprops *
562 134 : qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
563 : {
564 : int h;
565 : uint32 hashkey;
566 :
567 : /*
568 : * Compute the hash function. The hash key is the codepoint with the bytes
569 : * in network order.
570 : */
571 134 : hashkey = pg_hton32(ch);
572 134 : h = norminfo->hash(&hashkey);
573 :
574 : /* An out-of-range result implies no match */
575 134 : if (h < 0 || h >= norminfo->num_normprops)
576 92 : return NULL;
577 :
578 : /*
579 : * Since it's a perfect hash, we need only match to the specific codepoint
580 : * it identifies.
581 : */
582 42 : if (ch != norminfo->normprops[h].codepoint)
583 18 : return NULL;
584 :
585 : /* Success! */
586 24 : return &norminfo->normprops[h];
587 : }
588 :
589 : /*
590 : * Look up the normalization quick check character property
591 : */
592 : static UnicodeNormalizationQC
593 134 : qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
594 : {
595 134 : const pg_unicode_normprops *found = NULL;
596 :
597 134 : switch (form)
598 : {
599 86 : case UNICODE_NFC:
600 86 : found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
601 86 : break;
602 48 : case UNICODE_NFKC:
603 48 : found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
604 48 : break;
605 0 : default:
606 : Assert(false);
607 0 : break;
608 : }
609 :
610 134 : if (found)
611 24 : return found->quickcheck;
612 : else
613 110 : return UNICODE_NORM_QC_YES;
614 : }
615 :
616 : UnicodeNormalizationQC
617 90 : unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
618 : {
619 90 : uint8 lastCanonicalClass = 0;
620 90 : UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
621 :
622 : /*
623 : * For the "D" forms, we don't run the quickcheck. We don't include the
624 : * lookup tables for those because they are huge, checking for these
625 : * particular forms is less common, and running the slow path is faster
626 : * for the "D" forms than the "C" forms because you don't need to
627 : * recompose, which is slow.
628 : */
629 90 : if (form == UNICODE_NFD || form == UNICODE_NFKD)
630 40 : return UNICODE_NORM_QC_MAYBE;
631 :
632 176 : for (const char32_t *p = input; *p; p++)
633 : {
634 134 : char32_t ch = *p;
635 : uint8 canonicalClass;
636 : UnicodeNormalizationQC check;
637 :
638 134 : canonicalClass = get_canonical_class(ch);
639 134 : if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
640 0 : return UNICODE_NORM_QC_NO;
641 :
642 134 : check = qc_is_allowed(form, ch);
643 134 : if (check == UNICODE_NORM_QC_NO)
644 8 : return UNICODE_NORM_QC_NO;
645 126 : else if (check == UNICODE_NORM_QC_MAYBE)
646 16 : result = UNICODE_NORM_QC_MAYBE;
647 :
648 126 : lastCanonicalClass = canonicalClass;
649 : }
650 42 : return result;
651 : }
652 :
653 : #endif /* !FRONTEND */
|