Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * unicode_norm.c
3 : * Normalize a Unicode string
4 : *
5 : * This implements Unicode normalization, per the documentation at
6 : * https://www.unicode.org/reports/tr15/.
7 : *
8 : * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
9 : *
10 : * IDENTIFICATION
11 : * src/common/unicode_norm.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #ifndef FRONTEND
16 : #include "postgres.h"
17 : #else
18 : #include "postgres_fe.h"
19 : #endif
20 :
21 : #include "common/unicode_norm.h"
22 : #ifndef FRONTEND
23 : #include "common/unicode_norm_hashfunc.h"
24 : #include "common/unicode_normprops_table.h"
25 : #include "port/pg_bswap.h"
26 : #else
27 : #include "common/unicode_norm_table.h"
28 : #endif
29 :
30 : #ifndef FRONTEND
31 : #define ALLOC(size) palloc(size)
32 : #define FREE(size) pfree(size)
33 : #else
34 : #define ALLOC(size) malloc(size)
35 : #define FREE(size) free(size)
36 : #endif
37 :
38 : /* Constants for calculations with Hangul characters */
39 : #define SBASE 0xAC00 /* U+AC00 */
40 : #define LBASE 0x1100 /* U+1100 */
41 : #define VBASE 0x1161 /* U+1161 */
42 : #define TBASE 0x11A7 /* U+11A7 */
43 : #define LCOUNT 19
44 : #define VCOUNT 21
45 : #define TCOUNT 28
46 : #define NCOUNT VCOUNT * TCOUNT
47 : #define SCOUNT LCOUNT * NCOUNT
48 :
49 : #ifdef FRONTEND
50 : /* comparison routine for bsearch() of decomposition lookup table. */
51 : static int
52 2520 : conv_compare(const void *p1, const void *p2)
53 : {
54 : uint32 v1,
55 : v2;
56 :
57 2520 : v1 = *(const uint32 *) p1;
58 2520 : v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
59 2520 : return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
60 : }
61 :
62 : #endif
63 :
64 : /*
65 : * get_code_entry
66 : *
67 : * Get the entry corresponding to code in the decomposition lookup table.
68 : * The backend version of this code uses a perfect hash function for the
69 : * lookup, while the frontend version uses a binary search.
70 : */
71 : static const pg_unicode_decomposition *
72 2156 : get_code_entry(pg_wchar code)
73 : {
74 : #ifndef FRONTEND
75 : int h;
76 : uint32 hashkey;
77 1960 : pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
78 :
79 : /*
80 : * Compute the hash function. The hash key is the codepoint with the bytes
81 : * in network order.
82 : */
83 1960 : hashkey = pg_hton32(code);
84 1960 : h = decompinfo.hash(&hashkey);
85 :
86 : /* An out-of-range result implies no match */
87 1960 : if (h < 0 || h >= decompinfo.num_decomps)
88 1116 : return NULL;
89 :
90 : /*
91 : * Since it's a perfect hash, we need only match to the specific codepoint
92 : * it identifies.
93 : */
94 844 : if (code != decompinfo.decomps[h].codepoint)
95 0 : return NULL;
96 :
97 : /* Success! */
98 844 : return &decompinfo.decomps[h];
99 : #else
100 196 : return bsearch(&(code),
101 : UnicodeDecompMain,
102 : lengthof(UnicodeDecompMain),
103 : sizeof(pg_unicode_decomposition),
104 : conv_compare);
105 : #endif
106 : }
107 :
108 : /*
109 : * Get the combining class of the given codepoint.
110 : */
111 : static uint8
112 1068 : get_canonical_class(pg_wchar code)
113 : {
114 1068 : const pg_unicode_decomposition *entry = get_code_entry(code);
115 :
116 : /*
117 : * If no entries are found, the character used is either an Hangul
118 : * character or a character with a class of 0 and no decompositions.
119 : */
120 1068 : if (!entry)
121 636 : return 0;
122 : else
123 432 : return entry->comb_class;
124 : }
125 :
126 : /*
127 : * Given a decomposition entry looked up earlier, get the decomposed
128 : * characters.
129 : *
130 : * Note: the returned pointer can point to statically allocated buffer, and
131 : * is only valid until next call to this function!
132 : */
133 : static const pg_wchar *
134 148 : get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
135 : {
136 : static pg_wchar x;
137 :
138 148 : if (DECOMPOSITION_IS_INLINE(entry))
139 : {
140 : Assert(DECOMPOSITION_SIZE(entry) == 1);
141 60 : x = (pg_wchar) entry->dec_index;
142 60 : *dec_size = 1;
143 60 : return &x;
144 : }
145 : else
146 : {
147 88 : *dec_size = DECOMPOSITION_SIZE(entry);
148 88 : return &UnicodeDecomp_codepoints[entry->dec_index];
149 : }
150 : }
151 :
152 : /*
153 : * Calculate how many characters a given character will decompose to.
154 : *
155 : * This needs to recurse, if the character decomposes into characters that
156 : * are, in turn, decomposable.
157 : */
158 : static int
159 544 : get_decomposed_size(pg_wchar code, bool compat)
160 : {
161 : const pg_unicode_decomposition *entry;
162 544 : int size = 0;
163 : int i;
164 : const uint32 *decomp;
165 : int dec_size;
166 :
167 : /*
168 : * Fast path for Hangul characters not stored in tables to save memory as
169 : * decomposition is algorithmic. See
170 : * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
171 : * on the matter.
172 : */
173 544 : if (code >= SBASE && code < SBASE + SCOUNT)
174 : {
175 : uint32 tindex,
176 : sindex;
177 :
178 0 : sindex = code - SBASE;
179 0 : tindex = sindex % TCOUNT;
180 :
181 0 : if (tindex != 0)
182 0 : return 3;
183 0 : return 2;
184 : }
185 :
186 544 : entry = get_code_entry(code);
187 :
188 : /*
189 : * Just count current code if no other decompositions. A NULL entry is
190 : * equivalent to a character with class 0 and no decompositions.
191 : */
192 544 : if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
193 110 : (!compat && DECOMPOSITION_IS_COMPAT(entry)))
194 470 : return 1;
195 :
196 : /*
197 : * If this entry has other decomposition codes look at them as well. First
198 : * get its decomposition in the list of tables available.
199 : */
200 74 : decomp = get_code_decomposition(entry, &dec_size);
201 192 : for (i = 0; i < dec_size; i++)
202 : {
203 118 : uint32 lcode = decomp[i];
204 :
205 118 : size += get_decomposed_size(lcode, compat);
206 : }
207 :
208 74 : return size;
209 : }
210 :
211 : /*
212 : * Recompose a set of characters. For hangul characters, the calculation
213 : * is algorithmic. For others, an inverse lookup at the decomposition
214 : * table is necessary. Returns true if a recomposition can be done, and
215 : * false otherwise.
216 : */
217 : static bool
218 172 : recompose_code(uint32 start, uint32 code, uint32 *result)
219 : {
220 : /*
221 : * Handle Hangul characters algorithmically, per the Unicode spec.
222 : *
223 : * Check if two current characters are L and V.
224 : */
225 172 : if (start >= LBASE && start < LBASE + LCOUNT &&
226 0 : code >= VBASE && code < VBASE + VCOUNT)
227 : {
228 : /* make syllable of form LV */
229 0 : uint32 lindex = start - LBASE;
230 0 : uint32 vindex = code - VBASE;
231 :
232 0 : *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
233 0 : return true;
234 : }
235 : /* Check if two current characters are LV and T */
236 172 : else if (start >= SBASE && start < (SBASE + SCOUNT) &&
237 0 : ((start - SBASE) % TCOUNT) == 0 &&
238 0 : code >= TBASE && code < (TBASE + TCOUNT))
239 : {
240 : /* make syllable of form LVT */
241 0 : uint32 tindex = code - TBASE;
242 :
243 0 : *result = start + tindex;
244 0 : return true;
245 : }
246 : else
247 : {
248 : const pg_unicode_decomposition *entry;
249 :
250 : /*
251 : * Do an inverse lookup of the decomposition tables to see if anything
252 : * matches. The comparison just needs to be a perfect match on the
253 : * sub-table of size two, because the start character has already been
254 : * recomposed partially. This lookup uses a perfect hash function for
255 : * the backend code.
256 : */
257 : #ifndef FRONTEND
258 :
259 : int h,
260 : inv_lookup_index;
261 : uint64 hashkey;
262 140 : pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
263 :
264 : /*
265 : * Compute the hash function. The hash key is formed by concatenating
266 : * bytes of the two codepoints in network order. See also
267 : * src/common/unicode/generate-unicode_norm_table.pl.
268 : */
269 140 : hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
270 140 : h = recompinfo.hash(&hashkey);
271 :
272 : /* An out-of-range result implies no match */
273 140 : if (h < 0 || h >= recompinfo.num_recomps)
274 136 : return false;
275 :
276 46 : inv_lookup_index = recompinfo.inverse_lookup[h];
277 46 : entry = &UnicodeDecompMain[inv_lookup_index];
278 :
279 46 : if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
280 42 : code == UnicodeDecomp_codepoints[entry->dec_index + 1])
281 : {
282 42 : *result = entry->codepoint;
283 42 : return true;
284 : }
285 :
286 : #else
287 :
288 : int i;
289 :
290 216832 : for (i = 0; i < lengthof(UnicodeDecompMain); i++)
291 : {
292 216800 : entry = &UnicodeDecompMain[i];
293 :
294 216800 : if (DECOMPOSITION_SIZE(entry) != 2)
295 163232 : continue;
296 :
297 53568 : if (DECOMPOSITION_NO_COMPOSE(entry))
298 23456 : continue;
299 :
300 30112 : if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
301 280 : code == UnicodeDecomp_codepoints[entry->dec_index + 1])
302 : {
303 0 : *result = entry->codepoint;
304 0 : return true;
305 : }
306 : }
307 : #endif /* !FRONTEND */
308 : }
309 :
310 36 : return false;
311 : }
312 :
313 : /*
314 : * Decompose the given code into the array given by caller. The
315 : * decomposition begins at the position given by caller, saving one
316 : * lookup on the decomposition table. The current position needs to be
317 : * updated here to let the caller know from where to continue filling
318 : * in the array result.
319 : */
320 : static void
321 544 : decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
322 : {
323 : const pg_unicode_decomposition *entry;
324 : int i;
325 : const uint32 *decomp;
326 : int dec_size;
327 :
328 : /*
329 : * Fast path for Hangul characters not stored in tables to save memory as
330 : * decomposition is algorithmic. See
331 : * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
332 : * on the matter.
333 : */
334 544 : if (code >= SBASE && code < SBASE + SCOUNT)
335 : {
336 : uint32 l,
337 : v,
338 : tindex,
339 : sindex;
340 0 : pg_wchar *res = *result;
341 :
342 0 : sindex = code - SBASE;
343 0 : l = LBASE + sindex / (VCOUNT * TCOUNT);
344 0 : v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
345 0 : tindex = sindex % TCOUNT;
346 :
347 0 : res[*current] = l;
348 0 : (*current)++;
349 0 : res[*current] = v;
350 0 : (*current)++;
351 :
352 0 : if (tindex != 0)
353 : {
354 0 : res[*current] = TBASE + tindex;
355 0 : (*current)++;
356 : }
357 :
358 470 : return;
359 : }
360 :
361 544 : entry = get_code_entry(code);
362 :
363 : /*
364 : * Just fill in with the current decomposition if there are no
365 : * decomposition codes to recurse to. A NULL entry is equivalent to a
366 : * character with class 0 and no decompositions, so just leave also in
367 : * this case.
368 : */
369 544 : if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
370 110 : (!compat && DECOMPOSITION_IS_COMPAT(entry)))
371 : {
372 470 : pg_wchar *res = *result;
373 :
374 470 : res[*current] = code;
375 470 : (*current)++;
376 470 : return;
377 : }
378 :
379 : /*
380 : * If this entry has other decomposition codes look at them as well.
381 : */
382 74 : decomp = get_code_decomposition(entry, &dec_size);
383 192 : for (i = 0; i < dec_size; i++)
384 : {
385 118 : pg_wchar lcode = (pg_wchar) decomp[i];
386 :
387 : /* Leave if no more decompositions */
388 118 : decompose_code(lcode, compat, result, current);
389 : }
390 : }
391 :
392 : /*
393 : * unicode_normalize - Normalize a Unicode string to the specified form.
394 : *
395 : * The input is a 0-terminated array of codepoints.
396 : *
397 : * In frontend, returns a 0-terminated array of codepoints, allocated with
398 : * malloc. Or NULL if we run out of memory. In backend, the returned
399 : * string is palloc'd instead, and OOM is reported with ereport().
400 : */
401 : pg_wchar *
402 136 : unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
403 : {
404 136 : bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
405 136 : bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
406 : pg_wchar *decomp_chars;
407 : pg_wchar *recomp_chars;
408 : int decomp_size,
409 : current_size;
410 : int count;
411 : const pg_wchar *p;
412 :
413 : /* variables for recomposition */
414 : int last_class;
415 : int starter_pos;
416 : int target_pos;
417 : uint32 starter_ch;
418 :
419 : /* First, do character decomposition */
420 :
421 : /*
422 : * Calculate how many characters long the decomposed version will be.
423 : */
424 136 : decomp_size = 0;
425 562 : for (p = input; *p; p++)
426 426 : decomp_size += get_decomposed_size(*p, compat);
427 :
428 136 : decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
429 136 : if (decomp_chars == NULL)
430 0 : return NULL;
431 :
432 : /*
433 : * Now fill in each entry recursively. This needs a second pass on the
434 : * decomposition table.
435 : */
436 136 : current_size = 0;
437 562 : for (p = input; *p; p++)
438 426 : decompose_code(*p, compat, &decomp_chars, ¤t_size);
439 136 : decomp_chars[decomp_size] = '\0';
440 : Assert(decomp_size == current_size);
441 :
442 : /* Leave if there is nothing to decompose */
443 136 : if (decomp_size == 0)
444 18 : return decomp_chars;
445 :
446 : /*
447 : * Now apply canonical ordering.
448 : */
449 470 : for (count = 1; count < decomp_size; count++)
450 : {
451 352 : pg_wchar prev = decomp_chars[count - 1];
452 352 : pg_wchar next = decomp_chars[count];
453 : pg_wchar tmp;
454 352 : const uint8 prevClass = get_canonical_class(prev);
455 352 : const uint8 nextClass = get_canonical_class(next);
456 :
457 : /*
458 : * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
459 : * annex 4, a sequence of two adjacent characters in a string is an
460 : * exchangeable pair if the combining class (from the Unicode
461 : * Character Database) for the first character is greater than the
462 : * combining class for the second, and the second is not a starter. A
463 : * character is a starter if its combining class is 0.
464 : */
465 352 : if (prevClass == 0 || nextClass == 0)
466 352 : continue;
467 :
468 0 : if (prevClass <= nextClass)
469 0 : continue;
470 :
471 : /* exchange can happen */
472 0 : tmp = decomp_chars[count - 1];
473 0 : decomp_chars[count - 1] = decomp_chars[count];
474 0 : decomp_chars[count] = tmp;
475 :
476 : /* backtrack to check again */
477 0 : if (count > 1)
478 0 : count -= 2;
479 : }
480 :
481 118 : if (!recompose)
482 60 : return decomp_chars;
483 :
484 : /*
485 : * The last phase of NFC and NFKC is the recomposition of the reordered
486 : * Unicode string using combining classes. The recomposed string cannot be
487 : * longer than the decomposed one, so make the allocation of the output
488 : * string based on that assumption.
489 : */
490 58 : recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
491 58 : if (!recomp_chars)
492 : {
493 0 : FREE(decomp_chars);
494 0 : return NULL;
495 : }
496 :
497 58 : last_class = -1; /* this eliminates a special check */
498 58 : starter_pos = 0;
499 58 : target_pos = 1;
500 58 : starter_ch = recomp_chars[0] = decomp_chars[0];
501 :
502 230 : for (count = 1; count < decomp_size; count++)
503 : {
504 172 : pg_wchar ch = decomp_chars[count];
505 172 : int ch_class = get_canonical_class(ch);
506 : pg_wchar composite;
507 :
508 344 : if (last_class < ch_class &&
509 172 : recompose_code(starter_ch, ch, &composite))
510 : {
511 42 : recomp_chars[starter_pos] = composite;
512 42 : starter_ch = composite;
513 : }
514 130 : else if (ch_class == 0)
515 : {
516 130 : starter_pos = target_pos;
517 130 : starter_ch = ch;
518 130 : last_class = -1;
519 130 : recomp_chars[target_pos++] = ch;
520 : }
521 : else
522 : {
523 0 : last_class = ch_class;
524 0 : recomp_chars[target_pos++] = ch;
525 : }
526 : }
527 58 : recomp_chars[target_pos] = (pg_wchar) '\0';
528 :
529 58 : FREE(decomp_chars);
530 :
531 58 : return recomp_chars;
532 : }
533 :
534 : /*
535 : * Normalization "quick check" algorithm; see
536 : * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
537 : */
538 :
539 : /* We only need this in the backend. */
540 : #ifndef FRONTEND
541 :
542 : static const pg_unicode_normprops *
543 192 : qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
544 : {
545 : int h;
546 : uint32 hashkey;
547 :
548 : /*
549 : * Compute the hash function. The hash key is the codepoint with the bytes
550 : * in network order.
551 : */
552 192 : hashkey = pg_hton32(ch);
553 192 : h = norminfo->hash(&hashkey);
554 :
555 : /* An out-of-range result implies no match */
556 192 : if (h < 0 || h >= norminfo->num_normprops)
557 84 : return NULL;
558 :
559 : /*
560 : * Since it's a perfect hash, we need only match to the specific codepoint
561 : * it identifies.
562 : */
563 108 : if (ch != norminfo->normprops[h].codepoint)
564 72 : return NULL;
565 :
566 : /* Success! */
567 36 : return &norminfo->normprops[h];
568 : }
569 :
570 : /*
571 : * Look up the normalization quick check character property
572 : */
573 : static UnicodeNormalizationQC
574 192 : qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
575 : {
576 192 : const pg_unicode_normprops *found = NULL;
577 :
578 192 : switch (form)
579 : {
580 120 : case UNICODE_NFC:
581 120 : found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
582 120 : break;
583 72 : case UNICODE_NFKC:
584 72 : found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
585 72 : break;
586 0 : default:
587 : Assert(false);
588 0 : break;
589 : }
590 :
591 192 : if (found)
592 36 : return found->quickcheck;
593 : else
594 156 : return UNICODE_NORM_QC_YES;
595 : }
596 :
597 : UnicodeNormalizationQC
598 132 : unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
599 : {
600 132 : uint8 lastCanonicalClass = 0;
601 132 : UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
602 :
603 : /*
604 : * For the "D" forms, we don't run the quickcheck. We don't include the
605 : * lookup tables for those because they are huge, checking for these
606 : * particular forms is less common, and running the slow path is faster
607 : * for the "D" forms than the "C" forms because you don't need to
608 : * recompose, which is slow.
609 : */
610 132 : if (form == UNICODE_NFD || form == UNICODE_NFKD)
611 60 : return UNICODE_NORM_QC_MAYBE;
612 :
613 252 : for (const pg_wchar *p = input; *p; p++)
614 : {
615 192 : pg_wchar ch = *p;
616 : uint8 canonicalClass;
617 : UnicodeNormalizationQC check;
618 :
619 192 : canonicalClass = get_canonical_class(ch);
620 192 : if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
621 0 : return UNICODE_NORM_QC_NO;
622 :
623 192 : check = qc_is_allowed(form, ch);
624 192 : if (check == UNICODE_NORM_QC_NO)
625 12 : return UNICODE_NORM_QC_NO;
626 180 : else if (check == UNICODE_NORM_QC_MAYBE)
627 24 : result = UNICODE_NORM_QC_MAYBE;
628 :
629 180 : lastCanonicalClass = canonicalClass;
630 : }
631 60 : return result;
632 : }
633 :
634 : #endif /* !FRONTEND */
|