LCOV - code coverage report
Current view: top level - src/common - unicode_norm.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13beta1 Lines: 136 178 76.4 %
Date: 2020-05-29 00:07:09 Functions: 11 11 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  * unicode_norm.c
       3             :  *      Normalize a Unicode string
       4             :  *
       5             :  * This implements Unicode normalization, per the documentation at
       6             :  * https://www.unicode.org/reports/tr15/.
       7             :  *
       8             :  * Portions Copyright (c) 2017-2020, PostgreSQL Global Development Group
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/common/unicode_norm.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #ifndef FRONTEND
      16             : #include "postgres.h"
      17             : #else
      18             : #include "postgres_fe.h"
      19             : #endif
      20             : 
      21             : #include "common/unicode_norm.h"
      22             : #include "common/unicode_norm_table.h"
      23             : #ifndef FRONTEND
      24             : #include "common/unicode_normprops_table.h"
      25             : #endif
      26             : 
      27             : #ifndef FRONTEND
      28             : #define ALLOC(size) palloc(size)
      29             : #define FREE(size) pfree(size)
      30             : #else
      31             : #define ALLOC(size) malloc(size)
      32             : #define FREE(size) free(size)
      33             : #endif
      34             : 
      35             : /* Constants for calculations with Hangul characters */
      36             : #define SBASE       0xAC00      /* U+AC00 */
      37             : #define LBASE       0x1100      /* U+1100 */
      38             : #define VBASE       0x1161      /* U+1161 */
      39             : #define TBASE       0x11A7      /* U+11A7 */
      40             : #define LCOUNT      19
      41             : #define VCOUNT      21
      42             : #define TCOUNT      28
      43             : #define NCOUNT      VCOUNT * TCOUNT
      44             : #define SCOUNT      LCOUNT * NCOUNT
      45             : 
      46             : /* comparison routine for bsearch() of decomposition lookup table. */
      47             : static int
      48       19810 : conv_compare(const void *p1, const void *p2)
      49             : {
      50             :     uint32      v1,
      51             :                 v2;
      52             : 
      53       19810 :     v1 = *(const uint32 *) p1;
      54       19810 :     v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
      55       19810 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
      56             : }
      57             : 
      58             : /*
      59             :  * Get the entry corresponding to code in the decomposition lookup table.
      60             :  */
      61             : static pg_unicode_decomposition *
      62        1530 : get_code_entry(pg_wchar code)
      63             : {
      64        1530 :     return bsearch(&(code),
      65             :                    UnicodeDecompMain,
      66             :                    lengthof(UnicodeDecompMain),
      67             :                    sizeof(pg_unicode_decomposition),
      68             :                    conv_compare);
      69             : }
      70             : 
      71             : /*
      72             :  * Given a decomposition entry looked up earlier, get the decomposed
      73             :  * characters.
      74             :  *
      75             :  * Note: the returned pointer can point to statically allocated buffer, and
      76             :  * is only valid until next call to this function!
      77             :  */
      78             : static const pg_wchar *
      79         104 : get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
      80             : {
      81             :     static pg_wchar x;
      82             : 
      83         104 :     if (DECOMPOSITION_IS_INLINE(entry))
      84             :     {
      85             :         Assert(DECOMPOSITION_SIZE(entry) == 1);
      86          44 :         x = (pg_wchar) entry->dec_index;
      87          44 :         *dec_size = 1;
      88          44 :         return &x;
      89             :     }
      90             :     else
      91             :     {
      92          60 :         *dec_size = DECOMPOSITION_SIZE(entry);
      93          60 :         return &UnicodeDecomp_codepoints[entry->dec_index];
      94             :     }
      95             : }
      96             : 
      97             : /*
      98             :  * Calculate how many characters a given character will decompose to.
      99             :  *
     100             :  * This needs to recurse, if the character decomposes into characters that
     101             :  * are, in turn, decomposable.
     102             :  */
     103             : static int
     104         386 : get_decomposed_size(pg_wchar code, bool compat)
     105             : {
     106             :     pg_unicode_decomposition *entry;
     107         386 :     int         size = 0;
     108             :     int         i;
     109             :     const uint32 *decomp;
     110             :     int         dec_size;
     111             : 
     112             :     /*
     113             :      * Fast path for Hangul characters not stored in tables to save memory as
     114             :      * decomposition is algorithmic. See
     115             :      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
     116             :      * on the matter.
     117             :      */
     118         386 :     if (code >= SBASE && code < SBASE + SCOUNT)
     119             :     {
     120             :         uint32      tindex,
     121             :                     sindex;
     122             : 
     123           0 :         sindex = code - SBASE;
     124           0 :         tindex = sindex % TCOUNT;
     125             : 
     126           0 :         if (tindex != 0)
     127           0 :             return 3;
     128           0 :         return 2;
     129             :     }
     130             : 
     131         386 :     entry = get_code_entry(code);
     132             : 
     133             :     /*
     134             :      * Just count current code if no other decompositions.  A NULL entry is
     135             :      * equivalent to a character with class 0 and no decompositions.
     136             :      */
     137         386 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
     138          76 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
     139         334 :         return 1;
     140             : 
     141             :     /*
     142             :      * If this entry has other decomposition codes look at them as well. First
     143             :      * get its decomposition in the list of tables available.
     144             :      */
     145          52 :     decomp = get_code_decomposition(entry, &dec_size);
     146         134 :     for (i = 0; i < dec_size; i++)
     147             :     {
     148          82 :         uint32      lcode = decomp[i];
     149             : 
     150          82 :         size += get_decomposed_size(lcode, compat);
     151             :     }
     152             : 
     153          52 :     return size;
     154             : }
     155             : 
     156             : /*
     157             :  * Recompose a set of characters. For hangul characters, the calculation
     158             :  * is algorithmic. For others, an inverse lookup at the decomposition
     159             :  * table is necessary. Returns true if a recomposition can be done, and
     160             :  * false otherwise.
     161             :  */
     162             : static bool
     163         130 : recompose_code(uint32 start, uint32 code, uint32 *result)
     164             : {
     165             :     /*
     166             :      * Handle Hangul characters algorithmically, per the Unicode spec.
     167             :      *
     168             :      * Check if two current characters are L and V.
     169             :      */
     170         130 :     if (start >= LBASE && start < LBASE + LCOUNT &&
     171           0 :         code >= VBASE && code < VBASE + VCOUNT)
     172             :     {
     173             :         /* make syllable of form LV */
     174           0 :         uint32      lindex = start - LBASE;
     175           0 :         uint32      vindex = code - VBASE;
     176             : 
     177           0 :         *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
     178           0 :         return true;
     179             :     }
     180             :     /* Check if two current characters are LV and T */
     181         130 :     else if (start >= SBASE && start < (SBASE + SCOUNT) &&
     182           0 :              ((start - SBASE) % TCOUNT) == 0 &&
     183           0 :              code >= TBASE && code < (TBASE + TCOUNT))
     184             :     {
     185             :         /* make syllable of form LVT */
     186           0 :         uint32      tindex = code - TBASE;
     187             : 
     188           0 :         *result = start + tindex;
     189           0 :         return true;
     190             :     }
     191             :     else
     192             :     {
     193             :         int         i;
     194             : 
     195             :         /*
     196             :          * Do an inverse lookup of the decomposition tables to see if anything
     197             :          * matches. The comparison just needs to be a perfect match on the
     198             :          * sub-table of size two, because the start character has already been
     199             :          * recomposed partially.
     200             :          */
     201      674970 :         for (i = 0; i < lengthof(UnicodeDecompMain); i++)
     202             :         {
     203      674868 :             const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
     204             : 
     205      674868 :             if (DECOMPOSITION_SIZE(entry) != 2)
     206      503140 :                 continue;
     207             : 
     208      171728 :             if (DECOMPOSITION_NO_COMPOSE(entry))
     209       74878 :                 continue;
     210             : 
     211       96850 :             if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
     212         606 :                 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
     213             :             {
     214          28 :                 *result = entry->codepoint;
     215          28 :                 return true;
     216             :             }
     217             :         }
     218             :     }
     219             : 
     220         102 :     return false;
     221             : }
     222             : 
     223             : /*
     224             :  * Decompose the given code into the array given by caller. The
     225             :  * decomposition begins at the position given by caller, saving one
     226             :  * lookup on the decomposition table. The current position needs to be
     227             :  * updated here to let the caller know from where to continue filling
     228             :  * in the array result.
     229             :  */
     230             : static void
     231         386 : decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
     232             : {
     233             :     pg_unicode_decomposition *entry;
     234             :     int         i;
     235             :     const uint32 *decomp;
     236             :     int         dec_size;
     237             : 
     238             :     /*
     239             :      * Fast path for Hangul characters not stored in tables to save memory as
     240             :      * decomposition is algorithmic. See
     241             :      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
     242             :      * on the matter.
     243             :      */
     244         386 :     if (code >= SBASE && code < SBASE + SCOUNT)
     245             :     {
     246             :         uint32      l,
     247             :                     v,
     248             :                     tindex,
     249             :                     sindex;
     250           0 :         pg_wchar   *res = *result;
     251             : 
     252           0 :         sindex = code - SBASE;
     253           0 :         l = LBASE + sindex / (VCOUNT * TCOUNT);
     254           0 :         v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
     255           0 :         tindex = sindex % TCOUNT;
     256             : 
     257           0 :         res[*current] = l;
     258           0 :         (*current)++;
     259           0 :         res[*current] = v;
     260           0 :         (*current)++;
     261             : 
     262           0 :         if (tindex != 0)
     263             :         {
     264           0 :             res[*current] = TBASE + tindex;
     265           0 :             (*current)++;
     266             :         }
     267             : 
     268         334 :         return;
     269             :     }
     270             : 
     271         386 :     entry = get_code_entry(code);
     272             : 
     273             :     /*
     274             :      * Just fill in with the current decomposition if there are no
     275             :      * decomposition codes to recurse to.  A NULL entry is equivalent to a
     276             :      * character with class 0 and no decompositions, so just leave also in
     277             :      * this case.
     278             :      */
     279         386 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
     280          76 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
     281             :     {
     282         334 :         pg_wchar   *res = *result;
     283             : 
     284         334 :         res[*current] = code;
     285         334 :         (*current)++;
     286         334 :         return;
     287             :     }
     288             : 
     289             :     /*
     290             :      * If this entry has other decomposition codes look at them as well.
     291             :      */
     292          52 :     decomp = get_code_decomposition(entry, &dec_size);
     293         134 :     for (i = 0; i < dec_size; i++)
     294             :     {
     295          82 :         pg_wchar    lcode = (pg_wchar) decomp[i];
     296             : 
     297             :         /* Leave if no more decompositions */
     298          82 :         decompose_code(lcode, compat, result, current);
     299             :     }
     300             : }
     301             : 
     302             : /*
     303             :  * unicode_normalize - Normalize a Unicode string to the specified form.
     304             :  *
     305             :  * The input is a 0-terminated array of codepoints.
     306             :  *
     307             :  * In frontend, returns a 0-terminated array of codepoints, allocated with
     308             :  * malloc. Or NULL if we run out of memory. In backend, the returned
     309             :  * string is palloc'd instead, and OOM is reported with ereport().
     310             :  */
     311             : pg_wchar *
     312          84 : unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
     313             : {
     314          84 :     bool        compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
     315          84 :     bool        recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
     316             :     pg_wchar   *decomp_chars;
     317             :     pg_wchar   *recomp_chars;
     318             :     int         decomp_size,
     319             :                 current_size;
     320             :     int         count;
     321             :     const pg_wchar *p;
     322             : 
     323             :     /* variables for recomposition */
     324             :     int         last_class;
     325             :     int         starter_pos;
     326             :     int         target_pos;
     327             :     uint32      starter_ch;
     328             : 
     329             :     /* First, do character decomposition */
     330             : 
     331             :     /*
     332             :      * Calculate how many characters long the decomposed version will be.
     333             :      */
     334          84 :     decomp_size = 0;
     335         388 :     for (p = input; *p; p++)
     336         304 :         decomp_size += get_decomposed_size(*p, compat);
     337             : 
     338          84 :     decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
     339          84 :     if (decomp_chars == NULL)
     340           0 :         return NULL;
     341             : 
     342             :     /*
     343             :      * Now fill in each entry recursively. This needs a second pass on the
     344             :      * decomposition table.
     345             :      */
     346          84 :     current_size = 0;
     347         388 :     for (p = input; *p; p++)
     348         304 :         decompose_code(*p, compat, &decomp_chars, &current_size);
     349          84 :     decomp_chars[decomp_size] = '\0';
     350             :     Assert(decomp_size == current_size);
     351             : 
     352             :     /*
     353             :      * Now apply canonical ordering.
     354             :      */
     355         334 :     for (count = 1; count < decomp_size; count++)
     356             :     {
     357         250 :         pg_wchar    prev = decomp_chars[count - 1];
     358         250 :         pg_wchar    next = decomp_chars[count];
     359             :         pg_wchar    tmp;
     360         250 :         pg_unicode_decomposition *prevEntry = get_code_entry(prev);
     361         250 :         pg_unicode_decomposition *nextEntry = get_code_entry(next);
     362             : 
     363             :         /*
     364             :          * If no entries are found, the character used is either an Hangul
     365             :          * character or a character with a class of 0 and no decompositions,
     366             :          * so move to next result.
     367             :          */
     368         250 :         if (prevEntry == NULL || nextEntry == NULL)
     369         226 :             continue;
     370             : 
     371             :         /*
     372             :          * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
     373             :          * annex 4, a sequence of two adjacent characters in a string is an
     374             :          * exchangeable pair if the combining class (from the Unicode
     375             :          * Character Database) for the first character is greater than the
     376             :          * combining class for the second, and the second is not a starter.  A
     377             :          * character is a starter if its combining class is 0.
     378             :          */
     379          24 :         if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
     380          24 :             continue;
     381             : 
     382           0 :         if (prevEntry->comb_class <= nextEntry->comb_class)
     383           0 :             continue;
     384             : 
     385             :         /* exchange can happen */
     386           0 :         tmp = decomp_chars[count - 1];
     387           0 :         decomp_chars[count - 1] = decomp_chars[count];
     388           0 :         decomp_chars[count] = tmp;
     389             : 
     390             :         /* backtrack to check again */
     391           0 :         if (count > 1)
     392           0 :             count -= 2;
     393             :     }
     394             : 
     395          84 :     if (!recompose)
     396          40 :         return decomp_chars;
     397             : 
     398             :     /*
     399             :      * The last phase of NFC and NFKC is the recomposition of the reordered
     400             :      * Unicode string using combining classes. The recomposed string cannot be
     401             :      * longer than the decomposed one, so make the allocation of the output
     402             :      * string based on that assumption.
     403             :      */
     404          44 :     recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
     405          44 :     if (!recomp_chars)
     406             :     {
     407           0 :         FREE(decomp_chars);
     408           0 :         return NULL;
     409             :     }
     410             : 
     411          44 :     last_class = -1;            /* this eliminates a special check */
     412          44 :     starter_pos = 0;
     413          44 :     target_pos = 1;
     414          44 :     starter_ch = recomp_chars[0] = decomp_chars[0];
     415             : 
     416         174 :     for (count = 1; count < decomp_size; count++)
     417             :     {
     418         130 :         pg_wchar    ch = decomp_chars[count];
     419         130 :         pg_unicode_decomposition *ch_entry = get_code_entry(ch);
     420         130 :         int         ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
     421             :         pg_wchar    composite;
     422             : 
     423         260 :         if (last_class < ch_class &&
     424         130 :             recompose_code(starter_ch, ch, &composite))
     425             :         {
     426          28 :             recomp_chars[starter_pos] = composite;
     427          28 :             starter_ch = composite;
     428             :         }
     429         102 :         else if (ch_class == 0)
     430             :         {
     431         102 :             starter_pos = target_pos;
     432         102 :             starter_ch = ch;
     433         102 :             last_class = -1;
     434         102 :             recomp_chars[target_pos++] = ch;
     435             :         }
     436             :         else
     437             :         {
     438           0 :             last_class = ch_class;
     439           0 :             recomp_chars[target_pos++] = ch;
     440             :         }
     441             :     }
     442          44 :     recomp_chars[target_pos] = (pg_wchar) '\0';
     443             : 
     444          44 :     FREE(decomp_chars);
     445             : 
     446          44 :     return recomp_chars;
     447             : }
     448             : 
     449             : /*
     450             :  * Normalization "quick check" algorithm; see
     451             :  * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
     452             :  */
     453             : 
     454             : /* We only need this in the backend. */
     455             : #ifndef FRONTEND
     456             : 
     457             : static uint8
     458         128 : get_canonical_class(pg_wchar ch)
     459             : {
     460         128 :     pg_unicode_decomposition *entry = get_code_entry(ch);
     461             : 
     462         128 :     if (!entry)
     463          64 :         return 0;
     464             :     else
     465          64 :         return entry->comb_class;
     466             : }
     467             : 
     468             : static int
     469        1456 : qc_compare(const void *p1, const void *p2)
     470             : {
     471             :     uint32      v1,
     472             :                 v2;
     473             : 
     474        1456 :     v1 = ((const pg_unicode_normprops *) p1)->codepoint;
     475        1456 :     v2 = ((const pg_unicode_normprops *) p2)->codepoint;
     476        1456 :     return (v1 - v2);
     477             : }
     478             : 
     479             : /*
     480             :  * Look up the normalization quick check character property
     481             :  */
     482             : static UnicodeNormalizationQC
     483         128 : qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
     484             : {
     485             :     pg_unicode_normprops key;
     486         128 :     pg_unicode_normprops *found = NULL;
     487             : 
     488         128 :     key.codepoint = ch;
     489             : 
     490         128 :     switch (form)
     491             :     {
     492          80 :         case UNICODE_NFC:
     493          80 :             found = bsearch(&key,
     494             :                             UnicodeNormProps_NFC_QC,
     495             :                             lengthof(UnicodeNormProps_NFC_QC),
     496             :                             sizeof(pg_unicode_normprops),
     497             :                             qc_compare);
     498          80 :             break;
     499          48 :         case UNICODE_NFKC:
     500          48 :             found = bsearch(&key,
     501             :                             UnicodeNormProps_NFKC_QC,
     502             :                             lengthof(UnicodeNormProps_NFKC_QC),
     503             :                             sizeof(pg_unicode_normprops),
     504             :                             qc_compare);
     505          48 :             break;
     506           0 :         default:
     507             :             Assert(false);
     508           0 :             break;
     509             :     }
     510             : 
     511         128 :     if (found)
     512          24 :         return found->quickcheck;
     513             :     else
     514         104 :         return UNICODE_NORM_QC_YES;
     515             : }
     516             : 
     517             : UnicodeNormalizationQC
     518          72 : unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
     519             : {
     520          72 :     uint8       lastCanonicalClass = 0;
     521          72 :     UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
     522             : 
     523             :     /*
     524             :      * For the "D" forms, we don't run the quickcheck.  We don't include the
     525             :      * lookup tables for those because they are huge, checking for these
     526             :      * particular forms is less common, and running the slow path is faster
     527             :      * for the "D" forms than the "C" forms because you don't need to
     528             :      * recompose, which is slow.
     529             :      */
     530          72 :     if (form == UNICODE_NFD || form == UNICODE_NFKD)
     531          32 :         return UNICODE_NORM_QC_MAYBE;
     532             : 
     533         160 :     for (const pg_wchar *p = input; *p; p++)
     534             :     {
     535         128 :         pg_wchar    ch = *p;
     536             :         uint8       canonicalClass;
     537             :         UnicodeNormalizationQC check;
     538             : 
     539         128 :         canonicalClass = get_canonical_class(ch);
     540         128 :         if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
     541           0 :             return UNICODE_NORM_QC_NO;
     542             : 
     543         128 :         check = qc_is_allowed(form, ch);
     544         128 :         if (check == UNICODE_NORM_QC_NO)
     545           8 :             return UNICODE_NORM_QC_NO;
     546         120 :         else if (check == UNICODE_NORM_QC_MAYBE)
     547          16 :             result = UNICODE_NORM_QC_MAYBE;
     548             : 
     549         120 :         lastCanonicalClass = canonicalClass;
     550             :     }
     551          32 :     return result;
     552             : }
     553             : 
     554             : #endif                          /* !FRONTEND */

Generated by: LCOV version 1.13