LCOV - code coverage report
Current view: top level - src/common - unicode_norm.c (source / functions) Hit Total Coverage
Test: PostgreSQL 12beta2 Lines: 86 131 65.6 %
Date: 2019-06-19 14:06:47 Functions: 7 7 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  * unicode_norm.c
       3             :  *      Normalize a Unicode string to NFKC form
       4             :  *
       5             :  * This implements Unicode normalization, per the documentation at
       6             :  * http://www.unicode.org/reports/tr15/.
       7             :  *
       8             :  * Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/common/unicode_norm.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #ifndef FRONTEND
      16             : #include "postgres.h"
      17             : #else
      18             : #include "postgres_fe.h"
      19             : #endif
      20             : 
      21             : #include "common/unicode_norm.h"
      22             : #include "common/unicode_norm_table.h"
      23             : 
      24             : #ifndef FRONTEND
      25             : #define ALLOC(size) palloc(size)
      26             : #define FREE(size) pfree(size)
      27             : #else
      28             : #define ALLOC(size) malloc(size)
      29             : #define FREE(size) free(size)
      30             : #endif
      31             : 
      32             : /* Constants for calculations with Hangul characters */
      33             : #define SBASE       0xAC00      /* U+AC00 */
      34             : #define LBASE       0x1100      /* U+1100 */
      35             : #define VBASE       0x1161      /* U+1161 */
      36             : #define TBASE       0x11A7      /* U+11A7 */
      37             : #define LCOUNT      19
      38             : #define VCOUNT      21
      39             : #define TCOUNT      28
      40             : #define NCOUNT      VCOUNT * TCOUNT
      41             : #define SCOUNT      LCOUNT * NCOUNT
      42             : 
      43             : /* comparison routine for bsearch() of decomposition lookup table. */
      44             : static int
      45        3610 : conv_compare(const void *p1, const void *p2)
      46             : {
      47             :     uint32      v1,
      48             :                 v2;
      49             : 
      50        3610 :     v1 = *(const uint32 *) p1;
      51        3610 :     v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
      52        3610 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
      53             : }
      54             : 
      55             : /*
      56             :  * Get the entry corresponding to code in the decomposition lookup table.
      57             :  */
      58             : static pg_unicode_decomposition *
      59         278 : get_code_entry(pg_wchar code)
      60             : {
      61         278 :     return bsearch(&(code),
      62             :                    UnicodeDecompMain,
      63             :                    lengthof(UnicodeDecompMain),
      64             :                    sizeof(pg_unicode_decomposition),
      65             :                    conv_compare);
      66             : }
      67             : 
      68             : /*
      69             :  * Given a decomposition entry looked up earlier, get the decomposed
      70             :  * characters.
      71             :  *
      72             :  * Note: the returned pointer can point to statically allocated buffer, and
      73             :  * is only valid until next call to this function!
      74             :  */
      75             : static const pg_wchar *
      76          16 : get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
      77             : {
      78             :     static pg_wchar x;
      79             : 
      80          16 :     if (DECOMPOSITION_IS_INLINE(entry))
      81             :     {
      82             :         Assert(DECOMPOSITION_SIZE(entry) == 1);
      83          12 :         x = (pg_wchar) entry->dec_index;
      84          12 :         *dec_size = 1;
      85          12 :         return &x;
      86             :     }
      87             :     else
      88             :     {
      89           4 :         *dec_size = DECOMPOSITION_SIZE(entry);
      90           4 :         return &UnicodeDecomp_codepoints[entry->dec_index];
      91             :     }
      92             : }
      93             : 
      94             : /*
      95             :  * Calculate how many characters a given character will decompose to.
      96             :  *
      97             :  * This needs to recurse, if the character decomposes into characters that
      98             :  * are, in turn, decomposable.
      99             :  */
     100             : static int
     101          70 : get_decomposed_size(pg_wchar code)
     102             : {
     103             :     pg_unicode_decomposition *entry;
     104          70 :     int         size = 0;
     105             :     int         i;
     106             :     const uint32 *decomp;
     107             :     int         dec_size;
     108             : 
     109             :     /*
     110             :      * Fast path for Hangul characters not stored in tables to save memory as
     111             :      * decomposition is algorithmic. See
     112             :      * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
     113             :      * the matter.
     114             :      */
     115          70 :     if (code >= SBASE && code < SBASE + SCOUNT)
     116             :     {
     117             :         uint32      tindex,
     118             :                     sindex;
     119             : 
     120           0 :         sindex = code - SBASE;
     121           0 :         tindex = sindex % TCOUNT;
     122             : 
     123           0 :         if (tindex != 0)
     124           0 :             return 3;
     125           0 :         return 2;
     126             :     }
     127             : 
     128          70 :     entry = get_code_entry(code);
     129             : 
     130             :     /*
     131             :      * Just count current code if no other decompositions.  A NULL entry is
     132             :      * equivalent to a character with class 0 and no decompositions.
     133             :      */
     134          70 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
     135          62 :         return 1;
     136             : 
     137             :     /*
     138             :      * If this entry has other decomposition codes look at them as well. First
     139             :      * get its decomposition in the list of tables available.
     140             :      */
     141           8 :     decomp = get_code_decomposition(entry, &dec_size);
     142          18 :     for (i = 0; i < dec_size; i++)
     143             :     {
     144          10 :         uint32      lcode = decomp[i];
     145             : 
     146          10 :         size += get_decomposed_size(lcode);
     147             :     }
     148             : 
     149           8 :     return size;
     150             : }
     151             : 
     152             : /*
     153             :  * Recompose a set of characters. For hangul characters, the calculation
     154             :  * is algorithmic. For others, an inverse lookup at the decomposition
     155             :  * table is necessary. Returns true if a recomposition can be done, and
     156             :  * false otherwise.
     157             :  */
     158             : static bool
     159          46 : recompose_code(uint32 start, uint32 code, uint32 *result)
     160             : {
     161             :     /*
     162             :      * Handle Hangul characters algorithmically, per the Unicode spec.
     163             :      *
     164             :      * Check if two current characters are L and V.
     165             :      */
     166          46 :     if (start >= LBASE && start < LBASE + LCOUNT &&
     167           0 :         code >= VBASE && code < VBASE + VCOUNT)
     168             :     {
     169             :         /* make syllable of form LV */
     170           0 :         uint32      lindex = start - LBASE;
     171           0 :         uint32      vindex = code - VBASE;
     172             : 
     173           0 :         *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
     174           0 :         return true;
     175             :     }
     176             :     /* Check if two current characters are LV and T */
     177          46 :     else if (start >= SBASE && start < (SBASE + SCOUNT) &&
     178           0 :              ((start - SBASE) % TCOUNT) == 0 &&
     179           0 :              code >= TBASE && code < (TBASE + TCOUNT))
     180             :     {
     181             :         /* make syllable of from LVT */
     182           0 :         uint32      tindex = code - TBASE;
     183             : 
     184           0 :         *result = start + tindex;
     185           0 :         return true;
     186             :     }
     187             :     else
     188             :     {
     189             :         int         i;
     190             : 
     191             :         /*
     192             :          * Do an inverse lookup of the decomposition tables to see if anything
     193             :          * matches. The comparison just needs to be a perfect match on the
     194             :          * sub-table of size two, because the start character has already been
     195             :          * recomposed partially.
     196             :          */
     197      300518 :         for (i = 0; i < lengthof(UnicodeDecompMain); i++)
     198             :         {
     199      300472 :             const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
     200             : 
     201      300472 :             if (DECOMPOSITION_SIZE(entry) != 2)
     202      223606 :                 continue;
     203             : 
     204       76866 :             if (DECOMPOSITION_NO_COMPOSE(entry))
     205       33626 :                 continue;
     206             : 
     207       43630 :             if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
     208         390 :                 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
     209             :             {
     210           0 :                 *result = entry->codepoint;
     211           0 :                 return true;
     212             :             }
     213             :         }
     214             :     }
     215             : 
     216          46 :     return false;
     217             : }
     218             : 
     219             : /*
     220             :  * Decompose the given code into the array given by caller. The
     221             :  * decomposition begins at the position given by caller, saving one
     222             :  * lookup on the decomposition table. The current position needs to be
     223             :  * updated here to let the caller know from where to continue filling
     224             :  * in the array result.
     225             :  */
     226             : static void
     227          70 : decompose_code(pg_wchar code, pg_wchar **result, int *current)
     228             : {
     229             :     pg_unicode_decomposition *entry;
     230             :     int         i;
     231             :     const uint32 *decomp;
     232             :     int         dec_size;
     233             : 
     234             :     /*
     235             :      * Fast path for Hangul characters not stored in tables to save memory as
     236             :      * decomposition is algorithmic. See
     237             :      * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
     238             :      * the matter.
     239             :      */
     240          70 :     if (code >= SBASE && code < SBASE + SCOUNT)
     241             :     {
     242             :         uint32      l,
     243             :                     v,
     244             :                     tindex,
     245             :                     sindex;
     246           0 :         pg_wchar   *res = *result;
     247             : 
     248           0 :         sindex = code - SBASE;
     249           0 :         l = LBASE + sindex / (VCOUNT * TCOUNT);
     250           0 :         v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
     251           0 :         tindex = sindex % TCOUNT;
     252             : 
     253           0 :         res[*current] = l;
     254           0 :         (*current)++;
     255           0 :         res[*current] = v;
     256           0 :         (*current)++;
     257             : 
     258           0 :         if (tindex != 0)
     259             :         {
     260           0 :             res[*current] = TBASE + tindex;
     261           0 :             (*current)++;
     262             :         }
     263             : 
     264          62 :         return;
     265             :     }
     266             : 
     267          70 :     entry = get_code_entry(code);
     268             : 
     269             :     /*
     270             :      * Just fill in with the current decomposition if there are no
     271             :      * decomposition codes to recurse to.  A NULL entry is equivalent to a
     272             :      * character with class 0 and no decompositions, so just leave also in
     273             :      * this case.
     274             :      */
     275          70 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
     276             :     {
     277          62 :         pg_wchar   *res = *result;
     278             : 
     279          62 :         res[*current] = code;
     280          62 :         (*current)++;
     281          62 :         return;
     282             :     }
     283             : 
     284             :     /*
     285             :      * If this entry has other decomposition codes look at them as well.
     286             :      */
     287           8 :     decomp = get_code_decomposition(entry, &dec_size);
     288          18 :     for (i = 0; i < dec_size; i++)
     289             :     {
     290          10 :         pg_wchar    lcode = (pg_wchar) decomp[i];
     291             : 
     292             :         /* Leave if no more decompositions */
     293          10 :         decompose_code(lcode, result, current);
     294             :     }
     295             : }
     296             : 
     297             : /*
     298             :  * unicode_normalize_kc - Normalize a Unicode string to NFKC form.
     299             :  *
     300             :  * The input is a 0-terminated array of codepoints.
     301             :  *
     302             :  * In frontend, returns a 0-terminated array of codepoints, allocated with
     303             :  * malloc. Or NULL if we run out of memory. In frontend, the returned
     304             :  * string is palloc'd instead, and OOM is reported with ereport().
     305             :  */
     306             : pg_wchar *
     307          16 : unicode_normalize_kc(const pg_wchar *input)
     308             : {
     309             :     pg_wchar   *decomp_chars;
     310             :     pg_wchar   *recomp_chars;
     311             :     int         decomp_size,
     312             :                 current_size;
     313             :     int         count;
     314             :     const pg_wchar *p;
     315             : 
     316             :     /* variables for recomposition */
     317             :     int         last_class;
     318             :     int         starter_pos;
     319             :     int         target_pos;
     320             :     uint32      starter_ch;
     321             : 
     322             :     /* First, do character decomposition */
     323             : 
     324             :     /*
     325             :      * Calculate how many characters long the decomposed version will be.
     326             :      */
     327          16 :     decomp_size = 0;
     328          76 :     for (p = input; *p; p++)
     329          60 :         decomp_size += get_decomposed_size(*p);
     330             : 
     331          16 :     decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
     332          16 :     if (decomp_chars == NULL)
     333           0 :         return NULL;
     334             : 
     335             :     /*
     336             :      * Now fill in each entry recursively. This needs a second pass on the
     337             :      * decomposition table.
     338             :      */
     339          16 :     current_size = 0;
     340          76 :     for (p = input; *p; p++)
     341          60 :         decompose_code(*p, &decomp_chars, &current_size);
     342          16 :     decomp_chars[decomp_size] = '\0';
     343             :     Assert(decomp_size == current_size);
     344             : 
     345             :     /*
     346             :      * Now apply canonical ordering.
     347             :      */
     348          62 :     for (count = 1; count < decomp_size; count++)
     349             :     {
     350          46 :         pg_wchar    prev = decomp_chars[count - 1];
     351          46 :         pg_wchar    next = decomp_chars[count];
     352             :         pg_wchar    tmp;
     353          46 :         pg_unicode_decomposition *prevEntry = get_code_entry(prev);
     354          46 :         pg_unicode_decomposition *nextEntry = get_code_entry(next);
     355             : 
     356             :         /*
     357             :          * If no entries are found, the character used is either an Hangul
     358             :          * character or a character with a class of 0 and no decompositions,
     359             :          * so move to next result.
     360             :          */
     361          46 :         if (prevEntry == NULL || nextEntry == NULL)
     362          46 :             continue;
     363             : 
     364             :         /*
     365             :          * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
     366             :          * a sequence of two adjacent characters in a string is an
     367             :          * exchangeable pair if the combining class (from the Unicode
     368             :          * Character Database) for the first character is greater than the
     369             :          * combining class for the second, and the second is not a starter.  A
     370             :          * character is a starter if its combining class is 0.
     371             :          */
     372           0 :         if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
     373           0 :             continue;
     374             : 
     375           0 :         if (prevEntry->comb_class <= nextEntry->comb_class)
     376           0 :             continue;
     377             : 
     378             :         /* exchange can happen */
     379           0 :         tmp = decomp_chars[count - 1];
     380           0 :         decomp_chars[count - 1] = decomp_chars[count];
     381           0 :         decomp_chars[count] = tmp;
     382             : 
     383             :         /* backtrack to check again */
     384           0 :         if (count > 1)
     385           0 :             count -= 2;
     386             :     }
     387             : 
     388             :     /*
     389             :      * The last phase of NFKC is the recomposition of the reordered Unicode
     390             :      * string using combining classes. The recomposed string cannot be longer
     391             :      * than the decomposed one, so make the allocation of the output string
     392             :      * based on that assumption.
     393             :      */
     394          16 :     recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
     395          16 :     if (!recomp_chars)
     396             :     {
     397           0 :         FREE(decomp_chars);
     398           0 :         return NULL;
     399             :     }
     400             : 
     401          16 :     last_class = -1;            /* this eliminates a special check */
     402          16 :     starter_pos = 0;
     403          16 :     target_pos = 1;
     404          16 :     starter_ch = recomp_chars[0] = decomp_chars[0];
     405             : 
     406          62 :     for (count = 1; count < decomp_size; count++)
     407             :     {
     408          46 :         pg_wchar    ch = decomp_chars[count];
     409          46 :         pg_unicode_decomposition *ch_entry = get_code_entry(ch);
     410          46 :         int         ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
     411             :         pg_wchar    composite;
     412             : 
     413          92 :         if (last_class < ch_class &&
     414          46 :             recompose_code(starter_ch, ch, &composite))
     415             :         {
     416           0 :             recomp_chars[starter_pos] = composite;
     417           0 :             starter_ch = composite;
     418             :         }
     419          46 :         else if (ch_class == 0)
     420             :         {
     421          46 :             starter_pos = target_pos;
     422          46 :             starter_ch = ch;
     423          46 :             last_class = -1;
     424          46 :             recomp_chars[target_pos++] = ch;
     425             :         }
     426             :         else
     427             :         {
     428           0 :             last_class = ch_class;
     429           0 :             recomp_chars[target_pos++] = ch;
     430             :         }
     431             :     }
     432          16 :     recomp_chars[target_pos] = (pg_wchar) '\0';
     433             : 
     434          16 :     FREE(decomp_chars);
     435             : 
     436          16 :     return recomp_chars;
     437             : }

Generated by: LCOV version 1.13