LCOV - 77aeca80249c9e640c811e80633a2e334a9320de vs 38afc3dcb25c45b744d4025029ce0a6c90b7059f

LCOV - differential code coverage report

Current view:	top level - src/common - unicode_norm.c (source / functions)		Coverage	Total	Hit	UBC	GNC	CBC	DCB
Current:	77aeca80249c9e640c811e80633a2e334a9320de vs 38afc3dcb25c45b744d4025029ce0a6c90b7059f	Lines:	90.7 %	204	185	19	1	184	1
Current Date:	2026-07-25 19:08:27 +0900	Functions:	100.0 %	11	11		1	10
Baseline:	lcov-20260725-baseline	Branches:	84.1 %	151	127	24	2	125
Baseline Date:	2026-07-25 19:09:19 +0900	Line coverage date bins:
Legend:	Lines: hit not hit Branches: + taken - not taken # not executed	(7,30] days:	100.0 %	1	1		1
		(30,360] days:	91.7 %	24	22	2		22
		(360..) days:	90.5 %	179	162	17		162
		Function coverage date bins:
		(30,360] days:	100.0 %	8	8			8
		(360..) days:	100.0 %	3	3		1	2
		Branch coverage date bins:
		(7,30] days:	100.0 %	2	2		2
		(30,360] days:	66.7 %	6	4	2		4
		(360..) days:	84.6 %	143	121	22		121

 Age         Owner                    Branch data    TLA  Line data    Source code

                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  * unicode_norm.c
                                  3                 :                :  *      Normalize a Unicode string
                                  4                 :                :  *
                                  5                 :                :  * This implements Unicode normalization, per the documentation at
                                  6                 :                :  * https://www.unicode.org/reports/tr15/.
                                  7                 :                :  *
                                  8                 :                :  * Portions Copyright (c) 2017-2026, PostgreSQL Global Development Group
                                  9                 :                :  *
                                 10                 :                :  * IDENTIFICATION
                                 11                 :                :  *    src/common/unicode_norm.c
                                 12                 :                :  *
                                 13                 :                :  *-------------------------------------------------------------------------
                                 14                 :                :  */
                                 15                 :                : #ifndef FRONTEND
                                 16                 :                : #include "postgres.h"
                                 17                 :                : #else
                                 18                 :                : #include "postgres_fe.h"
                                 19                 :                : #endif
                                 20                 :                : 
                                 21                 :                : #include "common/unicode_norm.h"
                                 22                 :                : #ifndef FRONTEND
                                 23                 :                : #include "common/unicode_norm_hashfunc.h"
                                 24                 :                : #include "common/unicode_normprops_table.h"
                                 25                 :                : #include "port/pg_bswap.h"
                                 26                 :                : #include "utils/memutils.h"
                                 27                 :                : #else
                                 28                 :                : #include "common/unicode_norm_table.h"
                                 29                 :                : #endif
                                 30                 :                : 
                                 31                 :                : #ifndef FRONTEND
                                 32                 :                : #define ALLOC(size) palloc(size)
                                 33                 :                : #define FREE(size) pfree(size)
                                 34                 :                : #else
                                 35                 :                : #define ALLOC(size) malloc(size)
                                 36                 :                : #define FREE(size) free(size)
                                 37                 :                : #endif
                                 38                 :                : 
                                 39                 :                : /* Constants for calculations with Hangul characters */
                                 40                 :                : #define SBASE       0xAC00      /* U+AC00 */
                                 41                 :                : #define LBASE       0x1100      /* U+1100 */
                                 42                 :                : #define VBASE       0x1161      /* U+1161 */
                                 43                 :                : #define TBASE       0x11A7      /* U+11A7 */
                                 44                 :                : #define LCOUNT      19
                                 45                 :                : #define VCOUNT      21
                                 46                 :                : #define TCOUNT      28
                                 47                 :                : #define NCOUNT      VCOUNT * TCOUNT
                                 48                 :                : #define SCOUNT      LCOUNT * NCOUNT
                                 49                 :                : 
                                 50                 :                : #ifdef FRONTEND
                                 51                 :                : /* comparison routine for bsearch() of decomposition lookup table. */
                                 52                 :                : static int
 2100 michael@paquier.xyz        53                 :CBC       14699 : conv_compare(const void *p1, const void *p2)
                                 54                 :                : {
                                 55                 :                :     uint32      v1,
                                 56                 :                :                 v2;
                                 57                 :                : 
                                 58                 :          14699 :     v1 = *(const uint32 *) p1;
                                 59                 :          14699 :     v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
                                 60   [ +  +  +  + ]:          14699 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
                                 61                 :                : }
                                 62                 :                : 
                                 63                 :                : #endif
                                 64                 :                : 
                                 65                 :                : /*
                                 66                 :                :  * get_code_entry
                                 67                 :                :  *
                                 68                 :                :  * Get the entry corresponding to code in the decomposition lookup table.
                                 69                 :                :  * The backend version of this code uses a perfect hash function for the
                                 70                 :                :  * lookup, while the frontend version uses a binary search.
                                 71                 :                :  */
                                 72                 :                : static const pg_unicode_decomposition *
  269 jdavis@postgresql.or       73                 :          14728 : get_code_entry(char32_t code)
                                 74                 :                : {
                                 75                 :                : #ifndef FRONTEND
                                 76                 :                :     int         h;
                                 77                 :                :     uint32      hashkey;
 2101 michael@paquier.xyz        78                 :          13597 :     pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
                                 79                 :                : 
                                 80                 :                :     /*
                                 81                 :                :      * Compute the hash function. The hash key is the codepoint with the bytes
                                 82                 :                :      * in network order.
                                 83                 :                :      */
                                 84                 :          13597 :     hashkey = pg_hton32(code);
                                 85                 :          13597 :     h = decompinfo.hash(&hashkey);
                                 86                 :                : 
                                 87                 :                :     /* An out-of-range result implies no match */
                                 88   [ +  -  +  + ]:          13597 :     if (h < 0 || h >= decompinfo.num_decomps)
                                 89                 :           3448 :         return NULL;
                                 90                 :                : 
                                 91                 :                :     /*
                                 92                 :                :      * Since it's a perfect hash, we need only match to the specific codepoint
                                 93                 :                :      * it identifies.
                                 94                 :                :      */
                                 95         [ +  + ]:          10149 :     if (code != decompinfo.decomps[h].codepoint)
                                 96                 :           9443 :         return NULL;
                                 97                 :                : 
                                 98                 :                :     /* Success! */
                                 99                 :            706 :     return &decompinfo.decomps[h];
                                100                 :                : #else
 3396 heikki.linnakangas@i      101                 :           1131 :     return bsearch(&(code),
                                102                 :                :                    UnicodeDecompMain,
                                103                 :                :                    lengthof(UnicodeDecompMain),
                                104                 :                :                    sizeof(pg_unicode_decomposition),
                                105                 :                :                    conv_compare);
                                106                 :                : #endif
                                107                 :                : }
                                108                 :                : 
                                109                 :                : /*
                                110                 :                :  * Get the combining class of the given codepoint.
                                111                 :                :  */
                                112                 :                : static uint8
  269 jdavis@postgresql.or      113                 :           8322 : get_canonical_class(char32_t code)
                                114                 :                : {
 2054 michael@paquier.xyz       115                 :           8322 :     const pg_unicode_decomposition *entry = get_code_entry(code);
                                116                 :                : 
                                117                 :                :     /*
                                118                 :                :      * If no entries are found, the character used is either a Hangul
                                119                 :                :      * character or a character with a class of 0 and no decompositions.
                                120                 :                :      */
                                121         [ +  + ]:           8322 :     if (!entry)
                                122                 :           7982 :         return 0;
                                123                 :                :     else
                                124                 :            340 :         return entry->comb_class;
                                125                 :                : }
                                126                 :                : 
                                127                 :                : /*
                                128                 :                :  * Given a decomposition entry looked up earlier, get the decomposed
                                129                 :                :  * characters.
                                130                 :                :  *
                                131                 :                :  * Note: the returned pointer can point to a statically allocated buffer,
                                132                 :                :  * and is only valid until the next call to this function!
                                133                 :                :  */
                                134                 :                : static const char32_t *
 2101                           135                 :            138 : get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
                                136                 :                : {
                                137                 :                :     static char32_t x;
                                138                 :                : 
 3396 heikki.linnakangas@i      139         [ +  + ]:            138 :     if (DECOMPOSITION_IS_INLINE(entry))
                                140                 :                :     {
                                141         [ -  + ]:             42 :         Assert(DECOMPOSITION_SIZE(entry) == 1);
  269 jdavis@postgresql.or      142                 :             42 :         x = (char32_t) entry->dec_index;
 3396 heikki.linnakangas@i      143                 :             42 :         *dec_size = 1;
                                144                 :             42 :         return &x;
                                145                 :                :     }
                                146                 :                :     else
                                147                 :                :     {
                                148                 :             96 :         *dec_size = DECOMPOSITION_SIZE(entry);
                                149                 :             96 :         return &UnicodeDecomp_codepoints[entry->dec_index];
                                150                 :                :     }
                                151                 :                : }
                                152                 :                : 
                                153                 :                : /*
                                154                 :                :  * Calculate how many characters a given character will decompose to.
                                155                 :                :  *
                                156                 :                :  * This needs to recurse, if the character decomposes into characters that
                                157                 :                :  * are, in turn, decomposable.
                                158                 :                :  */
                                159                 :                : static int
  269 jdavis@postgresql.or      160                 :           3238 : get_decomposed_size(char32_t code, bool compat)
                                161                 :                : {
                                162                 :                :     const pg_unicode_decomposition *entry;
 3396 heikki.linnakangas@i      163                 :           3238 :     int         size = 0;
                                164                 :                :     int         i;
                                165                 :                :     const uint32 *decomp;
                                166                 :                :     int         dec_size;
                                167                 :                : 
                                168                 :                :     /*
                                169                 :                :      * Fast path for Hangul characters not stored in tables to save memory as
                                170                 :                :      * decomposition is algorithmic. See
                                171                 :                :      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
                                172                 :                :      * on the matter.
                                173                 :                :      */
                                174   [ +  +  +  - ]:           3238 :     if (code >= SBASE && code < SBASE + SCOUNT)
                                175                 :                :     {
                                176                 :                :         uint32      tindex,
                                177                 :                :                     sindex;
                                178                 :                : 
                                179                 :             35 :         sindex = code - SBASE;
                                180                 :             35 :         tindex = sindex % TCOUNT;
                                181                 :                : 
                                182         [ +  + ]:             35 :         if (tindex != 0)
                                183                 :             10 :             return 3;
                                184                 :             25 :         return 2;
                                185                 :                :     }
                                186                 :                : 
                                187                 :           3203 :     entry = get_code_entry(code);
                                188                 :                : 
                                189                 :                :     /*
                                190                 :                :      * Just count current code if no other decompositions.  A NULL entry is
                                191                 :                :      * equivalent to a character with class 0 and no decompositions.
                                192                 :                :      */
 2314 peter@eisentraut.org      193   [ +  +  +  + ]:           3203 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
                                194   [ +  +  +  + ]:             96 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
 3396 heikki.linnakangas@i      195                 :           3134 :         return 1;
                                196                 :                : 
                                197                 :                :     /*
                                198                 :                :      * If this entry has other decomposition codes look at them as well. First
                                199                 :                :      * get its decomposition in the list of tables available.
                                200                 :                :      */
                                201                 :             69 :     decomp = get_code_decomposition(entry, &dec_size);
                                202         [ +  + ]:            186 :     for (i = 0; i < dec_size; i++)
                                203                 :                :     {
                                204                 :            117 :         uint32      lcode = decomp[i];
                                205                 :                : 
 2314 peter@eisentraut.org      206                 :            117 :         size += get_decomposed_size(lcode, compat);
                                207                 :                :     }
                                208                 :                : 
 3396 heikki.linnakangas@i      209                 :             69 :     return size;
                                210                 :                : }
                                211                 :                : 
                                212                 :                : /*
                                213                 :                :  * Recompose a pair of characters. For Hangul characters, the calculation
                                214                 :                :  * is algorithmic. For others, an inverse lookup in the decomposition
                                215                 :                :  * table is necessary. Returns true and sets *result if a recomposition
                                216                 :                :  * can be done, and false otherwise.
                                217                 :                :  */
                                218                 :                : static bool
                                219                 :           2586 : recompose_code(uint32 start, uint32 code, uint32 *result)
                                220                 :                : {
                                221                 :                :     /*
                                222                 :                :      * Handle Hangul characters algorithmically, per the Unicode spec.
                                223                 :                :      *
                                224                 :                :      * Check if two current characters are L and V.
                                225                 :                :      */
                                226   [ +  +  +  +  :           2586 :     if (start >= LBASE && start < LBASE + LCOUNT &&
                                              +  - ]
                                227         [ +  - ]:             45 :         code >= VBASE && code < VBASE + VCOUNT)
                                228                 :                :     {
                                229                 :                :         /* make syllable of form LV */
                                230                 :             45 :         uint32      lindex = start - LBASE;
                                231                 :             45 :         uint32      vindex = code - VBASE;
                                232                 :                : 
                                233                 :             45 :         *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
                                234                 :             45 :         return true;
                                235                 :                :     }
                                236                 :                :     /* Check if two current characters are LV and T */
                                237   [ +  +  +  - ]:           2541 :     else if (start >= SBASE && start < (SBASE + SCOUNT) &&
                                238   [ +  -  +  + ]:             35 :              ((start - SBASE) % TCOUNT) == 0 &&
   50 michael@paquier.xyz       239         [ +  - ]:             25 :              code > TBASE && code < (TBASE + TCOUNT))
                                240                 :                :     {
                                241                 :                :         /* make syllable of form LVT */
 3396 heikki.linnakangas@i      242                 :             25 :         uint32      tindex = code - TBASE;
                                243                 :                : 
                                244                 :             25 :         *result = start + tindex;
                                245                 :             25 :         return true;
                                246                 :                :     }
                                247                 :                :     else
                                248                 :                :     {
                                249                 :                :         const pg_unicode_decomposition *entry;
                                250                 :                : 
                                251                 :                :         /*
                                252                 :                :          * Do an inverse lookup of the decomposition tables to see if anything
                                253                 :                :          * matches. The comparison just needs to be a perfect match on the
                                254                 :                :          * sub-table of size two, because the start character has already been
                                255                 :                :          * recomposed partially.  This lookup uses a perfect hash function for
                                256                 :                :          * the backend code.
                                257                 :                :          */
                                258                 :                : #ifndef FRONTEND
                                259                 :                : 
                                260                 :                :         int         h,
                                261                 :                :                     inv_lookup_index;
                                262                 :                :         uint64      hashkey;
 2101 michael@paquier.xyz       263                 :           2317 :         pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
                                264                 :                : 
                                265                 :                :         /*
                                266                 :                :          * Compute the hash function. The hash key is formed by concatenating
                                267                 :                :          * bytes of the two codepoints in network order. See also
                                268                 :                :          * src/common/unicode/generate-unicode_norm_table.pl.
                                269                 :                :          */
                                270                 :           2317 :         hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
                                271                 :           2317 :         h = recompinfo.hash(&hashkey);
                                272                 :                : 
                                273                 :                :         /* An out-of-range result implies no match */
                                274   [ +  +  +  + ]:           2317 :         if (h < 0 || h >= recompinfo.num_recomps)
                                275                 :           1916 :             return false;
                                276                 :                : 
                                277                 :            433 :         inv_lookup_index = recompinfo.inverse_lookup[h];
                                278                 :            433 :         entry = &UnicodeDecompMain[inv_lookup_index];
                                279                 :                : 
                                280         [ +  + ]:            433 :         if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
                                281         [ +  + ]:             36 :             code == UnicodeDecomp_codepoints[entry->dec_index + 1])
                                282                 :                :         {
                                283                 :             32 :             *result = entry->codepoint;
                                284                 :             32 :             return true;
                                285                 :                :         }
                                286                 :                : 
                                287                 :                : #else
                                288                 :                : 
   14 peter@eisentraut.org      289         [ +  + ]:GNC     1368921 :         for (size_t i = 0; i < lengthof(UnicodeDecompMain); i++)
                                290                 :                :         {
 2101 michael@paquier.xyz       291                 :CBC     1368722 :             entry = &UnicodeDecompMain[i];
                                292                 :                : 
 3396 heikki.linnakangas@i      293         [ +  + ]:        1368722 :             if (DECOMPOSITION_SIZE(entry) != 2)
                                294                 :        1031616 :                 continue;
                                295                 :                : 
                                296         [ +  + ]:         337106 :             if (DECOMPOSITION_NO_COMPOSE(entry))
                                297                 :         145867 :                 continue;
                                298                 :                : 
                                299         [ +  + ]:         191239 :             if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
                                300         [ -  + ]:           1718 :                 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
                                301                 :                :             {
 3396 heikki.linnakangas@i      302                 :UBC           0 :                 *result = entry->codepoint;
                                303                 :              0 :                 return true;
                                304                 :                :             }
                                305                 :                :         }
                                306                 :                : #endif                          /* !FRONTEND */
                                307                 :                :     }
                                308                 :                : 
 3396 heikki.linnakangas@i      309                 :CBC         600 :     return false;
                                310                 :                : }
                                311                 :                : 
                                312                 :                : /*
                                313                 :                :  * Decompose the given code into the array given by caller. The
                                314                 :                :  * decomposition begins at the position given by caller, saving one
                                315                 :                :  * lookup on the decomposition table. The current position needs to be
                                316                 :                :  * updated here to let the caller know from where to continue filling
                                317                 :                :  * in the array result.
                                318                 :                :  */
                                319                 :                : static void
  269 jdavis@postgresql.or      320                 :           3238 : decompose_code(char32_t code, bool compat, char32_t **result, int *current)
                                321                 :                : {
                                322                 :                :     const pg_unicode_decomposition *entry;
                                323                 :                :     int         i;
                                324                 :                :     const uint32 *decomp;
                                325                 :                :     int         dec_size;
                                326                 :                : 
                                327                 :                :     /*
                                328                 :                :      * Fast path for Hangul characters not stored in tables to save memory as
                                329                 :                :      * decomposition is algorithmic. See
                                330                 :                :      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
                                331                 :                :      * on the matter.
                                332                 :                :      */
 3396 heikki.linnakangas@i      333   [ +  +  +  - ]:           3238 :     if (code >= SBASE && code < SBASE + SCOUNT)
                                334                 :                :     {
                                335                 :                :         uint32      l,
                                336                 :                :                     v,
                                337                 :                :                     tindex,
                                338                 :                :                     sindex;
  269 jdavis@postgresql.or      339                 :             35 :         char32_t   *res = *result;
                                340                 :                : 
 3396 heikki.linnakangas@i      341                 :             35 :         sindex = code - SBASE;
                                342                 :             35 :         l = LBASE + sindex / (VCOUNT * TCOUNT);
                                343                 :             35 :         v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
                                344                 :             35 :         tindex = sindex % TCOUNT;
                                345                 :                : 
                                346                 :             35 :         res[*current] = l;
                                347                 :             35 :         (*current)++;
                                348                 :             35 :         res[*current] = v;
                                349                 :             35 :         (*current)++;
                                350                 :                : 
                                351         [ +  + ]:             35 :         if (tindex != 0)
                                352                 :                :         {
                                353                 :             10 :             res[*current] = TBASE + tindex;
                                354                 :             10 :             (*current)++;
                                355                 :                :         }
                                356                 :                : 
                                357                 :           3169 :         return;
                                358                 :                :     }
                                359                 :                : 
                                360                 :           3203 :     entry = get_code_entry(code);
                                361                 :                : 
                                362                 :                :     /*
                                 363                 :                :      * Just store the code itself if there are no decomposition codes to
                                 364                 :                :      * recurse to.  A NULL entry is equivalent to a character with class 0
                                 365                 :                :      * and no decompositions, so return early in that case too, as well as
                                 366                 :                :      * for a DECOMPOSITION_IS_COMPAT entry when "compat" is not set.
                                367                 :                :      */
 2314 peter@eisentraut.org      368   [ +  +  +  + ]:           3203 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
                                369   [ +  +  +  + ]:             96 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
                                370                 :                :     {
  269 jdavis@postgresql.or      371                 :           3134 :         char32_t   *res = *result;
                                372                 :                : 
 3396 heikki.linnakangas@i      373                 :           3134 :         res[*current] = code;
                                374                 :           3134 :         (*current)++;
                                375                 :           3134 :         return;
                                376                 :                :     }
                                377                 :                : 
                                378                 :                :     /*
                                379                 :                :      * If this entry has other decomposition codes look at them as well.
                                380                 :                :      */
                                381                 :             69 :     decomp = get_code_decomposition(entry, &dec_size);
                                382         [ +  + ]:            186 :     for (i = 0; i < dec_size; i++)
                                383                 :                :     {
  269 jdavis@postgresql.or      384                 :            117 :         char32_t    lcode = (char32_t) decomp[i];
                                385                 :                : 
                                 386                 :                :         /* Recurse, as each component may itself decompose further */
 2314 peter@eisentraut.org      387                 :            117 :         decompose_code(lcode, compat, result, current);
                                388                 :                :     }
                                389                 :                : }
                                390                 :                : 
                                391                 :                : /*
                                392                 :                :  * unicode_normalize - Normalize a Unicode string to the specified form.
                                393                 :                :  *
                                394                 :                :  * The input is a 0-terminated array of codepoints.
                                395                 :                :  *
                                396                 :                :  * In frontend, returns a 0-terminated array of codepoints, allocated with
                                397                 :                :  * malloc. Or NULL if we run out of memory. In backend, the returned
                                398                 :                :  * string is palloc'd instead, and OOM is reported with ereport().
                                399                 :                :  */
                                400                 :                : char32_t *
  269 jdavis@postgresql.or      401                 :            426 : unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
                                402                 :                : {
 2314 peter@eisentraut.org      403   [ +  +  +  + ]:            426 :     bool        compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
                                404   [ +  +  +  + ]:            426 :     bool        recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
                                405                 :                :     char32_t   *decomp_chars;
                                406                 :                :     char32_t   *recomp_chars;
                                407                 :                :     int         decomp_size,
                                408                 :                :                 current_size;
                                409                 :                :     int         count;
                                410                 :                :     const char32_t *p;
                                411                 :                : 
                                412                 :                :     /* variables for recomposition */
                                413                 :                :     int         last_class;
                                414                 :                :     int         starter_pos;
                                415                 :                :     int         target_pos;
                                416                 :                :     uint32      starter_ch;
                                417                 :                : 
                                418                 :                :     /* First, do character decomposition */
                                419                 :                : 
                                420                 :                :     /*
                                421                 :                :      * Calculate how many characters long the decomposed version will be.
                                422                 :                :      *
                                423                 :                :      * Some characters decompose to quite a few code points, so that the
                                424                 :                :      * decomposed version's size could overrun MaxAllocSize, and even 32-bit
                                425                 :                :      * size_t, even though the input string presumably fits in that.  In
                                426                 :                :      * frontend we want to just return NULL in that case, so monitor the sum
                                427                 :                :      * and exit early once we'd need more than MaxAllocSize bytes.
                                428                 :                :      */
 3396 heikki.linnakangas@i      429                 :            426 :     decomp_size = 0;
                                430         [ +  + ]:           3547 :     for (p = input; *p; p++)
                                431                 :                :     {
 2314 peter@eisentraut.org      432                 :           3121 :         decomp_size += get_decomposed_size(*p, compat);
   75 tgl@sss.pgh.pa.us         433         [ -  + ]:           3121 :         if (unlikely(decomp_size > MaxAllocSize / sizeof(char32_t)))
                                434                 :                :         {
                                435                 :                : #ifndef FRONTEND
                                436                 :                :             /* Exit loop and let palloc() throw error below */
   75 tgl@sss.pgh.pa.us         437                 :UBC           0 :             break;
                                438                 :                : #else
                                439                 :                :             /* Just return NULL with no explicit error */
                                440                 :              0 :             return NULL;
                                441                 :                : #endif
                                442                 :                :         }
                                443                 :                :     }
                                444                 :                : 
  269 jdavis@postgresql.or      445                 :CBC         426 :     decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
 3396 heikki.linnakangas@i      446         [ -  + ]:            426 :     if (decomp_chars == NULL)
 3396 heikki.linnakangas@i      447                 :UBC           0 :         return NULL;
                                448                 :                : 
                                449                 :                :     /*
                                450                 :                :      * Now fill in each entry recursively. This needs a second pass on the
                                451                 :                :      * decomposition table.
                                452                 :                :      */
 3396 heikki.linnakangas@i      453                 :CBC         426 :     current_size = 0;
                                454         [ +  + ]:           3547 :     for (p = input; *p; p++)
 2314 peter@eisentraut.org      455                 :           3121 :         decompose_code(*p, compat, &decomp_chars, &current_size);
 3396 heikki.linnakangas@i      456                 :            426 :     decomp_chars[decomp_size] = '\0';
                                457         [ -  + ]:            426 :     Assert(decomp_size == current_size);
                                458                 :                : 
                                459                 :                :     /* Leave if there is nothing to decompose */
 1717 michael@paquier.xyz       460         [ +  + ]:            426 :     if (decomp_size == 0)
                                461                 :             13 :         return decomp_chars;
                                462                 :                : 
                                463                 :                :     /*
                                464                 :                :      * Now apply canonical ordering.
                                465                 :                :      */
 3396 heikki.linnakangas@i      466         [ +  + ]:           3214 :     for (count = 1; count < decomp_size; count++)
                                467                 :                :     {
  269 jdavis@postgresql.or      468                 :           2801 :         char32_t    prev = decomp_chars[count - 1];
                                469                 :           2801 :         char32_t    next = decomp_chars[count];
                                470                 :                :         char32_t    tmp;
 2054 michael@paquier.xyz       471                 :           2801 :         const uint8 prevClass = get_canonical_class(prev);
                                472                 :           2801 :         const uint8 nextClass = get_canonical_class(next);
                                473                 :                : 
                                474                 :                :         /*
                                475                 :                :          * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
                                476                 :                :          * annex 4, a sequence of two adjacent characters in a string is an
                                477                 :                :          * exchangeable pair if the combining class (from the Unicode
                                478                 :                :          * Character Database) for the first character is greater than the
                                479                 :                :          * combining class for the second, and the second is not a starter.  A
                                480                 :                :          * character is a starter if its combining class is 0.
                                481                 :                :          */
                                482   [ +  +  +  - ]:           2801 :         if (prevClass == 0 || nextClass == 0)
 3396 heikki.linnakangas@i      483                 :           2801 :             continue;
                                484                 :                : 
 2054 michael@paquier.xyz       485         [ #  # ]:UBC           0 :         if (prevClass <= nextClass)
 3396 heikki.linnakangas@i      486                 :              0 :             continue;
                                487                 :                : 
                                488                 :                :         /* exchange can happen */
                                489                 :              0 :         tmp = decomp_chars[count - 1];
                                490                 :              0 :         decomp_chars[count - 1] = decomp_chars[count];
                                491                 :              0 :         decomp_chars[count] = tmp;
                                492                 :                : 
                                493                 :                :         /* backtrack to check again */
                                494         [ #  # ]:              0 :         if (count > 1)
                                495                 :              0 :             count -= 2;
                                496                 :                :     }
                                497                 :                : 
 2314 peter@eisentraut.org      498         [ +  + ]:CBC         413 :     if (!recompose)
                                499                 :             73 :         return decomp_chars;
                                500                 :                : 
                                501                 :                :     /*
                                502                 :                :      * The last phase of NFC and NFKC is the recomposition of the reordered
                                503                 :                :      * Unicode string using combining classes. The recomposed string cannot be
                                504                 :                :      * longer than the decomposed one, so make the allocation of the output
                                505                 :                :      * string based on that assumption.
                                506                 :                :      */
  269 jdavis@postgresql.or      507                 :            340 :     recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
 3396 heikki.linnakangas@i      508         [ -  + ]:            340 :     if (!recomp_chars)
                                509                 :                :     {
 3396 heikki.linnakangas@i      510                 :UBC           0 :         FREE(decomp_chars);
                                511                 :              0 :         return NULL;
                                512                 :                :     }
                                513                 :                : 
 3396 heikki.linnakangas@i      514                 :CBC         340 :     last_class = -1;            /* this eliminates a special check */
                                515                 :            340 :     starter_pos = 0;
                                516                 :            340 :     target_pos = 1;
                                517                 :            340 :     starter_ch = recomp_chars[0] = decomp_chars[0];
                                518                 :                : 
                                519         [ +  + ]:           2926 :     for (count = 1; count < decomp_size; count++)
                                520                 :                :     {
  269 jdavis@postgresql.or      521                 :           2586 :         char32_t    ch = decomp_chars[count];
 2054 michael@paquier.xyz       522                 :           2586 :         int         ch_class = get_canonical_class(ch);
                                523                 :                :         char32_t    composite;
                                524                 :                : 
 3396 heikki.linnakangas@i      525   [ +  -  +  + ]:           5172 :         if (last_class < ch_class &&
                                526                 :           2586 :             recompose_code(starter_ch, ch, &composite))
                                527                 :                :         {
                                528                 :            102 :             recomp_chars[starter_pos] = composite;
                                529                 :            102 :             starter_ch = composite;
                                530                 :                :         }
                                531         [ +  - ]:           2484 :         else if (ch_class == 0)
                                532                 :                :         {
                                533                 :           2484 :             starter_pos = target_pos;
                                534                 :           2484 :             starter_ch = ch;
                                535                 :           2484 :             last_class = -1;
                                536                 :           2484 :             recomp_chars[target_pos++] = ch;
                                537                 :                :         }
                                538                 :                :         else
                                539                 :                :         {
 3396 heikki.linnakangas@i      540                 :UBC           0 :             last_class = ch_class;
                                541                 :              0 :             recomp_chars[target_pos++] = ch;
                                542                 :                :         }
                                543                 :                :     }
  269 jdavis@postgresql.or      544                 :CBC         340 :     recomp_chars[target_pos] = (char32_t) '\0';
                                545                 :                : 
 3396 heikki.linnakangas@i      546                 :            340 :     FREE(decomp_chars);
                                547                 :                : 
                                548                 :            340 :     return recomp_chars;
                                549                 :                : }
                                550                 :                : 
                                551                 :                : /*
                                552                 :                :  * Normalization "quick check" algorithm; see
                                 553                 :                :  * <https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
                                554                 :                :  */
                                555                 :                : 
                                556                 :                : /* We only need this in the backend. */
                                557                 :                : #ifndef FRONTEND
                                558                 :                : 
                                559                 :                : static const pg_unicode_normprops *
  269 jdavis@postgresql.or      560                 :            134 : qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
                                561                 :                : {
                                562                 :                :     int         h;
                                563                 :                :     uint32      hashkey;
                                564                 :                : 
                                565                 :                :     /*
                                566                 :                :      * Compute the hash function. The hash key is the codepoint with the bytes
                                567                 :                :      * in network order.
                                568                 :                :      */
 2112 michael@paquier.xyz       569                 :            134 :     hashkey = pg_hton32(ch);
 2113                           570                 :            134 :     h = norminfo->hash(&hashkey);
                                571                 :                : 
                                572                 :                :     /* An out-of-range result implies no match */
                                573   [ +  -  +  + ]:            134 :     if (h < 0 || h >= norminfo->num_normprops)
                                574                 :             92 :         return NULL;
                                575                 :                : 
                                576                 :                :     /*
                                577                 :                :      * Since it's a perfect hash, we need only match to the specific codepoint
                                578                 :                :      * it identifies.
                                579                 :                :      */
                                580         [ +  + ]:             42 :     if (ch != norminfo->normprops[h].codepoint)
                                581                 :             18 :         return NULL;
                                582                 :                : 
                                583                 :                :     /* Success! */
                                584                 :             24 :     return &norminfo->normprops[h];
                                585                 :                : }
                                586                 :                : 
                                587                 :                : /*
                                588                 :                :  * Look up the normalization quick check character property
                                589                 :                :  */
                                590                 :                : static UnicodeNormalizationQC
  269 jdavis@postgresql.or      591                 :            134 : qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
                                592                 :                : {
 2113 michael@paquier.xyz       593                 :            134 :     const pg_unicode_normprops *found = NULL;
                                594                 :                : 
 2312 peter@eisentraut.org      595      [ +  +  - ]:            134 :     switch (form)
                                596                 :                :     {
                                597                 :             86 :         case UNICODE_NFC:
 2113 michael@paquier.xyz       598                 :             86 :             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
 2312 peter@eisentraut.org      599                 :             86 :             break;
                                600                 :             48 :         case UNICODE_NFKC:
 2113 michael@paquier.xyz       601                 :             48 :             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
 2312 peter@eisentraut.org      602                 :             48 :             break;
 2312 peter@eisentraut.org      603                 :UBC           0 :         default:
                                604                 :              0 :             Assert(false);
                                605                 :                :             break;
                                606                 :                :     }
                                607                 :                : 
 2312 peter@eisentraut.org      608         [ +  + ]:CBC         134 :     if (found)
                                609                 :             24 :         return found->quickcheck;
                                610                 :                :     else
                                611                 :            110 :         return UNICODE_NORM_QC_YES;
                                612                 :                : }
                                613                 :                : 
                                614                 :                : UnicodeNormalizationQC
  269 jdavis@postgresql.or      615                 :             90 : unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
                                616                 :                : {
 2312 peter@eisentraut.org      617                 :             90 :     uint8       lastCanonicalClass = 0;
                                618                 :             90 :     UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
                                619                 :                : 
                                620                 :                :     /*
                                621                 :                :      * For the "D" forms, we don't run the quickcheck.  We don't include the
                                622                 :                :      * lookup tables for those because they are huge, checking for these
                                623                 :                :      * particular forms is less common, and running the slow path is faster
                                624                 :                :      * for the "D" forms than the "C" forms because you don't need to
                                625                 :                :      * recompose, which is slow.
                                626                 :                :      */
                                627   [ +  +  +  + ]:             90 :     if (form == UNICODE_NFD || form == UNICODE_NFKD)
                                628                 :             40 :         return UNICODE_NORM_QC_MAYBE;
                                629                 :                : 
  269 jdavis@postgresql.or      630         [ +  + ]:            176 :     for (const char32_t *p = input; *p; p++)
                                631                 :                :     {
                                632                 :            134 :         char32_t    ch = *p;
                                633                 :                :         uint8       canonicalClass;
                                634                 :                :         UnicodeNormalizationQC check;
                                635                 :                : 
 2312 peter@eisentraut.org      636                 :            134 :         canonicalClass = get_canonical_class(ch);
                                637   [ +  +  -  + ]:            134 :         if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
 2312 peter@eisentraut.org      638                 :UBC           0 :             return UNICODE_NORM_QC_NO;
                                639                 :                : 
 2312 peter@eisentraut.org      640                 :CBC         134 :         check = qc_is_allowed(form, ch);
                                641         [ +  + ]:            134 :         if (check == UNICODE_NORM_QC_NO)
                                642                 :              8 :             return UNICODE_NORM_QC_NO;
                                643         [ +  + ]:            126 :         else if (check == UNICODE_NORM_QC_MAYBE)
                                644                 :             16 :             result = UNICODE_NORM_QC_MAYBE;
                                645                 :                : 
                                646                 :            126 :         lastCanonicalClass = canonicalClass;
                                647                 :                :     }
                                648                 :             42 :     return result;
                                649                 :                : }
                                650                 :                : 
                                651                 :                : #endif                          /* !FRONTEND */

Generated by: LCOV version 2.0-1