LCOV - PostgreSQL 20devel - src/backend/utils/adt/levenshtein.c

LCOV - code coverage report

Current view:	top level - src/backend/utils/adt - levenshtein.c (source / functions)		Coverage	Total	Hit
Test:	PostgreSQL 20devel	Lines:	94.3 %	105	99
Test Date:	2026-07-22 12:15:41	Functions:	100.0 %	2	2
Legend:	Lines: hit not hit Branches: + taken - not taken # not executed	Branches:	68.3 %	120	82

             Branch data     Line data    Source code

       1                 :             : /*-------------------------------------------------------------------------
       2                 :             :  *
       3                 :             :  * levenshtein.c
       4                 :             :  *    Levenshtein distance implementation.
       5                 :             :  *
       6                 :             :  * Original author:  Joe Conway <mail@joeconway.com>
       7                 :             :  *
       8                 :             :  * This file is included by varlena.c twice, to provide matching code for (1)
       9                 :             :  * Levenshtein distance with custom costings, and (2) Levenshtein distance with
      10                 :             :  * custom costings and a "max" value above which exact distances are not
      11                 :             :  * interesting.  Before the inclusion, we rely on the presence of the inline
      12                 :             :  * function rest_of_char_same().
      13                 :             :  *
      14                 :             :  * Written based on a description of the algorithm by Michael Gilleland found
      15                 :             :  * at http://www.merriampark.com/ld.htm.  Also looked at levenshtein.c in the
      16                 :             :  * PHP 4.0.6 distribution for inspiration.  Configurable penalty costs
      17                 :             :  * extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com>.
      18                 :             :  *
      19                 :             :  * Copyright (c) 2001-2026, PostgreSQL Global Development Group
      20                 :             :  *
      21                 :             :  * IDENTIFICATION
      22                 :             :  *  src/backend/utils/adt/levenshtein.c
      23                 :             :  *
      24                 :             :  *-------------------------------------------------------------------------
      25                 :             :  */
      26                 :             : #define MAX_LEVENSHTEIN_STRLEN      255
      27                 :             : 
      28                 :             : /*
      29                 :             :  * Calculates Levenshtein distance metric between supplied strings, which are
      30                 :             :  * not necessarily null-terminated.
      31                 :             :  *
      32                 :             :  * source: source string, of length slen bytes.
      33                 :             :  * target: target string, of length tlen bytes.
      34                 :             :  * ins_c, del_c, sub_c: costs to charge for character insertion, deletion,
      35                 :             :  *      and substitution respectively; (1, 1, 1) costs suffice for common
      36                 :             :  *      cases, but your mileage may vary.
      37                 :             :  * max_d: if provided and >= 0, maximum distance we care about; see below.
      38                 :             :  * trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN.
      39                 :             :  *
      40                 :             :  * One way to compute Levenshtein distance is to incrementally construct
      41                 :             :  * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
      42                 :             :  * of operations required to transform the first i characters of s into
      43                 :             :  * the first j characters of t.  The last column of the final row is the
      44                 :             :  * answer.
      45                 :             :  *
      46                 :             :  * We use that algorithm here with some modification.  In lieu of holding
      47                 :             :  * the entire array in memory at once, we'll just use two arrays of size
      48                 :             :  * m+1 for storing accumulated values. At each step one array represents
      49                 :             :  * the "previous" row and one is the "current" row of the notional large
      50                 :             :  * array.
      51                 :             :  *
      52                 :             :  * If max_d >= 0, we only need to provide an accurate answer when that answer
      53                 :             :  * is less than or equal to max_d.  From any cell in the matrix, there is a
      54                 :             :  * theoretical "minimum residual distance" from that cell to the last column
      55                 :             :  * of the final row.  This minimum residual distance is zero when the
      56                 :             :  * untransformed portions of the strings are of equal length (because we might
      57                 :             :  * get lucky and find all the remaining characters matching) and is otherwise
      58                 :             :  * based on the minimum number of insertions or deletions needed to make them
      59                 :             :  * equal length.  The residual distance grows as we move toward the upper
      60                 :             :  * right or lower left corners of the matrix.  When the max_d bound is
      61                 :             :  * usefully tight, we can use this property to avoid computing the entirety
      62                 :             :  * of each row; instead, we maintain a start_column and stop_column that
      63                 :             :  * identify the portion of the matrix close to the diagonal which can still
      64                 :             :  * affect the final answer.
      65                 :             :  */
      66                 :             : int
      67                 :             : #ifdef LEVENSHTEIN_LESS_EQUAL
      68                 :        1933 : varstr_levenshtein_less_equal(const char *source, int slen,
      69                 :             :                               const char *target, int tlen,
      70                 :             :                               int ins_c, int del_c, int sub_c,
      71                 :             :                               int max_d, bool trusted)
      72                 :             : #else
      73                 :           2 : varstr_levenshtein(const char *source, int slen,
      74                 :             :                    const char *target, int tlen,
      75                 :             :                    int ins_c, int del_c, int sub_c,
      76                 :             :                    bool trusted)
      77                 :             : #endif
      78                 :             : {
      79                 :             :     int         m,
      80                 :             :                 n;
      81                 :             :     int        *prev;
      82                 :             :     int        *curr;
      83                 :        1935 :     int        *s_char_len = NULL;
      84                 :             :     int         j;
      85                 :             :     const char *y;
      86                 :        1935 :     const char *send = source + slen;
      87                 :        1935 :     const char *tend = target + tlen;
      88                 :             : 
      89                 :             :     /*
      90                 :             :      * For varstr_levenshtein_less_equal, we have real variables called
      91                 :             :      * start_column and stop_column; otherwise it's just short-hand for 0 and
      92                 :             :      * m.
      93                 :             :      */
      94                 :             : #ifdef LEVENSHTEIN_LESS_EQUAL
      95                 :             :     int         start_column,
      96                 :             :                 stop_column;
      97                 :             : 
      98                 :             : #undef START_COLUMN
      99                 :             : #undef STOP_COLUMN
     100                 :             : #define START_COLUMN start_column
     101                 :             : #define STOP_COLUMN stop_column
     102                 :             : #else
     103                 :             : #undef START_COLUMN
     104                 :             : #undef STOP_COLUMN
     105                 :             : #define START_COLUMN 0
     106                 :             : #define STOP_COLUMN m
     107                 :             : #endif
     108                 :             : 
     109                 :             :     /* Convert string lengths (in bytes) to lengths in characters */
     110                 :        1935 :     m = pg_mbstrlen_with_len(source, slen);
     111                 :        1935 :     n = pg_mbstrlen_with_len(target, tlen);
     112                 :             : 
     113                 :             :     /*
     114                 :             :      * We can transform an empty s into t with n insertions, or a non-empty s
     115                 :             :      * into an empty t with m deletions.
     116                 :             :      */
     117   [ -  +  -  + ]:        1935 :     if (!m)
     118                 :           0 :         return n * ins_c;
     119   [ -  +  -  + ]:        1935 :     if (!n)
     120                 :           0 :         return m * del_c;
     121                 :             : 
     122                 :             :     /*
     123                 :             :      * For security concerns, restrict excessive CPU+RAM usage. (This
     124                 :             :      * implementation uses O(m) memory and has O(mn) complexity.)  If
     125                 :             :      * "trusted" is true, caller is responsible for not making excessive
     126                 :             :      * requests, typically by using a small max_d along with strings that are
     127                 :             :      * bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly.
     128                 :             :      */
     129   [ +  +  +  -  :        1935 :     if (!trusted &&
             +  -  +  - ]
     130   [ -  +  -  + ]:           4 :         (m > MAX_LEVENSHTEIN_STRLEN ||
     131                 :             :          n > MAX_LEVENSHTEIN_STRLEN))
     132   [ #  #  #  # ]:           0 :         ereport(ERROR,
     133                 :             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     134                 :             :                  errmsg("levenshtein argument exceeds maximum length of %d characters",
     135                 :             :                         MAX_LEVENSHTEIN_STRLEN)));
     136                 :             : 
     137                 :             : #ifdef LEVENSHTEIN_LESS_EQUAL
     138                 :             :     /* Initialize start and stop columns. */
     139                 :        1933 :     start_column = 0;
     140                 :        1933 :     stop_column = m + 1;
     141                 :             : 
     142                 :             :     /*
     143                 :             :      * If max_d >= 0, determine whether the bound is impossibly tight.  If so,
     144                 :             :      * return max_d + 1 immediately.  Otherwise, determine whether it's tight
     145                 :             :      * enough to limit the computation we must perform.  If so, figure out
     146                 :             :      * initial stop column.
     147                 :             :      */
     148         [ +  - ]:        1933 :     if (max_d >= 0)
     149                 :             :     {
     150                 :             :         int         min_theo_d; /* Theoretical minimum distance. */
     151                 :             :         int         max_theo_d; /* Theoretical maximum distance. */
     152                 :        1933 :         int         net_inserts = n - m;
     153                 :             : 
     154                 :        1933 :         min_theo_d = net_inserts < 0 ?
     155         [ +  + ]:        1933 :             -net_inserts * del_c : net_inserts * ins_c;
     156         [ +  + ]:        1933 :         if (min_theo_d > max_d)
     157                 :         693 :             return max_d + 1;
     158         [ -  + ]:        1240 :         if (ins_c + del_c < sub_c)
     159                 :           0 :             sub_c = ins_c + del_c;
     160                 :        1240 :         max_theo_d = min_theo_d + sub_c * Min(m, n);
     161         [ +  + ]:        1240 :         if (max_d >= max_theo_d)
     162                 :         381 :             max_d = -1;
     163         [ +  - ]:         859 :         else if (ins_c + del_c > 0)
     164                 :             :         {
     165                 :             :             /*
     166                 :             :              * Figure out how much of the first row of the notional matrix we
     167                 :             :              * need to fill in.  If the string is growing, the theoretical
     168                 :             :              * minimum distance already incorporates the cost of deleting the
     169                 :             :              * number of characters necessary to make the two strings equal in
     170                 :             :              * length.  Each additional deletion forces another insertion, so
     171                 :             :              * the best-case total cost increases by ins_c + del_c. If the
     172                 :             :              * string is shrinking, the minimum theoretical cost assumes no
     173                 :             :              * excess deletions; that is, we're starting no further right than
     174                 :             :              * column m - n.  If we do start further right, the best-case
     175                 :             :              * total cost increases by ins_c + del_c for each move right.
     176                 :             :              */
     177                 :         859 :             int         slack_d = max_d - min_theo_d;
     178         [ +  + ]:         859 :             int         best_column = net_inserts < 0 ? -net_inserts : 0;
     179                 :             : 
     180                 :         859 :             stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;
     181         [ -  + ]:         859 :             if (stop_column > m)
     182                 :           0 :                 stop_column = m + 1;
     183                 :             :         }
     184                 :             :     }
     185                 :             : #endif
     186                 :             : 
     187                 :             :     /*
     188                 :             :      * In order to avoid calling pg_mblen_range() repeatedly on each character
     189                 :             :      * in s, we cache all the lengths before starting the main loop -- but if
     190                 :             :      * all the characters in both strings are single byte, then we skip this
     191                 :             :      * and use a fast-path in the main loop.  If only one string contains
     192                 :             :      * multi-byte characters, we still build the array, so that the fast-path
     193                 :             :      * needn't deal with the case where the array hasn't been initialized.
     194                 :             :      */
     195   [ +  -  +  +  :        1242 :     if (m != slen || n != tlen)
             +  -  -  + ]
     196                 :             :     {
     197                 :             :         int         i;
     198                 :           4 :         const char *cp = source;
     199                 :             : 
     200                 :           4 :         s_char_len = (int *) palloc((m + 1) * sizeof(int));
     201   [ +  +  -  - ]:          40 :         for (i = 0; i < m; ++i)
     202                 :             :         {
     203                 :          36 :             s_char_len[i] = pg_mblen_range(cp, send);
     204                 :          36 :             cp += s_char_len[i];
     205                 :             :         }
     206                 :           4 :         s_char_len[i] = 0;
     207                 :             :     }
     208                 :             : 
     209                 :             :     /* One more cell for initialization column and row. */
     210                 :        1242 :     ++m;
     211                 :        1242 :     ++n;
     212                 :             : 
     213                 :             :     /* Previous and current rows of notional array. */
     214                 :        1242 :     prev = (int *) palloc(2 * m * sizeof(int));
     215                 :        1242 :     curr = prev + m;
     216                 :             : 
     217                 :             :     /*
     218                 :             :      * To transform the first i characters of s into the first 0 characters of
     219                 :             :      * t, we must perform i deletions.
     220                 :             :      */
     221   [ +  +  +  + ]:        4827 :     for (int i = START_COLUMN; i < STOP_COLUMN; i++)
     222                 :        3585 :         prev[i] = i * del_c;
     223                 :             : 
     224                 :             :     /* Loop through rows of the notional array */
     225   [ +  +  +  + ]:        4830 :     for (y = target, j = 1; j < n; j++)
     226                 :             :     {
     227                 :             :         int        *temp;
     228                 :        4316 :         const char *x = source;
     229   [ +  +  -  + ]:        4316 :         int         y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1;
     230                 :             :         int         i;
     231                 :             : 
     232                 :             : #ifdef LEVENSHTEIN_LESS_EQUAL
     233                 :             : 
     234                 :             :         /*
     235                 :             :          * In the best case, values percolate down the diagonal unchanged, so
     236                 :             :          * we must increment stop_column unless it's already on the right end
     237                 :             :          * of the array.  The inner loop will read prev[stop_column], so we
     238                 :             :          * have to initialize it even though it shouldn't affect the result.
     239                 :             :          */
     240         [ +  + ]:        4304 :         if (stop_column < m)
     241                 :             :         {
     242                 :        3442 :             prev[stop_column] = max_d + 1;
     243                 :        3442 :             ++stop_column;
     244                 :             :         }
     245                 :             : 
     246                 :             :         /*
     247                 :             :          * The main loop fills in curr, but curr[0] needs a special case: to
     248                 :             :          * transform the first 0 characters of s into the first j characters
     249                 :             :          * of t, we must perform j insertions.  However, if start_column > 0,
     250                 :             :          * this special case does not apply.
     251                 :             :          */
     252         [ +  + ]:        4304 :         if (start_column == 0)
     253                 :             :         {
     254                 :        2758 :             curr[0] = j * ins_c;
     255                 :        2758 :             i = 1;
     256                 :             :         }
     257                 :             :         else
     258                 :        1546 :             i = start_column;
     259                 :             : #else
     260                 :          12 :         curr[0] = j * ins_c;
     261                 :          12 :         i = 1;
     262                 :             : #endif
     263                 :             : 
     264                 :             :         /*
     265                 :             :          * This inner loop is critical to performance, so we include a
     266                 :             :          * fast-path to handle the (fairly common) case where no multibyte
     267                 :             :          * characters are in the mix.  The fast-path is entitled to assume
     268                 :             :          * that if s_char_len is not initialized then BOTH strings contain
     269                 :             :          * only single-byte characters.
     270                 :             :          */
     271   [ +  +  -  + ]:        4316 :         if (s_char_len != NULL)
     272                 :             :         {
     273   [ +  +  -  - ]:         248 :             for (; i < STOP_COLUMN; i++)
     274                 :             :             {
     275                 :             :                 int         ins;
     276                 :             :                 int         del;
     277                 :             :                 int         sub;
     278                 :         208 :                 int         x_char_len = s_char_len[i - 1];
     279                 :             : 
     280                 :             :                 /*
     281                 :             :                  * Calculate costs for insertion, deletion, and substitution.
     282                 :             :                  *
     283                 :             :                  * When calculating cost for substitution, we compare the last
     284                 :             :                  * character of each possibly-multibyte character first,
     285                 :             :                  * because that's enough to rule out most mis-matches.  If we
     286                 :             :                  * get past that test, then we compare the lengths and the
     287                 :             :                  * remaining bytes.
     288                 :             :                  */
     289                 :         208 :                 ins = prev[i] + ins_c;
     290                 :         208 :                 del = curr[i - 1] + del_c;
     291   [ +  +  -  - ]:         208 :                 if (x[x_char_len - 1] == y[y_char_len - 1]
     292   [ +  -  -  +  :          36 :                     && x_char_len == y_char_len &&
             -  -  -  - ]
     293   [ #  #  #  # ]:           0 :                     (x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
     294                 :          36 :                     sub = prev[i - 1];
     295                 :             :                 else
     296                 :         172 :                     sub = prev[i - 1] + sub_c;
     297                 :             : 
     298                 :             :                 /* Take the one with minimum cost. */
     299                 :         208 :                 curr[i] = Min(ins, del);
     300                 :         208 :                 curr[i] = Min(curr[i], sub);
     301                 :             : 
     302                 :             :                 /* Point to next character. */
     303                 :         208 :                 x += x_char_len;
     304                 :             :             }
     305                 :             :         }
     306                 :             :         else
     307                 :             :         {
     308   [ +  +  +  + ]:       17292 :             for (; i < STOP_COLUMN; i++)
     309                 :             :             {
     310                 :             :                 int         ins;
     311                 :             :                 int         del;
     312                 :             :                 int         sub;
     313                 :             : 
     314                 :             :                 /* Calculate costs for insertion, deletion, and substitution. */
     315                 :       13016 :                 ins = prev[i] + ins_c;
     316                 :       13016 :                 del = curr[i - 1] + del_c;
     317   [ +  +  +  + ]:       13016 :                 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
     318                 :             : 
     319                 :             :                 /* Take the one with minimum cost. */
     320                 :       13016 :                 curr[i] = Min(ins, del);
     321                 :       13016 :                 curr[i] = Min(curr[i], sub);
     322                 :             : 
     323                 :             :                 /* Point to next character. */
     324                 :       13016 :                 x++;
     325                 :             :             }
     326                 :             :         }
     327                 :             : 
     328                 :             :         /* Swap current row with previous row. */
     329                 :        4316 :         temp = curr;
     330                 :        4316 :         curr = prev;
     331                 :        4316 :         prev = temp;
     332                 :             : 
     333                 :             :         /* Point to next character. */
     334                 :          12 :         y += y_char_len;
     335                 :             : 
     336                 :             : #ifdef LEVENSHTEIN_LESS_EQUAL
     337                 :             : 
     338                 :             :         /*
     339                 :             :          * This chunk of code represents a significant performance hit if used
     340                 :             :          * in the case where there is no max_d bound.  This is probably not
     341                 :             :          * because the max_d >= 0 test itself is expensive, but rather because
     342                 :             :          * the possibility of needing to execute this code prevents tight
     343                 :             :          * optimization of the loop as a whole.
     344                 :             :          */
     345         [ +  + ]:        4304 :         if (max_d >= 0)
     346                 :             :         {
     347                 :             :             /*
     348                 :             :              * The "zero point" is the column of the current row where the
     349                 :             :              * remaining portions of the strings are of equal length.  There
     350                 :             :              * are (n - 1) characters in the target string, of which j have
     351                 :             :              * been transformed.  There are (m - 1) characters in the source
     352                 :             :              * string, so we want to find the value for zp where (n - 1) - j =
     353                 :             :              * (m - 1) - zp.
     354                 :             :              */
     355                 :        3548 :             int         zp = j - (n - m);
     356                 :             : 
     357                 :             :             /* Check whether the stop column can slide left. */
     358         [ +  + ]:        8513 :             while (stop_column > 0)
     359                 :             :             {
     360                 :        7785 :                 int         ii = stop_column - 1;
     361                 :        7785 :                 int         net_inserts = ii - zp;
     362                 :             : 
     363   [ +  +  +  + ]:       13161 :                 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :
     364                 :        5376 :                                 -net_inserts * del_c) <= max_d)
     365                 :        2820 :                     break;
     366                 :        4965 :                 stop_column--;
     367                 :             :             }
     368                 :             : 
     369                 :             :             /* Check whether the start column can slide right. */
     370         [ +  + ]:        5853 :             while (start_column < stop_column)
     371                 :             :             {
     372                 :        5125 :                 int         net_inserts = start_column - zp;
     373                 :             : 
     374         [ +  + ]:        5125 :                 if (prev[start_column] +
     375         [ +  + ]:        5125 :                     (net_inserts > 0 ? net_inserts * ins_c :
     376                 :        4813 :                      -net_inserts * del_c) <= max_d)
     377                 :        2820 :                     break;
     378                 :             : 
     379                 :             :                 /*
     380                 :             :                  * We'll never again update these values, so we must make sure
     381                 :             :                  * there's nothing here that could confuse any future
     382                 :             :                  * iteration of the outer loop.
     383                 :             :                  */
     384                 :        2305 :                 prev[start_column] = max_d + 1;
     385                 :        2305 :                 curr[start_column] = max_d + 1;
     386         [ +  + ]:        2305 :                 if (start_column != 0)
     387         [ +  + ]:        1580 :                     source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
     388                 :        2305 :                 start_column++;
     389                 :             :             }
     390                 :             : 
     391                 :             :             /* If they cross, we're going to exceed the bound. */
     392         [ +  + ]:        3548 :             if (start_column >= stop_column)
     393                 :         728 :                 return max_d + 1;
     394                 :             :         }
     395                 :             : #endif
     396                 :             :     }
     397                 :             : 
     398                 :             :     /*
     399                 :             :      * Because the final value was swapped from the current row to the
     400                 :             :      * previous row, that's where we'll find it.
     401                 :             :      */
     402                 :         514 :     return prev[m - 1];
     403                 :             : }

Generated by: LCOV version 2.0-1