LCOV - PostgreSQL 19devel - src/backend/utils/adt/levenshtein.c

LCOV - code coverage report

Current view:	top level - src/backend/utils/adt - levenshtein.c (source / functions)		Coverage	Total	Hit
Test:	PostgreSQL 19devel	Lines:	94.3 %	105	99
Test Date:	2026-02-28 22:14:39	Functions:	100.0 %	2	2
Legend:	Lines: hit not hit

            Line data    Source code

       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * levenshtein.c
       4              :  *    Levenshtein distance implementation.
       5              :  *
       6              :  * Original author:  Joe Conway <mail@joeconway.com>
       7              :  *
       8              :  * This file is included by varlena.c twice, to provide matching code for (1)
       9              :  * Levenshtein distance with custom costings, and (2) Levenshtein distance with
      10              :  * custom costings and a "max" value above which exact distances are not
      11              :  * interesting.  Before the inclusion, we rely on the presence of the inline
      12              :  * function rest_of_char_same().
      13              :  *
      14              :  * Written based on a description of the algorithm by Michael Gilleland found
      15              :  * at http://www.merriampark.com/ld.htm.  Also looked at levenshtein.c in the
      16              :  * PHP 4.0.6 distribution for inspiration.  Configurable penalty costs
      17              :  * extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com.
      18              :  *
      19              :  * Copyright (c) 2001-2026, PostgreSQL Global Development Group
      20              :  *
      21              :  * IDENTIFICATION
      22              :  *  src/backend/utils/adt/levenshtein.c
      23              :  *
      24              :  *-------------------------------------------------------------------------
      25              :  */
      26              : #define MAX_LEVENSHTEIN_STRLEN      255
      27              : 
      28              : /*
      29              :  * Calculates Levenshtein distance metric between supplied strings, which are
      30              :  * not necessarily null-terminated.
      31              :  *
      32              :  * source: source string, of length slen bytes.
      33              :  * target: target string, of length tlen bytes.
      34              :  * ins_c, del_c, sub_c: costs to charge for character insertion, deletion,
      35              :  *      and substitution respectively; (1, 1, 1) costs suffice for common
      36              :  *      cases, but your mileage may vary.
      37              :  * max_d: if provided and >= 0, maximum distance we care about; see below.
      38              :  * trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN.
      39              :  *
      40              :  * One way to compute Levenshtein distance is to incrementally construct
      41              :  * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
      42              :  * of operations required to transform the first i characters of s into
      43              :  * the first j characters of t.  The last column of the final row is the
      44              :  * answer.
      45              :  *
      46              :  * We use that algorithm here with some modification.  In lieu of holding
      47              :  * the entire array in memory at once, we'll just use two arrays of size
      48              :  * m+1 for storing accumulated values. At each step one array represents
      49              :  * the "previous" row and one is the "current" row of the notional large
      50              :  * array.
      51              :  *
      52              :  * If max_d >= 0, we only need to provide an accurate answer when that answer
      53              :  * is less than or equal to max_d.  From any cell in the matrix, there is
      54              :  * theoretical "minimum residual distance" from that cell to the last column
      55              :  * of the final row.  This minimum residual distance is zero when the
      56              :  * untransformed portions of the strings are of equal length (because we might
      57              :  * get lucky and find all the remaining characters matching) and is otherwise
      58              :  * based on the minimum number of insertions or deletions needed to make them
      59              :  * equal length.  The residual distance grows as we move toward the upper
      60              :  * right or lower left corners of the matrix.  When the max_d bound is
      61              :  * usefully tight, we can use this property to avoid computing the entirety
      62              :  * of each row; instead, we maintain a start_column and stop_column that
      63              :  * identify the portion of the matrix close to the diagonal which can still
      64              :  * affect the final answer.
      65              :  */
      66              : int
      67              : #ifdef LEVENSHTEIN_LESS_EQUAL
      68         1520 : varstr_levenshtein_less_equal(const char *source, int slen,
      69              :                               const char *target, int tlen,
      70              :                               int ins_c, int del_c, int sub_c,
      71              :                               int max_d, bool trusted)
      72              : #else
      73            2 : varstr_levenshtein(const char *source, int slen,
      74              :                    const char *target, int tlen,
      75              :                    int ins_c, int del_c, int sub_c,
      76              :                    bool trusted)
      77              : #endif
      78              : {
      79              :     int         m,
      80              :                 n;
      81              :     int        *prev;
      82              :     int        *curr;
      83         1522 :     int        *s_char_len = NULL;
      84              :     int         j;
      85              :     const char *y;
      86         1522 :     const char *send = source + slen;
      87         1522 :     const char *tend = target + tlen;
      88              : 
      89              :     /*
      90              :      * For varstr_levenshtein_less_equal, we have real variables called
      91              :      * start_column and stop_column; otherwise it's just short-hand for 0 and
      92              :      * m.
      93              :      */
      94              : #ifdef LEVENSHTEIN_LESS_EQUAL
      95              :     int         start_column,
      96              :                 stop_column;
      97              : 
      98              : #undef START_COLUMN
      99              : #undef STOP_COLUMN
     100              : #define START_COLUMN start_column
     101              : #define STOP_COLUMN stop_column
     102              : #else
     103              : #undef START_COLUMN
     104              : #undef STOP_COLUMN
     105              : #define START_COLUMN 0
     106              : #define STOP_COLUMN m
     107              : #endif
     108              : 
     109              :     /* Convert string lengths (in bytes) to lengths in characters */
     110         1522 :     m = pg_mbstrlen_with_len(source, slen);
     111         1522 :     n = pg_mbstrlen_with_len(target, tlen);
     112              : 
     113              :     /*
     114              :      * We can transform an empty s into t with n insertions, or a non-empty t
     115              :      * into an empty s with m deletions.
     116              :      */
     117         1522 :     if (!m)
     118            0 :         return n * ins_c;
     119         1522 :     if (!n)
     120            0 :         return m * del_c;
     121              : 
     122              :     /*
     123              :      * For security concerns, restrict excessive CPU+RAM usage. (This
     124              :      * implementation uses O(m) memory and has O(mn) complexity.)  If
     125              :      * "trusted" is true, caller is responsible for not making excessive
     126              :      * requests, typically by using a small max_d along with strings that are
     127              :      * bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly.
     128              :      */
     129         1522 :     if (!trusted &&
     130            4 :         (m > MAX_LEVENSHTEIN_STRLEN ||
     131              :          n > MAX_LEVENSHTEIN_STRLEN))
     132            0 :         ereport(ERROR,
     133              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     134              :                  errmsg("levenshtein argument exceeds maximum length of %d characters",
     135              :                         MAX_LEVENSHTEIN_STRLEN)));
     136              : 
     137              : #ifdef LEVENSHTEIN_LESS_EQUAL
     138              :     /* Initialize start and stop columns. */
     139         1520 :     start_column = 0;
     140         1520 :     stop_column = m + 1;
     141              : 
     142              :     /*
     143              :      * If max_d >= 0, determine whether the bound is impossibly tight.  If so,
     144              :      * return max_d + 1 immediately.  Otherwise, determine whether it's tight
     145              :      * enough to limit the computation we must perform.  If so, figure out
     146              :      * initial stop column.
     147              :      */
     148         1520 :     if (max_d >= 0)
     149              :     {
     150              :         int         min_theo_d; /* Theoretical minimum distance. */
     151              :         int         max_theo_d; /* Theoretical maximum distance. */
     152         1520 :         int         net_inserts = n - m;
     153              : 
     154         1520 :         min_theo_d = net_inserts < 0 ?
     155         1520 :             -net_inserts * del_c : net_inserts * ins_c;
     156         1520 :         if (min_theo_d > max_d)
     157          558 :             return max_d + 1;
     158          962 :         if (ins_c + del_c < sub_c)
     159            0 :             sub_c = ins_c + del_c;
     160          962 :         max_theo_d = min_theo_d + sub_c * Min(m, n);
     161          962 :         if (max_d >= max_theo_d)
     162          286 :             max_d = -1;
     163          676 :         else if (ins_c + del_c > 0)
     164              :         {
     165              :             /*
     166              :              * Figure out how much of the first row of the notional matrix we
     167              :              * need to fill in.  If the string is growing, the theoretical
     168              :              * minimum distance already incorporates the cost of deleting the
     169              :              * number of characters necessary to make the two strings equal in
     170              :              * length.  Each additional deletion forces another insertion, so
     171              :              * the best-case total cost increases by ins_c + del_c. If the
     172              :              * string is shrinking, the minimum theoretical cost assumes no
     173              :              * excess deletions; that is, we're starting no further right than
     174              :              * column n - m.  If we do start further right, the best-case
     175              :              * total cost increases by ins_c + del_c for each move right.
     176              :              */
     177          676 :             int         slack_d = max_d - min_theo_d;
     178          676 :             int         best_column = net_inserts < 0 ? -net_inserts : 0;
     179              : 
     180          676 :             stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;
     181          676 :             if (stop_column > m)
     182            0 :                 stop_column = m + 1;
     183              :         }
     184              :     }
     185              : #endif
     186              : 
     187              :     /*
     188              :      * In order to avoid calling pg_mblen_range() repeatedly on each character
     189              :      * in s, we cache all the lengths before starting the main loop -- but if
     190              :      * all the characters in both strings are single byte, then we skip this
     191              :      * and use a fast-path in the main loop.  If only one string contains
     192              :      * multi-byte characters, we still build the array, so that the fast-path
     193              :      * needn't deal with the case where the array hasn't been initialized.
     194              :      */
     195          964 :     if (m != slen || n != tlen)
     196              :     {
     197              :         int         i;
     198            3 :         const char *cp = source;
     199              : 
     200            3 :         s_char_len = (int *) palloc((m + 1) * sizeof(int));
     201           30 :         for (i = 0; i < m; ++i)
     202              :         {
     203           27 :             s_char_len[i] = pg_mblen_range(cp, send);
     204           27 :             cp += s_char_len[i];
     205              :         }
     206            3 :         s_char_len[i] = 0;
     207              :     }
     208              : 
     209              :     /* One more cell for initialization column and row. */
     210          964 :     ++m;
     211          964 :     ++n;
     212              : 
     213              :     /* Previous and current rows of notional array. */
     214          964 :     prev = (int *) palloc(2 * m * sizeof(int));
     215          964 :     curr = prev + m;
     216              : 
     217              :     /*
     218              :      * To transform the first i characters of s into the first 0 characters of
     219              :      * t, we must perform i deletions.
     220              :      */
     221         3748 :     for (int i = START_COLUMN; i < STOP_COLUMN; i++)
     222         2784 :         prev[i] = i * del_c;
     223              : 
     224              :     /* Loop through rows of the notional array */
     225         3785 :     for (y = target, j = 1; j < n; j++)
     226              :     {
     227              :         int        *temp;
     228         3397 :         const char *x = source;
     229         3397 :         int         y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1;
     230              :         int         i;
     231              : 
     232              : #ifdef LEVENSHTEIN_LESS_EQUAL
     233              : 
     234              :         /*
     235              :          * In the best case, values percolate down the diagonal unchanged, so
     236              :          * we must increment stop_column unless it's already on the right end
     237              :          * of the array.  The inner loop will read prev[stop_column], so we
     238              :          * have to initialize it even though it shouldn't affect the result.
     239              :          */
     240         3385 :         if (stop_column < m)
     241              :         {
     242         2736 :             prev[stop_column] = max_d + 1;
     243         2736 :             ++stop_column;
     244              :         }
     245              : 
     246              :         /*
     247              :          * The main loop fills in curr, but curr[0] needs a special case: to
     248              :          * transform the first 0 characters of s into the first j characters
     249              :          * of t, we must perform j insertions.  However, if start_column > 0,
     250              :          * this special case does not apply.
     251              :          */
     252         3385 :         if (start_column == 0)
     253              :         {
     254         2167 :             curr[0] = j * ins_c;
     255         2167 :             i = 1;
     256              :         }
     257              :         else
     258         1218 :             i = start_column;
     259              : #else
     260           12 :         curr[0] = j * ins_c;
     261           12 :         i = 1;
     262              : #endif
     263              : 
     264              :         /*
     265              :          * This inner loop is critical to performance, so we include a
     266              :          * fast-path to handle the (fairly common) case where no multibyte
     267              :          * characters are in the mix.  The fast-path is entitled to assume
     268              :          * that if s_char_len is not initialized then BOTH strings contain
     269              :          * only single-byte characters.
     270              :          */
     271         3397 :         if (s_char_len != NULL)
     272              :         {
     273          186 :             for (; i < STOP_COLUMN; i++)
     274              :             {
     275              :                 int         ins;
     276              :                 int         del;
     277              :                 int         sub;
     278          156 :                 int         x_char_len = s_char_len[i - 1];
     279              : 
     280              :                 /*
     281              :                  * Calculate costs for insertion, deletion, and substitution.
     282              :                  *
     283              :                  * When calculating cost for substitution, we compare the last
     284              :                  * character of each possibly-multibyte character first,
     285              :                  * because that's enough to rule out most mis-matches.  If we
     286              :                  * get past that test, then we compare the lengths and the
     287              :                  * remaining bytes.
     288              :                  */
     289          156 :                 ins = prev[i] + ins_c;
     290          156 :                 del = curr[i - 1] + del_c;
     291          156 :                 if (x[x_char_len - 1] == y[y_char_len - 1]
     292           27 :                     && x_char_len == y_char_len &&
     293            0 :                     (x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
     294           27 :                     sub = prev[i - 1];
     295              :                 else
     296          129 :                     sub = prev[i - 1] + sub_c;
     297              : 
     298              :                 /* Take the one with minimum cost. */
     299          156 :                 curr[i] = Min(ins, del);
     300          156 :                 curr[i] = Min(curr[i], sub);
     301              : 
     302              :                 /* Point to next character. */
     303          156 :                 x += x_char_len;
     304              :             }
     305              :         }
     306              :         else
     307              :         {
     308        13604 :             for (; i < STOP_COLUMN; i++)
     309              :             {
     310              :                 int         ins;
     311              :                 int         del;
     312              :                 int         sub;
     313              : 
     314              :                 /* Calculate costs for insertion, deletion, and substitution. */
     315        10237 :                 ins = prev[i] + ins_c;
     316        10237 :                 del = curr[i - 1] + del_c;
     317        10237 :                 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
     318              : 
     319              :                 /* Take the one with minimum cost. */
     320        10237 :                 curr[i] = Min(ins, del);
     321        10237 :                 curr[i] = Min(curr[i], sub);
     322              : 
     323              :                 /* Point to next character. */
     324        10237 :                 x++;
     325              :             }
     326              :         }
     327              : 
     328              :         /* Swap current row with previous row. */
     329         3397 :         temp = curr;
     330         3397 :         curr = prev;
     331         3397 :         prev = temp;
     332              : 
     333              :         /* Point to next character. */
     334           12 :         y += y_char_len;
     335              : 
     336              : #ifdef LEVENSHTEIN_LESS_EQUAL
     337              : 
     338              :         /*
     339              :          * This chunk of code represents a significant performance hit if used
     340              :          * in the case where there is no max_d bound.  This is probably not
     341              :          * because the max_d >= 0 test itself is expensive, but rather because
     342              :          * the possibility of needing to execute this code prevents tight
     343              :          * optimization of the loop as a whole.
     344              :          */
     345         3385 :         if (max_d >= 0)
     346              :         {
     347              :             /*
     348              :              * The "zero point" is the column of the current row where the
     349              :              * remaining portions of the strings are of equal length.  There
     350              :              * are (n - 1) characters in the target string, of which j have
     351              :              * been transformed.  There are (m - 1) characters in the source
     352              :              * string, so we want to find the value for zp where (n - 1) - j =
     353              :              * (m - 1) - zp.
     354              :              */
     355         2817 :             int         zp = j - (n - m);
     356              : 
     357              :             /* Check whether the stop column can slide left. */
     358         6774 :             while (stop_column > 0)
     359              :             {
     360         6198 :                 int         ii = stop_column - 1;
     361         6198 :                 int         net_inserts = ii - zp;
     362              : 
     363        10451 :                 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :
     364         4253 :                                 -net_inserts * del_c) <= max_d)
     365         2241 :                     break;
     366         3957 :                 stop_column--;
     367              :             }
     368              : 
     369              :             /* Check whether the start column can slide right. */
     370         4639 :             while (start_column < stop_column)
     371              :             {
     372         4063 :                 int         net_inserts = start_column - zp;
     373              : 
     374         4063 :                 if (prev[start_column] +
     375         4063 :                     (net_inserts > 0 ? net_inserts * ins_c :
     376         3804 :                      -net_inserts * del_c) <= max_d)
     377         2241 :                     break;
     378              : 
     379              :                 /*
     380              :                  * We'll never again update these values, so we must make sure
     381              :                  * there's nothing here that could confuse any future
     382              :                  * iteration of the outer loop.
     383              :                  */
     384         1822 :                 prev[start_column] = max_d + 1;
     385         1822 :                 curr[start_column] = max_d + 1;
     386         1822 :                 if (start_column != 0)
     387         1251 :                     source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
     388         1822 :                 start_column++;
     389              :             }
     390              : 
     391              :             /* If they cross, we're going to exceed the bound. */
     392         2817 :             if (start_column >= stop_column)
     393          576 :                 return max_d + 1;
     394              :         }
     395              : #endif
     396              :     }
     397              : 
     398              :     /*
     399              :      * Because the final value was swapped from the previous row to the
     400              :      * current row, that's where we'll find it.
     401              :      */
     402          388 :     return prev[m - 1];
     403              : }

Generated by: LCOV version 2.0-1