LCOV - code coverage report
Current view: top level - src/backend/utils/adt - levenshtein.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 80 103 77.7 %
Date: 2024-04-25 22:10:56 Functions: 2 2 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * levenshtein.c
       4             :  *    Levenshtein distance implementation.
       5             :  *
       6             :  * Original author:  Joe Conway <mail@joeconway.com>
       7             :  *
       8             :  * This file is included by varlena.c twice, to provide matching code for (1)
       9             :  * Levenshtein distance with custom costings, and (2) Levenshtein distance with
      10             :  * custom costings and a "max" value above which exact distances are not
      11             :  * interesting.  Before the inclusion, we rely on the presence of the inline
      12             :  * function rest_of_char_same().
      13             :  *
      14             :  * Written based on a description of the algorithm by Michael Gilleland found
      15             :  * at http://www.merriampark.com/ld.htm.  Also looked at levenshtein.c in the
      16             :  * PHP 4.0.6 distribution for inspiration.  Configurable penalty costs
      17             :  * extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com.
      18             :  *
      19             :  * Copyright (c) 2001-2024, PostgreSQL Global Development Group
      20             :  *
      21             :  * IDENTIFICATION
      22             :  *  src/backend/utils/adt/levenshtein.c
      23             :  *
      24             :  *-------------------------------------------------------------------------
      25             :  */
      26             : #define MAX_LEVENSHTEIN_STRLEN      255
      27             : 
      28             : /*
      29             :  * Calculates Levenshtein distance metric between supplied strings, which are
      30             :  * not necessarily null-terminated.
      31             :  *
      32             :  * source: source string, of length slen bytes.
      33             :  * target: target string, of length tlen bytes.
      34             :  * ins_c, del_c, sub_c: costs to charge for character insertion, deletion,
      35             :  *      and substitution respectively; (1, 1, 1) costs suffice for common
      36             :  *      cases, but your mileage may vary.
      37             :  * max_d: if provided and >= 0, maximum distance we care about; see below.
      38             :  * trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN.
      39             :  *
      40             :  * One way to compute Levenshtein distance is to incrementally construct
      41             :  * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
      42             :  * of operations required to transform the first i characters of s into
      43             :  * the first j characters of t.  The last column of the final row is the
      44             :  * answer.
      45             :  *
      46             :  * We use that algorithm here with some modification.  In lieu of holding
      47             :  * the entire array in memory at once, we'll just use two arrays of size
      48             :  * m+1 for storing accumulated values. At each step one array represents
      49             :  * the "previous" row and one is the "current" row of the notional large
      50             :  * array.
      51             :  *
      52             :  * If max_d >= 0, we only need to provide an accurate answer when that answer
      53             :  * is less than or equal to max_d.  From any cell in the matrix, there is
      54             :  * theoretical "minimum residual distance" from that cell to the last column
      55             :  * of the final row.  This minimum residual distance is zero when the
      56             :  * untransformed portions of the strings are of equal length (because we might
      57             :  * get lucky and find all the remaining characters matching) and is otherwise
      58             :  * based on the minimum number of insertions or deletions needed to make them
      59             :  * equal length.  The residual distance grows as we move toward the upper
      60             :  * right or lower left corners of the matrix.  When the max_d bound is
      61             :  * usefully tight, we can use this property to avoid computing the entirety
      62             :  * of each row; instead, we maintain a start_column and stop_column that
      63             :  * identify the portion of the matrix close to the diagonal which can still
      64             :  * affect the final answer.
      65             :  */
      66             : int
      67             : #ifdef LEVENSHTEIN_LESS_EQUAL
      68        2518 : varstr_levenshtein_less_equal(const char *source, int slen,
      69             :                               const char *target, int tlen,
      70             :                               int ins_c, int del_c, int sub_c,
      71             :                               int max_d, bool trusted)
      72             : #else
      73           4 : varstr_levenshtein(const char *source, int slen,
      74             :                    const char *target, int tlen,
      75             :                    int ins_c, int del_c, int sub_c,
      76             :                    bool trusted)
      77             : #endif
      78             : {
      79             :     int         m,
      80             :                 n;
      81             :     int        *prev;
      82             :     int        *curr;
      83        2522 :     int        *s_char_len = NULL;
      84             :     int         j;
      85             :     const char *y;
      86             : 
      87             :     /*
      88             :      * For varstr_levenshtein_less_equal, we have real variables called
      89             :      * start_column and stop_column; otherwise it's just short-hand for 0 and
      90             :      * m.
      91             :      */
      92             : #ifdef LEVENSHTEIN_LESS_EQUAL
      93             :     int         start_column,
      94             :                 stop_column;
      95             : 
      96             : #undef START_COLUMN
      97             : #undef STOP_COLUMN
      98             : #define START_COLUMN start_column
      99             : #define STOP_COLUMN stop_column
     100             : #else
     101             : #undef START_COLUMN
     102             : #undef STOP_COLUMN
     103             : #define START_COLUMN 0
     104             : #define STOP_COLUMN m
     105             : #endif
     106             : 
     107             :     /* Convert string lengths (in bytes) to lengths in characters */
     108        2522 :     m = pg_mbstrlen_with_len(source, slen);
     109        2522 :     n = pg_mbstrlen_with_len(target, tlen);
     110             : 
     111             :     /*
     112             :      * We can transform an empty s into t with n insertions, or a non-empty t
     113             :      * into an empty s with m deletions.
     114             :      */
     115        2522 :     if (!m)
     116           0 :         return n * ins_c;
     117        2522 :     if (!n)
     118           0 :         return m * del_c;
     119             : 
     120             :     /*
     121             :      * For security concerns, restrict excessive CPU+RAM usage. (This
     122             :      * implementation uses O(m) memory and has O(mn) complexity.)  If
     123             :      * "trusted" is true, caller is responsible for not making excessive
     124             :      * requests, typically by using a small max_d along with strings that are
     125             :      * bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly.
     126             :      */
     127        2522 :     if (!trusted &&
     128           8 :         (m > MAX_LEVENSHTEIN_STRLEN ||
     129             :          n > MAX_LEVENSHTEIN_STRLEN))
     130           0 :         ereport(ERROR,
     131             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     132             :                  errmsg("levenshtein argument exceeds maximum length of %d characters",
     133             :                         MAX_LEVENSHTEIN_STRLEN)));
     134             : 
     135             : #ifdef LEVENSHTEIN_LESS_EQUAL
     136             :     /* Initialize start and stop columns. */
     137        2518 :     start_column = 0;
     138        2518 :     stop_column = m + 1;
     139             : 
     140             :     /*
     141             :      * If max_d >= 0, determine whether the bound is impossibly tight.  If so,
     142             :      * return max_d + 1 immediately.  Otherwise, determine whether it's tight
     143             :      * enough to limit the computation we must perform.  If so, figure out
     144             :      * initial stop column.
     145             :      */
     146        2518 :     if (max_d >= 0)
     147             :     {
     148             :         int         min_theo_d; /* Theoretical minimum distance. */
     149             :         int         max_theo_d; /* Theoretical maximum distance. */
     150        2518 :         int         net_inserts = n - m;
     151             : 
     152        2518 :         min_theo_d = net_inserts < 0 ?
     153        2518 :             -net_inserts * del_c : net_inserts * ins_c;
     154        2518 :         if (min_theo_d > max_d)
     155         884 :             return max_d + 1;
     156        1634 :         if (ins_c + del_c < sub_c)
     157           0 :             sub_c = ins_c + del_c;
     158        1634 :         max_theo_d = min_theo_d + sub_c * Min(m, n);
     159        1634 :         if (max_d >= max_theo_d)
     160         516 :             max_d = -1;
     161        1118 :         else if (ins_c + del_c > 0)
     162             :         {
     163             :             /*
     164             :              * Figure out how much of the first row of the notional matrix we
     165             :              * need to fill in.  If the string is growing, the theoretical
     166             :              * minimum distance already incorporates the cost of deleting the
     167             :              * number of characters necessary to make the two strings equal in
     168             :              * length.  Each additional deletion forces another insertion, so
     169             :              * the best-case total cost increases by ins_c + del_c. If the
     170             :              * string is shrinking, the minimum theoretical cost assumes no
     171             :              * excess deletions; that is, we're starting no further right than
     172             :              * column n - m.  If we do start further right, the best-case
     173             :              * total cost increases by ins_c + del_c for each move right.
     174             :              */
     175        1118 :             int         slack_d = max_d - min_theo_d;
     176        1118 :             int         best_column = net_inserts < 0 ? -net_inserts : 0;
     177             : 
     178        1118 :             stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;
     179        1118 :             if (stop_column > m)
     180           0 :                 stop_column = m + 1;
     181             :         }
     182             :     }
     183             : #endif
     184             : 
     185             :     /*
     186             :      * In order to avoid calling pg_mblen() repeatedly on each character in s,
     187             :      * we cache all the lengths before starting the main loop -- but if all
     188             :      * the characters in both strings are single byte, then we skip this and
     189             :      * use a fast-path in the main loop.  If only one string contains
     190             :      * multi-byte characters, we still build the array, so that the fast-path
     191             :      * needn't deal with the case where the array hasn't been initialized.
     192             :      */
     193        1638 :     if (m != slen || n != tlen)
     194             :     {
     195             :         int         i;
     196           0 :         const char *cp = source;
     197             : 
     198           0 :         s_char_len = (int *) palloc((m + 1) * sizeof(int));
     199           0 :         for (i = 0; i < m; ++i)
     200             :         {
     201           0 :             s_char_len[i] = pg_mblen(cp);
     202           0 :             cp += s_char_len[i];
     203             :         }
     204           0 :         s_char_len[i] = 0;
     205             :     }
     206             : 
     207             :     /* One more cell for initialization column and row. */
     208        1638 :     ++m;
     209        1638 :     ++n;
     210             : 
     211             :     /* Previous and current rows of notional array. */
     212        1638 :     prev = (int *) palloc(2 * m * sizeof(int));
     213        1638 :     curr = prev + m;
     214             : 
     215             :     /*
     216             :      * To transform the first i characters of s into the first 0 characters of
     217             :      * t, we must perform i deletions.
     218             :      */
     219        6406 :     for (int i = START_COLUMN; i < STOP_COLUMN; i++)
     220        4768 :         prev[i] = i * del_c;
     221             : 
     222             :     /* Loop through rows of the notional array */
     223        6408 :     for (y = target, j = 1; j < n; j++)
     224             :     {
     225             :         int        *temp;
     226        5698 :         const char *x = source;
     227        5698 :         int         y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
     228             :         int         i;
     229             : 
     230             : #ifdef LEVENSHTEIN_LESS_EQUAL
     231             : 
     232             :         /*
     233             :          * In the best case, values percolate down the diagonal unchanged, so
     234             :          * we must increment stop_column unless it's already on the right end
     235             :          * of the array.  The inner loop will read prev[stop_column], so we
     236             :          * have to initialize it even though it shouldn't affect the result.
     237             :          */
     238        5674 :         if (stop_column < m)
     239             :         {
     240        4490 :             prev[stop_column] = max_d + 1;
     241        4490 :             ++stop_column;
     242             :         }
     243             : 
     244             :         /*
     245             :          * The main loop fills in curr, but curr[0] needs a special case: to
     246             :          * transform the first 0 characters of s into the first j characters
     247             :          * of t, we must perform j insertions.  However, if start_column > 0,
     248             :          * this special case does not apply.
     249             :          */
     250        5674 :         if (start_column == 0)
     251             :         {
     252        3586 :             curr[0] = j * ins_c;
     253        3586 :             i = 1;
     254             :         }
     255             :         else
     256        2088 :             i = start_column;
     257             : #else
     258          24 :         curr[0] = j * ins_c;
     259          24 :         i = 1;
     260             : #endif
     261             : 
     262             :         /*
     263             :          * This inner loop is critical to performance, so we include a
     264             :          * fast-path to handle the (fairly common) case where no multibyte
     265             :          * characters are in the mix.  The fast-path is entitled to assume
     266             :          * that if s_char_len is not initialized then BOTH strings contain
     267             :          * only single-byte characters.
     268             :          */
     269        5698 :         if (s_char_len != NULL)
     270             :         {
     271           0 :             for (; i < STOP_COLUMN; i++)
     272             :             {
     273             :                 int         ins;
     274             :                 int         del;
     275             :                 int         sub;
     276           0 :                 int         x_char_len = s_char_len[i - 1];
     277             : 
     278             :                 /*
     279             :                  * Calculate costs for insertion, deletion, and substitution.
     280             :                  *
     281             :                  * When calculating cost for substitution, we compare the last
     282             :                  * character of each possibly-multibyte character first,
     283             :                  * because that's enough to rule out most mis-matches.  If we
     284             :                  * get past that test, then we compare the lengths and the
     285             :                  * remaining bytes.
     286             :                  */
     287           0 :                 ins = prev[i] + ins_c;
     288           0 :                 del = curr[i - 1] + del_c;
     289           0 :                 if (x[x_char_len - 1] == y[y_char_len - 1]
     290           0 :                     && x_char_len == y_char_len &&
     291           0 :                     (x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
     292           0 :                     sub = prev[i - 1];
     293             :                 else
     294           0 :                     sub = prev[i - 1] + sub_c;
     295             : 
     296             :                 /* Take the one with minimum cost. */
     297           0 :                 curr[i] = Min(ins, del);
     298           0 :                 curr[i] = Min(curr[i], sub);
     299             : 
     300             :                 /* Point to next character. */
     301           0 :                 x += x_char_len;
     302             :             }
     303             :         }
     304             :         else
     305             :         {
     306       23360 :             for (; i < STOP_COLUMN; i++)
     307             :             {
     308             :                 int         ins;
     309             :                 int         del;
     310             :                 int         sub;
     311             : 
     312             :                 /* Calculate costs for insertion, deletion, and substitution. */
     313       17662 :                 ins = prev[i] + ins_c;
     314       17662 :                 del = curr[i - 1] + del_c;
     315       17662 :                 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
     316             : 
     317             :                 /* Take the one with minimum cost. */
     318       17662 :                 curr[i] = Min(ins, del);
     319       17662 :                 curr[i] = Min(curr[i], sub);
     320             : 
     321             :                 /* Point to next character. */
     322       17662 :                 x++;
     323             :             }
     324             :         }
     325             : 
     326             :         /* Swap current row with previous row. */
     327        5698 :         temp = curr;
     328        5698 :         curr = prev;
     329        5698 :         prev = temp;
     330             : 
     331             :         /* Point to next character. */
     332          24 :         y += y_char_len;
     333             : 
     334             : #ifdef LEVENSHTEIN_LESS_EQUAL
     335             : 
     336             :         /*
     337             :          * This chunk of code represents a significant performance hit if used
     338             :          * in the case where there is no max_d bound.  This is probably not
     339             :          * because the max_d >= 0 test itself is expensive, but rather because
     340             :          * the possibility of needing to execute this code prevents tight
     341             :          * optimization of the loop as a whole.
     342             :          */
     343        5674 :         if (max_d >= 0)
     344             :         {
     345             :             /*
     346             :              * The "zero point" is the column of the current row where the
     347             :              * remaining portions of the strings are of equal length.  There
     348             :              * are (n - 1) characters in the target string, of which j have
     349             :              * been transformed.  There are (m - 1) characters in the source
     350             :              * string, so we want to find the value for zp where (n - 1) - j =
     351             :              * (m - 1) - zp.
     352             :              */
     353        4630 :             int         zp = j - (n - m);
     354             : 
     355             :             /* Check whether the stop column can slide left. */
     356       10984 :             while (stop_column > 0)
     357             :             {
     358       10056 :                 int         ii = stop_column - 1;
     359       10056 :                 int         net_inserts = ii - zp;
     360             : 
     361       17094 :                 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :
     362        7038 :                                 -net_inserts * del_c) <= max_d)
     363        3702 :                     break;
     364        6354 :                 stop_column--;
     365             :             }
     366             : 
     367             :             /* Check whether the start column can slide right. */
     368        7742 :             while (start_column < stop_column)
     369             :             {
     370        6814 :                 int         net_inserts = start_column - zp;
     371             : 
     372        6814 :                 if (prev[start_column] +
     373        6814 :                     (net_inserts > 0 ? net_inserts * ins_c :
     374        6454 :                      -net_inserts * del_c) <= max_d)
     375        3702 :                     break;
     376             : 
     377             :                 /*
     378             :                  * We'll never again update these values, so we must make sure
     379             :                  * there's nothing here that could confuse any future
     380             :                  * iteration of the outer loop.
     381             :                  */
     382        3112 :                 prev[start_column] = max_d + 1;
     383        3112 :                 curr[start_column] = max_d + 1;
     384        3112 :                 if (start_column != 0)
     385        2150 :                     source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
     386        3112 :                 start_column++;
     387             :             }
     388             : 
     389             :             /* If they cross, we're going to exceed the bound. */
     390        4630 :             if (start_column >= stop_column)
     391         928 :                 return max_d + 1;
     392             :         }
     393             : #endif
     394             :     }
     395             : 
     396             :     /*
     397             :      * Because the final value was swapped from the previous row to the
     398             :      * current row, that's where we'll find it.
     399             :      */
     400         710 :     return prev[m - 1];
     401             : }

Generated by: LCOV version 1.14