LCOV - code coverage report
Current view: top level - contrib/pg_trgm - trgm_op.c (source / functions) Hit Total Coverage
Test: PostgreSQL 16beta1 Lines: 448 490 91.4 %
Date: 2023-06-01 12:12:28 Functions: 50 52 96.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * contrib/pg_trgm/trgm_op.c
       3             :  */
       4             : #include "postgres.h"
       5             : 
       6             : #include <ctype.h>
       7             : 
       8             : #include "catalog/pg_type.h"
       9             : #include "lib/qunique.h"
      10             : #include "miscadmin.h"
      11             : #include "trgm.h"
      12             : #include "tsearch/ts_locale.h"
      13             : #include "utils/guc.h"
      14             : #include "utils/lsyscache.h"
      15             : #include "utils/memutils.h"
      16             : #include "utils/pg_crc.h"
      17             : 
      18           6 : PG_MODULE_MAGIC;
      19             : 
      20             : /* GUC variables */
      21             : double      similarity_threshold = 0.3f;
      22             : double      word_similarity_threshold = 0.6f;
      23             : double      strict_word_similarity_threshold = 0.5f;
      24             : 
      25           4 : PG_FUNCTION_INFO_V1(set_limit);
      26           4 : PG_FUNCTION_INFO_V1(show_limit);
      27           4 : PG_FUNCTION_INFO_V1(show_trgm);
      28           4 : PG_FUNCTION_INFO_V1(similarity);
      29           4 : PG_FUNCTION_INFO_V1(word_similarity);
      30           4 : PG_FUNCTION_INFO_V1(strict_word_similarity);
      31           4 : PG_FUNCTION_INFO_V1(similarity_dist);
      32           4 : PG_FUNCTION_INFO_V1(similarity_op);
      33           4 : PG_FUNCTION_INFO_V1(word_similarity_op);
      34           4 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
      35           2 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
      36           4 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
      37           4 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
      38           4 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
      39           2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
      40           4 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
      41             : 
      42             : /* Trigram with position */
      43             : typedef struct
      44             : {
      45             :     trgm        trg;
      46             :     int         index;
      47             : } pos_trgm;
      48             : 
      49             : /* Trigram bound type */
      50             : typedef uint8 TrgmBound;
      51             : #define TRGM_BOUND_LEFT             0x01    /* trigram is left bound of word */
      52             : #define TRGM_BOUND_RIGHT            0x02    /* trigram is right bound of word */
      53             : 
      54             : /* Word similarity flags */
      55             : #define WORD_SIMILARITY_CHECK_ONLY  0x01    /* only check existence of similar
      56             :                                              * search pattern in text */
      57             : #define WORD_SIMILARITY_STRICT      0x02    /* force bounds of extent to match
      58             :                                              * word bounds */
      59             : 
      60             : /*
      61             :  * Module load callback
      62             :  */
      63             : void
      64           6 : _PG_init(void)
      65             : {
      66             :     /* Define custom GUC variables. */
      67           6 :     DefineCustomRealVariable("pg_trgm.similarity_threshold",
      68             :                              "Sets the threshold used by the % operator.",
      69             :                              "Valid range is 0.0 .. 1.0.",
      70             :                              &similarity_threshold,
      71             :                              0.3f,
      72             :                              0.0,
      73             :                              1.0,
      74             :                              PGC_USERSET,
      75             :                              0,
      76             :                              NULL,
      77             :                              NULL,
      78             :                              NULL);
      79           6 :     DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
      80             :                              "Sets the threshold used by the <% operator.",
      81             :                              "Valid range is 0.0 .. 1.0.",
      82             :                              &word_similarity_threshold,
      83             :                              0.6f,
      84             :                              0.0,
      85             :                              1.0,
      86             :                              PGC_USERSET,
      87             :                              0,
      88             :                              NULL,
      89             :                              NULL,
      90             :                              NULL);
      91           6 :     DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
      92             :                              "Sets the threshold used by the <<% operator.",
      93             :                              "Valid range is 0.0 .. 1.0.",
      94             :                              &strict_word_similarity_threshold,
      95             :                              0.5f,
      96             :                              0.0,
      97             :                              1.0,
      98             :                              PGC_USERSET,
      99             :                              0,
     100             :                              NULL,
     101             :                              NULL,
     102             :                              NULL);
     103             : 
     104           6 :     MarkGUCPrefixReserved("pg_trgm");
     105           6 : }
     106             : 
     107             : /*
     108             :  * Deprecated function.
     109             :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
     110             :  */
     111             : Datum
     112           4 : set_limit(PG_FUNCTION_ARGS)
     113             : {
     114           4 :     float4      nlimit = PG_GETARG_FLOAT4(0);
     115             :     char       *nlimit_str;
     116             :     Oid         func_out_oid;
     117             :     bool        is_varlena;
     118             : 
     119           4 :     getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
     120             : 
     121           4 :     nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
     122             : 
     123           4 :     SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
     124             :                     PGC_USERSET, PGC_S_SESSION);
     125             : 
     126           4 :     PG_RETURN_FLOAT4(similarity_threshold);
     127             : }
     128             : 
     129             : 
     130             : /*
     131             :  * Get similarity threshold for given index scan strategy number.
     132             :  */
     133             : double
     134       86400 : index_strategy_get_limit(StrategyNumber strategy)
     135             : {
     136       86400 :     switch (strategy)
     137             :     {
     138       64808 :         case SimilarityStrategyNumber:
     139       64808 :             return similarity_threshold;
     140        9636 :         case WordSimilarityStrategyNumber:
     141        9636 :             return word_similarity_threshold;
     142       11956 :         case StrictWordSimilarityStrategyNumber:
     143       11956 :             return strict_word_similarity_threshold;
     144           0 :         default:
     145           0 :             elog(ERROR, "unrecognized strategy number: %d", strategy);
     146             :             break;
     147             :     }
     148             : 
     149             :     return 0.0;                 /* keep compiler quiet */
     150             : }
     151             : 
     152             : /*
     153             :  * Deprecated function.
     154             :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
     155             :  */
     156             : Datum
     157       40000 : show_limit(PG_FUNCTION_ARGS)
     158             : {
     159       40000 :     PG_RETURN_FLOAT4(similarity_threshold);
     160             : }
     161             : 
     162             : static int
     163     6265240 : comp_trgm(const void *a, const void *b)
     164             : {
     165     6265240 :     return CMPTRGM(a, b);
     166             : }
     167             : 
     168             : /*
     169             :  * Finds first word in string, returns pointer to the word,
     170             :  * endword points to the character after word
     171             :  */
     172             : static char *
     173      478818 : find_word(char *str, int lenstr, char **endword, int *charlen)
     174             : {
     175      478818 :     char       *beginword = str;
     176             : 
     177      506142 :     while (beginword - str < lenstr && !ISWORDCHR(beginword))
     178       27324 :         beginword += pg_mblen(beginword);
     179             : 
     180      478818 :     if (beginword - str >= lenstr)
     181      226160 :         return NULL;
     182             : 
     183      252658 :     *endword = beginword;
     184      252658 :     *charlen = 0;
     185     2175334 :     while (*endword - str < lenstr && ISWORDCHR(*endword))
     186             :     {
     187     1922676 :         *endword += pg_mblen(*endword);
     188     1922676 :         (*charlen)++;
     189             :     }
     190             : 
     191      252658 :     return beginword;
     192             : }
     193             : 
     194             : /*
     195             :  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
     196             :  * which is always exactly three bytes.  If we have three single-byte
     197             :  * characters, we just use them as-is; otherwise we form a hash value.
     198             :  */
     199             : void
     200        2918 : compact_trigram(trgm *tptr, char *str, int bytelen)
     201             : {
     202        2918 :     if (bytelen == 3)
     203             :     {
     204        2918 :         CPTRGM(tptr, str);
     205             :     }
     206             :     else
     207             :     {
     208             :         pg_crc32    crc;
     209             : 
     210           0 :         INIT_LEGACY_CRC32(crc);
     211           0 :         COMP_LEGACY_CRC32(crc, str, bytelen);
     212           0 :         FIN_LEGACY_CRC32(crc);
     213             : 
     214             :         /*
     215             :          * use only 3 upper bytes from crc, hope, it's good enough hashing
     216             :          */
     217           0 :         CPTRGM(tptr, &crc);
     218             :     }
     219        2918 : }
     220             : 
     221             : /*
     222             :  * Adds trigrams from words (already padded).
     223             :  */
     224             : static trgm *
     225      252786 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
     226             : {
     227      252786 :     char       *ptr = str;
     228             : 
     229      252786 :     if (charlen < 3)
     230          54 :         return tptr;
     231             : 
     232      252732 :     if (bytelen > charlen)
     233             :     {
     234             :         /* Find multibyte character boundaries and apply compact_trigram */
     235           0 :         int         lenfirst = pg_mblen(str),
     236           0 :                     lenmiddle = pg_mblen(str + lenfirst),
     237           0 :                     lenlast = pg_mblen(str + lenfirst + lenmiddle);
     238             : 
     239           0 :         while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
     240             :         {
     241           0 :             compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
     242             : 
     243           0 :             ptr += lenfirst;
     244           0 :             tptr++;
     245             : 
     246           0 :             lenfirst = lenmiddle;
     247           0 :             lenmiddle = lenlast;
     248           0 :             lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
     249             :         }
     250             :     }
     251             :     else
     252             :     {
     253             :         /* Fast path when there are no multibyte characters */
     254             :         Assert(bytelen == charlen);
     255             : 
     256     2428248 :         while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
     257             :         {
     258     2175516 :             CPTRGM(tptr, ptr);
     259     2175516 :             ptr++;
     260     2175516 :             tptr++;
     261             :         }
     262             :     }
     263             : 
     264      252732 :     return tptr;
     265             : }
     266             : 
     267             : /*
     268             :  * Make array of trigrams without sorting and removing duplicate items.
     269             :  *
     270             :  * trg: where to return the array of trigrams.
     271             :  * str: source string, of length slen bytes.
     272             :  * bounds: where to return bounds of trigrams (if needed).
     273             :  *
     274             :  * Returns length of the generated array.
     275             :  */
     276             : static int
     277      226162 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
     278             : {
     279             :     trgm       *tptr;
     280             :     char       *buf;
     281             :     int         charlen,
     282             :                 bytelen;
     283             :     char       *bword,
     284             :                *eword;
     285             : 
     286      226162 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
     287           2 :         return 0;
     288             : 
     289      226160 :     tptr = trg;
     290             : 
     291             :     /* Allocate a buffer for case-folded, blank-padded words */
     292      226160 :     buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
     293             : 
     294             :     if (LPADDING > 0)
     295             :     {
     296      226160 :         *buf = ' ';
     297             :         if (LPADDING > 1)
     298      226160 :             *(buf + 1) = ' ';
     299             :     }
     300             : 
     301      226160 :     eword = str;
     302      478818 :     while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
     303             :     {
     304             : #ifdef IGNORECASE
     305      252658 :         bword = lowerstr_with_len(bword, eword - bword);
     306      252658 :         bytelen = strlen(bword);
     307             : #else
     308             :         bytelen = eword - bword;
     309             : #endif
     310             : 
     311      252658 :         memcpy(buf + LPADDING, bword, bytelen);
     312             : 
     313             : #ifdef IGNORECASE
     314      252658 :         pfree(bword);
     315             : #endif
     316             : 
     317      252658 :         buf[LPADDING + bytelen] = ' ';
     318      252658 :         buf[LPADDING + bytelen + 1] = ' ';
     319             : 
     320             :         /* Calculate trigrams marking their bounds if needed */
     321      252658 :         if (bounds)
     322       24794 :             bounds[tptr - trg] |= TRGM_BOUND_LEFT;
     323      252658 :         tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
     324             :                              charlen + LPADDING + RPADDING);
     325      252658 :         if (bounds)
     326       24794 :             bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
     327             :     }
     328             : 
     329      226160 :     pfree(buf);
     330             : 
     331      226160 :     return tptr - trg;
     332             : }
     333             : 
     334             : /*
     335             :  * Guard against possible overflow in the palloc requests below.  (We
     336             :  * don't worry about the additive constants, since palloc can detect
     337             :  * requests that are a little above MaxAllocSize --- we just need to
     338             :  * prevent integer overflow in the multiplications.)
     339             :  */
     340             : static void
     341      202020 : protect_out_of_mem(int slen)
     342             : {
     343      202020 :     if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
     344      202020 :         (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
     345           0 :         ereport(ERROR,
     346             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     347             :                  errmsg("out of memory")));
     348      202020 : }
     349             : 
     350             : /*
     351             :  * Make array of trigrams with sorting and removing duplicate items.
     352             :  *
     353             :  * str: source string, of length slen bytes.
     354             :  *
     355             :  * Returns the sorted array of unique trigrams.
     356             :  */
     357             : TRGM *
     358      177658 : generate_trgm(char *str, int slen)
     359             : {
     360             :     TRGM       *trg;
     361             :     int         len;
     362             : 
     363      177658 :     protect_out_of_mem(slen);
     364             : 
     365      177658 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
     366      177658 :     trg->flag = ARRKEY;
     367             : 
     368      177658 :     len = generate_trgm_only(GETARR(trg), str, slen, NULL);
     369      177658 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     370             : 
     371      177658 :     if (len == 0)
     372           8 :         return trg;
     373             : 
     374             :     /*
     375             :      * Make trigrams unique.
     376             :      */
     377      177650 :     if (len > 1)
     378             :     {
     379      177650 :         qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
     380      177650 :         len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
     381             :     }
     382             : 
     383      177650 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     384             : 
     385      177650 :     return trg;
     386             : }
     387             : 
     388             : /*
     389             :  * Make array of positional trigrams from two trigram arrays trg1 and trg2.
     390             :  *
     391             :  * trg1: trigram array of search pattern, of length len1. trg1 is required
     392             :  *       word which positions don't matter and replaced with -1.
     393             :  * trg2: trigram array of text, of length len2. trg2 is haystack where we
     394             :  *       search and have to store its positions.
     395             :  *
     396             :  * Returns concatenated trigram array.
     397             :  */
     398             : static pos_trgm *
     399       24252 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
     400             : {
     401             :     pos_trgm   *result;
     402             :     int         i,
     403       24252 :                 len = len1 + len2;
     404             : 
     405       24252 :     result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
     406             : 
     407      241728 :     for (i = 0; i < len1; i++)
     408             :     {
     409      217476 :         memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
     410      217476 :         result[i].index = -1;
     411             :     }
     412             : 
     413      384410 :     for (i = 0; i < len2; i++)
     414             :     {
     415      360158 :         memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
     416      360158 :         result[i + len1].index = i;
     417             :     }
     418             : 
     419       24252 :     return result;
     420             : }
     421             : 
     422             : /*
     423             :  * Compare position trigrams: compare trigrams first and position second.
     424             :  */
     425             : static int
     426     2615370 : comp_ptrgm(const void *v1, const void *v2)
     427             : {
     428     2615370 :     const pos_trgm *p1 = (const pos_trgm *) v1;
     429     2615370 :     const pos_trgm *p2 = (const pos_trgm *) v2;
     430             :     int         cmp;
     431             : 
     432     2615370 :     cmp = CMPTRGM(p1->trg, p2->trg);
     433     2615370 :     if (cmp != 0)
     434     2535958 :         return cmp;
     435             : 
     436       79412 :     if (p1->index < p2->index)
     437       42736 :         return -1;
     438       36676 :     else if (p1->index == p2->index)
     439           0 :         return 0;
     440             :     else
     441       36676 :         return 1;
     442             : }
     443             : 
     444             : /*
     445             :  * Iterative search function which calculates maximum similarity with word in
     446             :  * the string. Maximum similarity is only calculated only if the flag
     447             :  * WORD_SIMILARITY_CHECK_ONLY isn't set.
     448             :  *
     449             :  * trg2indexes: array which stores indexes of the array "found".
     450             :  * found: array which stores true of false values.
     451             :  * ulen1: count of unique trigrams of array "trg1".
     452             :  * len2: length of array "trg2" and array "trg2indexes".
     453             :  * len: length of the array "found".
     454             :  * flags: set of boolean flags parameterizing similarity calculation.
     455             :  * bounds: whether each trigram is left/right bound of word.
     456             :  *
     457             :  * Returns word similarity.
     458             :  */
     459             : static float4
     460       24252 : iterate_word_similarity(int *trg2indexes,
     461             :                         bool *found,
     462             :                         int ulen1,
     463             :                         int len2,
     464             :                         int len,
     465             :                         uint8 flags,
     466             :                         TrgmBound *bounds)
     467             : {
     468             :     int        *lastpos,
     469             :                 i,
     470       24252 :                 ulen2 = 0,
     471       24252 :                 count = 0,
     472       24252 :                 upper = -1,
     473             :                 lower;
     474             :     float4      smlr_cur,
     475       24252 :                 smlr_max = 0.0f;
     476             :     double      threshold;
     477             : 
     478             :     Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
     479             : 
     480             :     /* Select appropriate threshold */
     481       48504 :     threshold = (flags & WORD_SIMILARITY_STRICT) ?
     482       24252 :         strict_word_similarity_threshold :
     483             :         word_similarity_threshold;
     484             : 
     485             :     /*
     486             :      * Consider first trigram as initial lower bound for strict word
     487             :      * similarity, or initialize it later with first trigram present for plain
     488             :      * word similarity.
     489             :      */
     490       24252 :     lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
     491             : 
     492             :     /* Memorise last position of each trigram */
     493       24252 :     lastpos = (int *) palloc(sizeof(int) * len);
     494       24252 :     memset(lastpos, -1, sizeof(int) * len);
     495             : 
     496      367270 :     for (i = 0; i < len2; i++)
     497             :     {
     498             :         int         trgindex;
     499             : 
     500      346586 :         CHECK_FOR_INTERRUPTS();
     501             : 
     502             :         /* Get index of next trigram */
     503      346586 :         trgindex = trg2indexes[i];
     504             : 
     505             :         /* Update last position of this trigram */
     506      346586 :         if (lower >= 0 || found[trgindex])
     507             :         {
     508      271564 :             if (lastpos[trgindex] < 0)
     509             :             {
     510      267858 :                 ulen2++;
     511      267858 :                 if (found[trgindex])
     512       61512 :                     count++;
     513             :             }
     514      271564 :             lastpos[trgindex] = i;
     515             :         }
     516             : 
     517             :         /*
     518             :          * Adjust upper bound if trigram is upper bound of word for strict
     519             :          * word similarity, or if trigram is present in required substring for
     520             :          * plain word similarity
     521             :          */
     522      500666 :         if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
     523      154080 :             : found[trgindex])
     524             :         {
     525             :             int         prev_lower,
     526             :                         tmp_ulen2,
     527             :                         tmp_lower,
     528             :                         tmp_count;
     529             : 
     530       51270 :             upper = i;
     531       51270 :             if (lower == -1)
     532             :             {
     533        9390 :                 lower = i;
     534        9390 :                 ulen2 = 1;
     535             :             }
     536             : 
     537       51270 :             smlr_cur = CALCSML(count, ulen1, ulen2);
     538             : 
     539             :             /* Also try to adjust lower bound for greater similarity */
     540       51270 :             tmp_count = count;
     541       51270 :             tmp_ulen2 = ulen2;
     542       51270 :             prev_lower = lower;
     543      417144 :             for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
     544             :             {
     545             :                 float       smlr_tmp;
     546             :                 int         tmp_trgindex;
     547             : 
     548             :                 /*
     549             :                  * Adjust lower bound only if trigram is lower bound of word
     550             :                  * for strict word similarity, or consider every trigram as
     551             :                  * lower bound for plain word similarity.
     552             :                  */
     553      369442 :                 if (!(flags & WORD_SIMILARITY_STRICT)
     554      290320 :                     || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
     555             :                 {
     556      119382 :                     smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
     557      119382 :                     if (smlr_tmp > smlr_cur)
     558             :                     {
     559        7024 :                         smlr_cur = smlr_tmp;
     560        7024 :                         ulen2 = tmp_ulen2;
     561        7024 :                         lower = tmp_lower;
     562        7024 :                         count = tmp_count;
     563             :                     }
     564             : 
     565             :                     /*
     566             :                      * If we only check that word similarity is greater than
     567             :                      * threshold we do not need to calculate a maximum
     568             :                      * similarity.
     569             :                      */
     570      119382 :                     if ((flags & WORD_SIMILARITY_CHECK_ONLY)
     571       74228 :                         && smlr_cur >= threshold)
     572        3568 :                         break;
     573             :                 }
     574             : 
     575      365874 :                 tmp_trgindex = trg2indexes[tmp_lower];
     576      365874 :                 if (lastpos[tmp_trgindex] == tmp_lower)
     577             :                 {
     578      361354 :                     tmp_ulen2--;
     579      361354 :                     if (found[tmp_trgindex])
     580       93158 :                         tmp_count--;
     581             :                 }
     582             :             }
     583             : 
     584       51270 :             smlr_max = Max(smlr_max, smlr_cur);
     585             : 
     586             :             /*
     587             :              * if we only check that word similarity is greater than threshold
     588             :              * we do not need to calculate a maximum similarity.
     589             :              */
     590       51270 :             if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
     591        3568 :                 break;
     592             : 
     593       81202 :             for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
     594             :             {
     595             :                 int         tmp_trgindex;
     596             : 
     597       33500 :                 tmp_trgindex = trg2indexes[tmp_lower];
     598       33500 :                 if (lastpos[tmp_trgindex] == tmp_lower)
     599       32002 :                     lastpos[tmp_trgindex] = -1;
     600             :             }
     601             :         }
     602             :     }
     603             : 
     604       24252 :     pfree(lastpos);
     605             : 
     606       24252 :     return smlr_max;
     607             : }
     608             : 
     609             : /*
     610             :  * Calculate word similarity.
     611             :  * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
     612             :  * are used to calculate word similarity using iterate_word_similarity().
     613             :  *
     614             :  * "trg2indexes" is array which stores indexes of the array "found".
     615             :  * In other words:
     616             :  * trg2indexes[j] = i;
     617             :  * found[i] = true (or false);
     618             :  * If found[i] == true then there is trigram trg2[j] in array "trg1".
     619             :  * If found[i] == false then there is not trigram trg2[j] in array "trg1".
     620             :  *
     621             :  * str1: search pattern string, of length slen1 bytes.
     622             :  * str2: text in which we are looking for a word, of length slen2 bytes.
     623             :  * flags: set of boolean flags parameterizing similarity calculation.
     624             :  *
     625             :  * Returns word similarity.
     626             :  */
     627             : static float4
     628       24252 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
     629             :                      uint8 flags)
     630             : {
     631             :     bool       *found;
     632             :     pos_trgm   *ptrg;
     633             :     trgm       *trg1;
     634             :     trgm       *trg2;
     635             :     int         len1,
     636             :                 len2,
     637             :                 len,
     638             :                 i,
     639             :                 j,
     640             :                 ulen1;
     641             :     int        *trg2indexes;
     642             :     float4      result;
     643             :     TrgmBound  *bounds;
     644             : 
     645       24252 :     protect_out_of_mem(slen1 + slen2);
     646             : 
     647             :     /* Make positional trigrams */
     648       24252 :     trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
     649       24252 :     trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
     650       24252 :     if (flags & WORD_SIMILARITY_STRICT)
     651       13324 :         bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
     652             :     else
     653       10928 :         bounds = NULL;
     654             : 
     655       24252 :     len1 = generate_trgm_only(trg1, str1, slen1, NULL);
     656       24252 :     len2 = generate_trgm_only(trg2, str2, slen2, bounds);
     657             : 
     658       24252 :     ptrg = make_positional_trgm(trg1, len1, trg2, len2);
     659       24252 :     len = len1 + len2;
     660       24252 :     qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
     661             : 
     662       24252 :     pfree(trg1);
     663       24252 :     pfree(trg2);
     664             : 
     665             :     /*
     666             :      * Merge positional trigrams array: enumerate each trigram and find its
     667             :      * presence in required word.
     668             :      */
     669       24252 :     trg2indexes = (int *) palloc(sizeof(int) * len2);
     670       24252 :     found = (bool *) palloc0(sizeof(bool) * len);
     671             : 
     672       24252 :     ulen1 = 0;
     673       24252 :     j = 0;
     674      601886 :     for (i = 0; i < len; i++)
     675             :     {
     676      577634 :         if (i > 0)
     677             :         {
     678      553382 :             int         cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
     679             : 
     680      553382 :             if (cmp != 0)
     681             :             {
     682      484980 :                 if (found[j])
     683      202274 :                     ulen1++;
     684      484980 :                 j++;
     685             :             }
     686             :         }
     687             : 
     688      577634 :         if (ptrg[i].index >= 0)
     689             :         {
     690      360158 :             trg2indexes[ptrg[i].index] = j;
     691             :         }
     692             :         else
     693             :         {
     694      217476 :             found[j] = true;
     695             :         }
     696             :     }
     697       24252 :     if (found[j])
     698       15202 :         ulen1++;
     699             : 
     700             :     /* Run iterative procedure to find maximum similarity with word */
     701       24252 :     result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
     702             :                                      flags, bounds);
     703             : 
     704       24252 :     pfree(trg2indexes);
     705       24252 :     pfree(found);
     706       24252 :     pfree(ptrg);
     707             : 
     708       24252 :     return result;
     709             : }
     710             : 
     711             : 
     712             : /*
     713             :  * Extract the next non-wildcard part of a search string, i.e. a word bounded
     714             :  * by '_' or '%' meta-characters, non-word characters or string end.
     715             :  *
     716             :  * str: source string, of length lenstr bytes (need not be null-terminated)
     717             :  * buf: where to return the substring (must be long enough)
     718             :  * *bytelen: receives byte length of the found substring
     719             :  * *charlen: receives character length of the found substring
     720             :  *
     721             :  * Returns pointer to end+1 of the found substring in the source string.
     722             :  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
     723             :  *
     724             :  * If the found word is bounded by non-word characters or string boundaries
     725             :  * then this function will include corresponding padding spaces into buf.
     726             :  */
     727             : static const char *
     728         238 : get_wildcard_part(const char *str, int lenstr,
     729             :                   char *buf, int *bytelen, int *charlen)
     730             : {
     731         238 :     const char *beginword = str;
     732             :     const char *endword;
     733         238 :     char       *s = buf;
     734         238 :     bool        in_leading_wildcard_meta = false;
     735         238 :     bool        in_trailing_wildcard_meta = false;
     736         238 :     bool        in_escape = false;
     737             :     int         clen;
     738             : 
     739             :     /*
     740             :      * Find the first word character, remembering whether preceding character
     741             :      * was wildcard meta-character.  Note that the in_escape state persists
     742             :      * from this loop to the next one, since we may exit at a word character
     743             :      * that is in_escape.
     744             :      */
     745         482 :     while (beginword - str < lenstr)
     746             :     {
     747         372 :         if (in_escape)
     748             :         {
     749           6 :             if (ISWORDCHR(beginword))
     750           6 :                 break;
     751           0 :             in_escape = false;
     752           0 :             in_leading_wildcard_meta = false;
     753             :         }
     754             :         else
     755             :         {
     756         366 :             if (ISESCAPECHAR(beginword))
     757           6 :                 in_escape = true;
     758         360 :             else if (ISWILDCARDCHAR(beginword))
     759         208 :                 in_leading_wildcard_meta = true;
     760         152 :             else if (ISWORDCHR(beginword))
     761         122 :                 break;
     762             :             else
     763          30 :                 in_leading_wildcard_meta = false;
     764             :         }
     765         244 :         beginword += pg_mblen(beginword);
     766             :     }
     767             : 
     768             :     /*
     769             :      * Handle string end.
     770             :      */
     771         238 :     if (beginword - str >= lenstr)
     772         110 :         return NULL;
     773             : 
     774             :     /*
     775             :      * Add left padding spaces if preceding character wasn't wildcard
     776             :      * meta-character.
     777             :      */
     778         128 :     *charlen = 0;
     779         128 :     if (!in_leading_wildcard_meta)
     780             :     {
     781             :         if (LPADDING > 0)
     782             :         {
     783          30 :             *s++ = ' ';
     784          30 :             (*charlen)++;
     785             :             if (LPADDING > 1)
     786             :             {
     787          30 :                 *s++ = ' ';
     788          30 :                 (*charlen)++;
     789             :             }
     790             :         }
     791             :     }
     792             : 
     793             :     /*
     794             :      * Copy data into buf until wildcard meta-character, non-word character or
     795             :      * string boundary.  Strip escapes during copy.
     796             :      */
     797         128 :     endword = beginword;
     798         488 :     while (endword - str < lenstr)
     799             :     {
     800         488 :         clen = pg_mblen(endword);
     801         488 :         if (in_escape)
     802             :         {
     803           6 :             if (ISWORDCHR(endword))
     804             :             {
     805           6 :                 memcpy(s, endword, clen);
     806           6 :                 (*charlen)++;
     807           6 :                 s += clen;
     808             :             }
     809             :             else
     810             :             {
     811             :                 /*
     812             :                  * Back up endword to the escape character when stopping at an
     813             :                  * escaped char, so that subsequent get_wildcard_part will
     814             :                  * restart from the escape character.  We assume here that
     815             :                  * escape chars are single-byte.
     816             :                  */
     817           0 :                 endword--;
     818           0 :                 break;
     819             :             }
     820           6 :             in_escape = false;
     821             :         }
     822             :         else
     823             :         {
     824         482 :             if (ISESCAPECHAR(endword))
     825           0 :                 in_escape = true;
     826         482 :             else if (ISWILDCARDCHAR(endword))
     827             :             {
     828         110 :                 in_trailing_wildcard_meta = true;
     829         110 :                 break;
     830             :             }
     831         372 :             else if (ISWORDCHR(endword))
     832             :             {
     833         354 :                 memcpy(s, endword, clen);
     834         354 :                 (*charlen)++;
     835         354 :                 s += clen;
     836             :             }
     837             :             else
     838          18 :                 break;
     839             :         }
     840         360 :         endword += clen;
     841             :     }
     842             : 
     843             :     /*
     844             :      * Add right padding spaces if next character isn't wildcard
     845             :      * meta-character.
     846             :      */
     847         128 :     if (!in_trailing_wildcard_meta)
     848             :     {
     849             :         if (RPADDING > 0)
     850             :         {
     851          18 :             *s++ = ' ';
     852          18 :             (*charlen)++;
     853             :             if (RPADDING > 1)
     854             :             {
     855             :                 *s++ = ' ';
     856             :                 (*charlen)++;
     857             :             }
     858             :         }
     859             :     }
     860             : 
     861         128 :     *bytelen = s - buf;
     862         128 :     return endword;
     863             : }
     864             : 
     865             : /*
     866             :  * Generates trigrams for wildcard search string.
     867             :  *
     868             :  * Returns array of trigrams that must occur in any string that matches the
     869             :  * wildcard string.  For example, given pattern "a%bcd%" the trigrams
     870             :  * " a", "bcd" would be extracted.
     871             :  */
     872             : TRGM *
     873         110 : generate_wildcard_trgm(const char *str, int slen)
     874             : {
     875             :     TRGM       *trg;
     876             :     char       *buf,
     877             :                *buf2;
     878             :     trgm       *tptr;
     879             :     int         len,
     880             :                 charlen,
     881             :                 bytelen;
     882             :     const char *eword;
     883             : 
     884         110 :     protect_out_of_mem(slen);
     885             : 
     886         110 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
     887         110 :     trg->flag = ARRKEY;
     888         110 :     SET_VARSIZE(trg, TRGMHDRSIZE);
     889             : 
     890         110 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
     891           0 :         return trg;
     892             : 
     893         110 :     tptr = GETARR(trg);
     894             : 
     895             :     /* Allocate a buffer for blank-padded, but not yet case-folded, words */
     896         110 :     buf = palloc(sizeof(char) * (slen + 4));
     897             : 
     898             :     /*
     899             :      * Extract trigrams from each substring extracted by get_wildcard_part.
     900             :      */
     901         110 :     eword = str;
     902         238 :     while ((eword = get_wildcard_part(eword, slen - (eword - str),
     903             :                                       buf, &bytelen, &charlen)) != NULL)
     904             :     {
     905             : #ifdef IGNORECASE
     906         128 :         buf2 = lowerstr_with_len(buf, bytelen);
     907         128 :         bytelen = strlen(buf2);
     908             : #else
     909             :         buf2 = buf;
     910             : #endif
     911             : 
     912             :         /*
     913             :          * count trigrams
     914             :          */
     915         128 :         tptr = make_trigrams(tptr, buf2, bytelen, charlen);
     916             : 
     917             : #ifdef IGNORECASE
     918         128 :         pfree(buf2);
     919             : #endif
     920             :     }
     921             : 
     922         110 :     pfree(buf);
     923             : 
     924         110 :     if ((len = tptr - GETARR(trg)) == 0)
     925          48 :         return trg;
     926             : 
     927             :     /*
     928             :      * Make trigrams unique.
     929             :      */
     930          62 :     if (len > 1)
     931             :     {
     932          34 :         qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
     933          34 :         len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
     934             :     }
     935             : 
     936          62 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     937             : 
     938          62 :     return trg;
     939             : }
     940             : 
     941             : uint32
     942       69546 : trgm2int(trgm *ptr)
     943             : {
     944       69546 :     uint32      val = 0;
     945             : 
     946       69546 :     val |= *(((unsigned char *) ptr));
     947       69546 :     val <<= 8;
     948       69546 :     val |= *(((unsigned char *) ptr) + 1);
     949       69546 :     val <<= 8;
     950       69546 :     val |= *(((unsigned char *) ptr) + 2);
     951             : 
     952       69546 :     return val;
     953             : }
     954             : 
     955             : Datum
     956          14 : show_trgm(PG_FUNCTION_ARGS)
     957             : {
     958          14 :     text       *in = PG_GETARG_TEXT_PP(0);
     959             :     TRGM       *trg;
     960             :     Datum      *d;
     961             :     ArrayType  *a;
     962             :     trgm       *ptr;
     963             :     int         i;
     964             : 
     965          14 :     trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
     966          14 :     d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
     967             : 
     968          88 :     for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
     969             :     {
     970          74 :         text       *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
     971             : 
     972          74 :         if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
     973             :         {
     974           0 :             snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
     975           0 :             SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
     976             :         }
     977             :         else
     978             :         {
     979          74 :             SET_VARSIZE(item, VARHDRSZ + 3);
     980          74 :             CPTRGM(VARDATA(item), ptr);
     981             :         }
     982          74 :         d[i] = PointerGetDatum(item);
     983             :     }
     984             : 
     985          14 :     a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
     986             : 
     987          88 :     for (i = 0; i < ARRNELEM(trg); i++)
     988          74 :         pfree(DatumGetPointer(d[i]));
     989             : 
     990          14 :     pfree(d);
     991          14 :     pfree(trg);
     992          14 :     PG_FREE_IF_COPY(in, 0);
     993             : 
     994          14 :     PG_RETURN_POINTER(a);
     995             : }
     996             : 
     997             : float4
     998      137576 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
     999             : {
    1000             :     trgm       *ptr1,
    1001             :                *ptr2;
    1002      137576 :     int         count = 0;
    1003             :     int         len1,
    1004             :                 len2;
    1005             : 
    1006      137576 :     ptr1 = GETARR(trg1);
    1007      137576 :     ptr2 = GETARR(trg2);
    1008             : 
    1009      137576 :     len1 = ARRNELEM(trg1);
    1010      137576 :     len2 = ARRNELEM(trg2);
    1011             : 
    1012             :     /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
    1013      137576 :     if (len1 <= 0 || len2 <= 0)
    1014           2 :         return (float4) 0.0;
    1015             : 
    1016     1751214 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
    1017             :     {
    1018     1613640 :         int         res = CMPTRGM(ptr1, ptr2);
    1019             : 
    1020     1613640 :         if (res < 0)
    1021      364714 :             ptr1++;
    1022     1248926 :         else if (res > 0)
    1023      426238 :             ptr2++;
    1024             :         else
    1025             :         {
    1026      822688 :             ptr1++;
    1027      822688 :             ptr2++;
    1028      822688 :             count++;
    1029             :         }
    1030             :     }
    1031             : 
    1032             :     /*
    1033             :      * If inexact then len2 is equal to count, because we don't know actual
    1034             :      * length of second string in inexact search and we can assume that count
    1035             :      * is a lower bound of len2.
    1036             :      */
    1037      137574 :     return CALCSML(count, len1, inexact ? count : len2);
    1038             : }
    1039             : 
    1040             : 
    1041             : /*
    1042             :  * Returns whether trg2 contains all trigrams in trg1.
    1043             :  * This relies on the trigram arrays being sorted.
    1044             :  */
    1045             : bool
    1046         380 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
    1047             : {
    1048             :     trgm       *ptr1,
    1049             :                *ptr2;
    1050             :     int         len1,
    1051             :                 len2;
    1052             : 
    1053         380 :     ptr1 = GETARR(trg1);
    1054         380 :     ptr2 = GETARR(trg2);
    1055             : 
    1056         380 :     len1 = ARRNELEM(trg1);
    1057         380 :     len2 = ARRNELEM(trg2);
    1058             : 
    1059        1244 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
    1060             :     {
    1061        1198 :         int         res = CMPTRGM(ptr1, ptr2);
    1062             : 
    1063        1198 :         if (res < 0)
    1064         334 :             return false;
    1065         864 :         else if (res > 0)
    1066         640 :             ptr2++;
    1067             :         else
    1068             :         {
    1069         224 :             ptr1++;
    1070         224 :             ptr2++;
    1071             :         }
    1072             :     }
    1073          46 :     if (ptr1 - GETARR(trg1) < len1)
    1074           8 :         return false;
    1075             :     else
    1076          38 :         return true;
    1077             : }
    1078             : 
    1079             : /*
    1080             :  * Return a palloc'd boolean array showing, for each trigram in "query",
    1081             :  * whether it is present in the trigram array "key".
    1082             :  * This relies on the "key" array being sorted, but "query" need not be.
    1083             :  */
    1084             : bool *
    1085        4300 : trgm_presence_map(TRGM *query, TRGM *key)
    1086             : {
    1087             :     bool       *result;
    1088        4300 :     trgm       *ptrq = GETARR(query),
    1089        4300 :                *ptrk = GETARR(key);
    1090        4300 :     int         lenq = ARRNELEM(query),
    1091        4300 :                 lenk = ARRNELEM(key),
    1092             :                 i;
    1093             : 
    1094        4300 :     result = (bool *) palloc0(lenq * sizeof(bool));
    1095             : 
    1096             :     /* for each query trigram, do a binary search in the key array */
    1097     1015120 :     for (i = 0; i < lenq; i++)
    1098             :     {
    1099     1010820 :         int         lo = 0;
    1100     1010820 :         int         hi = lenk;
    1101             : 
    1102     4747306 :         while (lo < hi)
    1103             :         {
    1104     3752564 :             int         mid = (lo + hi) / 2;
    1105     3752564 :             int         res = CMPTRGM(ptrq, ptrk + mid);
    1106             : 
    1107     3752564 :             if (res < 0)
    1108     1568164 :                 hi = mid;
    1109     2184400 :             else if (res > 0)
    1110     2168322 :                 lo = mid + 1;
    1111             :             else
    1112             :             {
    1113       16078 :                 result[i] = true;
    1114       16078 :                 break;
    1115             :             }
    1116             :         }
    1117     1010820 :         ptrq++;
    1118             :     }
    1119             : 
    1120        4300 :     return result;
    1121             : }
    1122             : 
    1123             : Datum
    1124       62904 : similarity(PG_FUNCTION_ARGS)
    1125             : {
    1126       62904 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1127       62904 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1128             :     TRGM       *trg1,
    1129             :                *trg2;
    1130             :     float4      res;
    1131             : 
    1132       62904 :     trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
    1133       62904 :     trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
    1134             : 
    1135       62904 :     res = cnt_sml(trg1, trg2, false);
    1136             : 
    1137       62904 :     pfree(trg1);
    1138       62904 :     pfree(trg2);
    1139       62904 :     PG_FREE_IF_COPY(in1, 0);
    1140       62904 :     PG_FREE_IF_COPY(in2, 1);
    1141             : 
    1142       62904 :     PG_RETURN_FLOAT4(res);
    1143             : }
    1144             : 
    1145             : Datum
    1146        1804 : word_similarity(PG_FUNCTION_ARGS)
    1147             : {
    1148        1804 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1149        1804 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1150             :     float4      res;
    1151             : 
    1152        3608 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1153        3608 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1154             :                                0);
    1155             : 
    1156        1804 :     PG_FREE_IF_COPY(in1, 0);
    1157        1804 :     PG_FREE_IF_COPY(in2, 1);
    1158        1804 :     PG_RETURN_FLOAT4(res);
    1159             : }
    1160             : 
    1161             : Datum
    1162        1764 : strict_word_similarity(PG_FUNCTION_ARGS)
    1163             : {
    1164        1764 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1165        1764 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1166             :     float4      res;
    1167             : 
    1168        3528 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1169        3528 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1170             :                                WORD_SIMILARITY_STRICT);
    1171             : 
    1172        1764 :     PG_FREE_IF_COPY(in1, 0);
    1173        1764 :     PG_FREE_IF_COPY(in2, 1);
    1174        1764 :     PG_RETURN_FLOAT4(res);
    1175             : }
    1176             : 
    1177             : Datum
    1178        2008 : similarity_dist(PG_FUNCTION_ARGS)
    1179             : {
    1180        2008 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
    1181             :                                                          PG_GETARG_DATUM(0),
    1182             :                                                          PG_GETARG_DATUM(1)));
    1183             : 
    1184        2008 :     PG_RETURN_FLOAT4(1.0 - res);
    1185             : }
    1186             : 
    1187             : Datum
    1188       12000 : similarity_op(PG_FUNCTION_ARGS)
    1189             : {
    1190       12000 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
    1191             :                                                          PG_GETARG_DATUM(0),
    1192             :                                                          PG_GETARG_DATUM(1)));
    1193             : 
    1194       12000 :     PG_RETURN_BOOL(res >= similarity_threshold);
    1195             : }
    1196             : 
    1197             : Datum
    1198        3848 : word_similarity_op(PG_FUNCTION_ARGS)
    1199             : {
    1200        3848 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1201        3848 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1202             :     float4      res;
    1203             : 
    1204        7696 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1205        7696 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1206             :                                WORD_SIMILARITY_CHECK_ONLY);
    1207             : 
    1208        3848 :     PG_FREE_IF_COPY(in1, 0);
    1209        3848 :     PG_FREE_IF_COPY(in2, 1);
    1210        3848 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
    1211             : }
    1212             : 
    1213             : Datum
    1214        3848 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
    1215             : {
    1216        3848 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1217        3848 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1218             :     float4      res;
    1219             : 
    1220        7696 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1221        7696 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1222             :                                WORD_SIMILARITY_CHECK_ONLY);
    1223             : 
    1224        3848 :     PG_FREE_IF_COPY(in1, 0);
    1225        3848 :     PG_FREE_IF_COPY(in2, 1);
    1226        3848 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
    1227             : }
    1228             : 
    1229             : Datum
    1230           0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
    1231             : {
    1232           0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1233           0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1234             :     float4      res;
    1235             : 
    1236           0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1237           0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1238             :                                0);
    1239             : 
    1240           0 :     PG_FREE_IF_COPY(in1, 0);
    1241           0 :     PG_FREE_IF_COPY(in2, 1);
    1242           0 :     PG_RETURN_FLOAT4(1.0 - res);
    1243             : }
    1244             : 
    1245             : Datum
    1246        1428 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
    1247             : {
    1248        1428 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1249        1428 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1250             :     float4      res;
    1251             : 
    1252        2856 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1253        2856 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1254             :                                0);
    1255             : 
    1256        1428 :     PG_FREE_IF_COPY(in1, 0);
    1257        1428 :     PG_FREE_IF_COPY(in2, 1);
    1258        1428 :     PG_RETURN_FLOAT4(1.0 - res);
    1259             : }
    1260             : 
    1261             : Datum
    1262        5060 : strict_word_similarity_op(PG_FUNCTION_ARGS)
    1263             : {
    1264        5060 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1265        5060 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1266             :     float4      res;
    1267             : 
    1268       10120 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1269       10120 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1270             :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
    1271             : 
    1272        5060 :     PG_FREE_IF_COPY(in1, 0);
    1273        5060 :     PG_FREE_IF_COPY(in2, 1);
    1274        5060 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
    1275             : }
    1276             : 
    1277             : Datum
    1278        5060 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
    1279             : {
    1280        5060 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1281        5060 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1282             :     float4      res;
    1283             : 
    1284       10120 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1285       10120 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1286             :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
    1287             : 
    1288        5060 :     PG_FREE_IF_COPY(in1, 0);
    1289        5060 :     PG_FREE_IF_COPY(in2, 1);
    1290        5060 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
    1291             : }
    1292             : 
    1293             : Datum
    1294           0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
    1295             : {
    1296           0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1297           0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1298             :     float4      res;
    1299             : 
    1300           0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1301           0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1302             :                                WORD_SIMILARITY_STRICT);
    1303             : 
    1304           0 :     PG_FREE_IF_COPY(in1, 0);
    1305           0 :     PG_FREE_IF_COPY(in2, 1);
    1306           0 :     PG_RETURN_FLOAT4(1.0 - res);
    1307             : }
    1308             : 
    1309             : Datum
    1310        1440 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
    1311             : {
    1312        1440 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1313        1440 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1314             :     float4      res;
    1315             : 
    1316        2880 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1317        2880 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1318             :                                WORD_SIMILARITY_STRICT);
    1319             : 
    1320        1440 :     PG_FREE_IF_COPY(in1, 0);
    1321        1440 :     PG_FREE_IF_COPY(in2, 1);
    1322        1440 :     PG_RETURN_FLOAT4(1.0 - res);
    1323             : }

Generated by: LCOV version 1.14