LCOV - code coverage report
Current view: top level - contrib/pg_trgm - trgm_op.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 449 491 91.4 %
Date: 2019-09-22 08:06:49 Functions: 51 53 96.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * contrib/pg_trgm/trgm_op.c
       3             :  */
       4             : #include "postgres.h"
       5             : 
       6             : #include <ctype.h>
       7             : 
       8             : #include "trgm.h"
       9             : 
      10             : #include "catalog/pg_type.h"
      11             : #include "tsearch/ts_locale.h"
      12             : #include "utils/lsyscache.h"
      13             : #include "utils/memutils.h"
      14             : #include "utils/pg_crc.h"
      15             : 
      16           6 : PG_MODULE_MAGIC;
      17             : 
      18             : /*
                     :  * GUC variables.  _PG_init() binds each of these to the pg_trgm.* setting
                     :  * with the matching name.
                     :  * NOTE(review): the initializers are float literals stored into doubles, so
                     :  * they are not exactly the 0.3 / 0.6 / 0.5 values passed in _PG_init();
                     :  * presumably the GUC machinery overwrites them at load time -- confirm.
                     :  */
      19             : double      similarity_threshold = 0.3f;
      20             : double      word_similarity_threshold = 0.6f;
      21             : double      strict_word_similarity_threshold = 0.5f;
      22             : 
      23             : void        _PG_init(void);
      24             : 
      25           4 : PG_FUNCTION_INFO_V1(set_limit);
      26           4 : PG_FUNCTION_INFO_V1(show_limit);
      27           4 : PG_FUNCTION_INFO_V1(show_trgm);
      28           4 : PG_FUNCTION_INFO_V1(similarity);
      29           4 : PG_FUNCTION_INFO_V1(word_similarity);
      30           4 : PG_FUNCTION_INFO_V1(strict_word_similarity);
      31           4 : PG_FUNCTION_INFO_V1(similarity_dist);
      32           4 : PG_FUNCTION_INFO_V1(similarity_op);
      33           4 : PG_FUNCTION_INFO_V1(word_similarity_op);
      34           4 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
      35           2 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
      36           4 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
      37           4 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
      38           4 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
      39           2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
      40           4 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
      41             : 
      42             : /*
                     :  * Trigram with position.
                     :  * make_positional_trgm() sets "index" to -1 for trigrams of the search
                     :  * pattern and to the trigram's offset in the text array for text trigrams.
                     :  */
      43             : typedef struct
      44             : {
      45             :     trgm        trg;        /* the trigram itself */
      46             :     int         index;      /* offset in the text array, or -1 */
      47             : } pos_trgm;
      48             : 
      49             : /* Trigram bound type: bit mask of TRGM_BOUND_* values, one entry per trigram */
      50             : typedef uint8 TrgmBound;
      51             : #define TRGM_BOUND_LEFT             0x01    /* trigram is left bound of word */
      52             : #define TRGM_BOUND_RIGHT            0x02    /* trigram is right bound of word */
      53             : 
      54             : /* Word similarity flags: bit mask tested by iterate_word_similarity() */
      55             : #define WORD_SIMILARITY_CHECK_ONLY  0x01    /* only check existence of similar
      56             :                                              * search pattern in text */
      57             : #define WORD_SIMILARITY_STRICT      0x02    /* force bounds of extent to match
      58             :                                              * word bounds */
      59             : 
      60             : /*
      61             :  * Module load callback.
                     :  *
                     :  * Defines the three real-valued settings pg_trgm.similarity_threshold,
                     :  * pg_trgm.word_similarity_threshold and
                     :  * pg_trgm.strict_word_similarity_threshold.  Each one is stored in the
                     :  * file-level variable of the same name, is described as having the valid
                     :  * range 0.0 .. 1.0, and is defined with PGC_USERSET.
      62             :  */
      63             : void
      64           6 : _PG_init(void)
      65             : {
      66             :     /* Define custom GUC variables. */
      67           6 :     DefineCustomRealVariable("pg_trgm.similarity_threshold",
      68             :                              "Sets the threshold used by the % operator.",
      69             :                              "Valid range is 0.0 .. 1.0.",
      70             :                              &similarity_threshold,
      71             :                              0.3,
      72             :                              0.0,
      73             :                              1.0,
      74             :                              PGC_USERSET,
      75             :                              0,
      76             :                              NULL,
      77             :                              NULL,
      78             :                              NULL);
      79           6 :     DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
      80             :                              "Sets the threshold used by the <% operator.",
      81             :                              "Valid range is 0.0 .. 1.0.",
      82             :                              &word_similarity_threshold,
      83             :                              0.6,
      84             :                              0.0,
      85             :                              1.0,
      86             :                              PGC_USERSET,
      87             :                              0,
      88             :                              NULL,
      89             :                              NULL,
      90             :                              NULL);
      91           6 :     DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
      92             :                              "Sets the threshold used by the <<% operator.",
      93             :                              "Valid range is 0.0 .. 1.0.",
      94             :                              &strict_word_similarity_threshold,
      95             :                              0.5,
      96             :                              0.0,
      97             :                              1.0,
      98             :                              PGC_USERSET,
      99             :                              0,
     100             :                              NULL,
     101             :                              NULL,
     102             :                              NULL);
     103           6 : }
     104             : 
     105             : /*
     106             :  * Deprecated function.
     107             :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
                     :  *
                     :  * Takes the new limit as a float4 argument, renders it as text with the
                     :  * float4 output function and assigns that text to
                     :  * "pg_trgm.similarity_threshold" through SetConfigOption().  Returns the
                     :  * value of similarity_threshold as it stands after that call.
     108             :  */
     109             : Datum
     110           4 : set_limit(PG_FUNCTION_ARGS)
     111             : {
     112           4 :     float4      nlimit = PG_GETARG_FLOAT4(0);
     113             :     char       *nlimit_str;
     114             :     Oid         func_out_oid;
     115             :     bool        is_varlena;
     116             : 
     117           4 :     getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
     118             : 
     119           4 :     nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
     120             : 
     121           4 :     SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
     122             :                     PGC_USERSET, PGC_S_SESSION);
     123             : 
     124           4 :     PG_RETURN_FLOAT4(similarity_threshold);
     125             : }
     126             : 
     127             : 
     128             : /*
     129             :  * Get similarity threshold for given index scan strategy number.
                     :  *
                     :  * Maps SimilarityStrategyNumber, WordSimilarityStrategyNumber and
                     :  * StrictWordSimilarityStrategyNumber to similarity_threshold,
                     :  * word_similarity_threshold and strict_word_similarity_threshold
                     :  * respectively.  Any other strategy is reported with elog(ERROR).
     130             :  */
     131             : double
     132       80866 : index_strategy_get_limit(StrategyNumber strategy)
     133             : {
     134       80866 :     switch (strategy)
     135             :     {
     136             :         case SimilarityStrategyNumber:
     137       59274 :             return similarity_threshold;
     138             :         case WordSimilarityStrategyNumber:
     139        9636 :             return word_similarity_threshold;
     140             :         case StrictWordSimilarityStrategyNumber:
     141       11956 :             return strict_word_similarity_threshold;
     142             :         default:
     143           0 :             elog(ERROR, "unrecognized strategy number: %d", strategy);
     144             :             break;
     145             :     }
     146             : 
     147             :     return 0.0;                 /* keep compiler quiet */
     148             : }
     149             : 
     150             : /*
     151             :  * Deprecated function.
     152             :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
                     :  *
                     :  * Returns the current value of similarity_threshold as a float4.
     153             :  */
     154             : Datum
     155       40000 : show_limit(PG_FUNCTION_ARGS)
     156             : {
     157       40000 :     PG_RETURN_FLOAT4(similarity_threshold);
     158             : }
     159             : /* Comparator handed to qsort() by generate_trgm(); returns CMPTRGM(a, b). */
     160             : static int
     161     4550882 : comp_trgm(const void *a, const void *b)
     162             : {
     163     4550882 :     return CMPTRGM(a, b);
     164             : }
     165             : /*
                     :  * Collapse runs of equal adjacent trigrams in place.
                     :  *
                     :  * a: trigram array of len elements (generate_trgm() sorts it first, so
                     :  *    equal trigrams are adjacent).
                     :  *
                     :  * Returns the number of trigrams kept at the front of a.  Note that the
                     :  * result is curend + 1 - a, so len == 0 would yield 1; generate_trgm() only
                     :  * calls this with len > 1.
                     :  */
     166             : static int
     167      171104 : unique_array(trgm *a, int len)
     168             : {
     169             :     trgm       *curend,
     170             :                *tmp;
     171             : 
     172      171104 :     curend = tmp = a;
     173     1861718 :     while (tmp - a < len)
     174     1519510 :         if (CMPTRGM(tmp, curend))
     175             :         {
     176     1345168 :             curend++;
     177     1345168 :             CPTRGM(curend, tmp);
     178     1345168 :             tmp++;
     179             :         }
     180             :         else
     181      174342 :             tmp++;
     182             : 
     183      171104 :     return curend + 1 - a;
     184             : }
     185             : 
     186             : /*
     187             :  * Finds first word in string, returns pointer to the word,
     188             :  * endword points to the character after word
                     :  *
                     :  * str: string to scan, of length lenstr bytes.
                     :  * endword: output, set to the first byte after the word.
                     :  * charlen: output, set to the number of characters in the word (one per
                     :  *          pg_mblen() step).
                     :  *
                     :  * Returns NULL if no ISWORDCHR character is found within lenstr bytes; in
                     :  * that case *endword and *charlen are left unmodified.
     189             :  */
     190             : static char *
     191      465576 : find_word(char *str, int lenstr, char **endword, int *charlen)
     192             : {
     193      465576 :     char       *beginword = str;
     194             : 
     195      958250 :     while (beginword - str < lenstr && !ISWORDCHR(beginword))
     196       27098 :         beginword += pg_mblen(beginword);
     197             : 
     198      465576 :     if (beginword - str >= lenstr)
     199      219596 :         return NULL;
     200             : 
     201      245980 :     *endword = beginword;
     202      245980 :     *charlen = 0;
     203     2343054 :     while (*endword - str < lenstr && ISWORDCHR(*endword))
     204             :     {
     205     1851094 :         *endword += pg_mblen(*endword);
     206     1851094 :         (*charlen)++;
     207             :     }
     208             : 
     209      245980 :     return beginword;
     210             : }
     211             : 
     212             : /*
     213             :  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
     214             :  * which is always exactly three bytes.  If we have three single-byte
     215             :  * characters, we just use them as-is; otherwise we form a hash value.
                     :  *
                     :  * tptr: where to store the resulting trgm.
                     :  * str: start of the three characters.
                     :  * bytelen: total length of those three characters in bytes.
     216             :  */
     217             : void
     218        2414 : compact_trigram(trgm *tptr, char *str, int bytelen)
     219             : {
     220        2414 :     if (bytelen == 3)
     221             :     {
     222        2414 :         CPTRGM(tptr, str);
     223             :     }
     224             :     else
     225             :     {
     226             :         pg_crc32    crc;
     227             : 
     228           0 :         INIT_LEGACY_CRC32(crc);
     229           0 :         COMP_LEGACY_CRC32(crc, str, bytelen);
     230           0 :         FIN_LEGACY_CRC32(crc);
     231             : 
     232             :         /*
     233             :          * use only 3 upper bytes from crc, hope, it's good enough hashing
                     :          *
                     :          * NOTE(review): CPTRGM is given the address of crc, so which three
                     :          * bytes of the value are taken presumably depends on CPTRGM's
                     :          * definition and on byte order -- confirm the "upper" wording.
     234             :          */
     235           0 :         CPTRGM(tptr, &crc);
     236             :     }
     237        2414 : }
     238             : 
     239             : /*
     240             :  * Adds trigrams from words (already padded).
                     :  *
                     :  * tptr: where to store the next trigram.
                     :  * str: padded word, bytelen bytes and charlen characters long.
                     :  *
                     :  * Returns tptr advanced past the trigrams written.  Nothing is written when
                     :  * charlen < 3.
     241             :  */
     242             : static trgm *
     243      246044 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
     244             : {
     245      246044 :     char       *ptr = str;
     246             : 
     247      246044 :     if (charlen < 3)
     248           6 :         return tptr;
     249             : 
     250      246038 :     if (bytelen > charlen)
     251             :     {
     252             :         /* Find multibyte character boundaries and apply compact_trigram */
     253           0 :         int         lenfirst = pg_mblen(str),
     254           0 :                     lenmiddle = pg_mblen(str + lenfirst),
     255           0 :                     lenlast = pg_mblen(str + lenfirst + lenmiddle);
     256             : 
     257           0 :         while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
     258             :         {
     259           0 :             compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
     260             : 
     261           0 :             ptr += lenfirst;
     262           0 :             tptr++;
     263             : 
                     :             /* Slide the three-character window forward by one character */
     264           0 :             lenfirst = lenmiddle;
     265           0 :             lenmiddle = lenlast;
     266           0 :             lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
     267             :         }
     268             :     }
     269             :     else
     270             :     {
     271             :         /* Fast path when there are no multibyte characters */
     272             :         Assert(bytelen == charlen);
     273             : 
     274     2589268 :         while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
     275             :         {
     276     2097192 :             CPTRGM(tptr, ptr);
     277     2097192 :             ptr++;
     278     2097192 :             tptr++;
     279             :         }
     280             :     }
     281             : 
     282      246038 :     return tptr;
     283             : }
     284             : 
     285             : /*
     286             :  * Make array of trigrams without sorting and removing duplicate items.
     287             :  *
     288             :  * trg: where to return the array of trigrams.
     289             :  * str: source string, of length slen bytes.
     290             :  * bounds: where to return bounds of trigrams (if needed).
                     :  *         Entries are OR-ed into, never assigned, so presumably the caller
                     :  *         passes a zero-initialized array -- confirm.  May be NULL.
     291             :  *
     292             :  * Returns length of the generated array.
                     :  * Returns 0 without writing to trg when slen is 0.
     293             :  */
     294             : static int
     295      219598 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
     296             : {
     297             :     trgm       *tptr;
     298             :     char       *buf;
     299             :     int         charlen,
     300             :                 bytelen;
     301             :     char       *bword,
     302             :                *eword;
     303             : 
     304      219598 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
     305           2 :         return 0;
     306             : 
     307      219596 :     tptr = trg;
     308             : 
     309             :     /* Allocate a buffer for case-folded, blank-padded words */
     310      219596 :     buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
     311             : 
     312             :     if (LPADDING > 0)
     313             :     {
     314      219596 :         *buf = ' ';
     315             :         if (LPADDING > 1)
     316      219596 :             *(buf + 1) = ' ';
     317             :     }
     318             : 
                     :     /* Each find_word() call resumes at the end of the previous word */
     319      219596 :     eword = str;
     320      685172 :     while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
     321             :     {
     322             : #ifdef IGNORECASE
     323      245980 :         bword = lowerstr_with_len(bword, eword - bword);
     324      245980 :         bytelen = strlen(bword);
     325             : #else
     326             :         bytelen = eword - bword;
     327             : #endif
     328             : 
     329      245980 :         memcpy(buf + LPADDING, bword, bytelen);
     330             : 
     331             : #ifdef IGNORECASE
     332      245980 :         pfree(bword);
     333             : #endif
     334             : 
                     :         /* Two trailing blanks are always stored, whatever RPADDING is */
     335      245980 :         buf[LPADDING + bytelen] = ' ';
     336      245980 :         buf[LPADDING + bytelen + 1] = ' ';
     337             : 
     338             :         /* Calculate trigrams marking their bounds if needed */
     339      245980 :         if (bounds)
     340       24796 :             bounds[tptr - trg] |= TRGM_BOUND_LEFT;
     341      245980 :         tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
     342             :                              charlen + LPADDING + RPADDING);
     343      245980 :         if (bounds)
     344       24796 :             bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
     345             :     }
     346             : 
     347      219596 :     pfree(buf);
     348             : 
     349      219596 :     return tptr - trg;
     350             : }
     351             : 
     352             : /*
     353             :  * Guard against possible overflow in the palloc requests below.  (We
     354             :  * don't worry about the additive constants, since palloc can detect
     355             :  * requests that are a little above MaxAllocSize --- we just need to
     356             :  * prevent integer overflow in the multiplications.)
                     :  *
                     :  * slen: source string length in bytes.  It is checked against the two
                     :  *       products callers compute from it: sizeof(trgm) * (slen / 2 + 1) * 3
                     :  *       and slen * pg_database_encoding_max_length().
                     :  *
                     :  * Raises ERROR with ERRCODE_PROGRAM_LIMIT_EXCEEDED ("out of memory") if
                     :  * either limit is reached.
     357             :  */
     358             : static void
     359      195392 : protect_out_of_mem(int slen)
     360             : {
     361      390784 :     if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
     362      195392 :         (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
     363           0 :         ereport(ERROR,
     364             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     365             :                  errmsg("out of memory")));
     366      195392 : }
     367             : 
     368             : /*
     369             :  * Make array of trigrams with sorting and removing duplicate items.
     370             :  *
     371             :  * str: source string, of length slen bytes.
     372             :  *
     373             :  * Returns the sorted array of unique trigrams.
                     :  * The result is a palloc'd TRGM with flag ARRKEY whose varlena size is set
                     :  * from the final trigram count; it holds no trigrams when none could be
                     :  * generated from str.
     374             :  */
     375             : TRGM *
     376      171094 : generate_trgm(char *str, int slen)
     377             : {
     378             :     TRGM       *trg;
     379             :     int         len;
     380             : 
     381      171094 :     protect_out_of_mem(slen);
     382             : 
     383      171094 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
     384      171094 :     trg->flag = ARRKEY;
     385             : 
     386      171094 :     len = generate_trgm_only(GETARR(trg), str, slen, NULL);
     387      171094 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     388             : 
     389      171094 :     if (len == 0)
     390           8 :         return trg;
     391             : 
     392             :     /*
     393             :      * Make trigrams unique.
                     :      * Sorting first places equal trigrams next to each other, which is
                     :      * what unique_array() relies on.
     394             :      */
     395      171086 :     if (len > 1)
     396             :     {
     397      171086 :         qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
     398      171086 :         len = unique_array(GETARR(trg), len);
     399             :     }
     400             : 
                     :     /* Record the size again, since len may have shrunk */
     401      171086 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     402             : 
     403      171086 :     return trg;
     404             : }
     405             : 
     406             : /*
     407             :  * Make array of positional trigrams from two trigram arrays trg1 and trg2.
     408             :  *
     409             :  * trg1: trigram array of search pattern, of length len1.  The positions of
     410             :  *       these trigrams don't matter, so their index is set to -1.
     411             :  * trg2: trigram array of text, of length len2.  trg2 is the haystack in which
     412             :  *       we search, so each trigram keeps its position (0 .. len2 - 1).
     413             :  *
     414             :  * Returns concatenated trigram array.
                     :  * The result is palloc'd and holds len1 + len2 entries: those of trg1
                     :  * followed by those of trg2.
     415             :  */
     416             : static pos_trgm *
     417       24252 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
     418             : {
     419             :     pos_trgm   *result;
     420             :     int         i,
     421       24252 :                 len = len1 + len2;
     422             : 
     423       24252 :     result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
     424             : 
     425      241728 :     for (i = 0; i < len1; i++)
     426             :     {
     427      217476 :         memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
     428      217476 :         result[i].index = -1;
     429             :     }
     430             : 
     431      384430 :     for (i = 0; i < len2; i++)
     432             :     {
     433      360178 :         memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
     434      360178 :         result[i + len1].index = i;
     435             :     }
     436             : 
     437       24252 :     return result;
     438             : }
     439             : 
     440             : /*
     441             :  * Compare position trigrams: compare trigrams first and position second.
                     :  *
                     :  * v1, v2: pointers to pos_trgm entries.
                     :  *
                     :  * Returns the CMPTRGM result when the trigrams differ; otherwise -1, 0 or 1
                     :  * according to the order of the two index values.  An index of -1 therefore
                     :  * sorts before every non-negative position of the same trigram.
     442             :  */
     443             : static int
     444     2615444 : comp_ptrgm(const void *v1, const void *v2)
     445             : {
     446     2615444 :     const pos_trgm *p1 = (const pos_trgm *) v1;
     447     2615444 :     const pos_trgm *p2 = (const pos_trgm *) v2;
     448             :     int         cmp;
     449             : 
     450     2615444 :     cmp = CMPTRGM(p1->trg, p2->trg);
     451     2615444 :     if (cmp != 0)
     452     2536036 :         return cmp;
     453             : 
     454       79408 :     if (p1->index < p2->index)
     455       42742 :         return -1;
     456       36666 :     else if (p1->index == p2->index)
     457           0 :         return 0;
     458             :     else
     459       36666 :         return 1;
     460             : }
     461             : 
     462             : /*
     463             :  * Iterative search function which calculates maximum similarity with word in
     464             :  * the string.  The maximum is only fully calculated when
                     :  * WORD_SIMILARITY_CHECK_ONLY is not set in flags; with that flag the search
                     :  * stops as soon as the similarity reaches the applicable threshold.
     465             :  *
     466             :  * trg2indexes: array which stores indexes of the array "found".
     467             :  * found: array which stores true or false values.
     468             :  * ulen1: count of unique trigrams of array "trg1".
     469             :  * len2: length of array "trg2" and array "trg2indexes".
     470             :  * len: length of the array "found".
     471             :  * flags: set of boolean flags parametrizing similarity calculation.
     472             :  * bounds: whether each trigram is left/right bound of word.  Must not be
                     :  *         NULL when WORD_SIMILARITY_STRICT is set.
     473             :  *
     474             :  * Returns word similarity.
                     :  * That is the largest similarity seen, or 0.0 if no extent was evaluated.
     475             :  */
     476             : static float4
     477       24252 : iterate_word_similarity(int *trg2indexes,
     478             :                         bool *found,
     479             :                         int ulen1,
     480             :                         int len2,
     481             :                         int len,
     482             :                         uint8 flags,
     483             :                         TrgmBound *bounds)
     484             : {
     485             :     int        *lastpos,
     486             :                 i,
     487       24252 :                 ulen2 = 0,
     488       24252 :                 count = 0,
     489       24252 :                 upper = -1,
     490             :                 lower;
     491             :     float4      smlr_cur,
     492       24252 :                 smlr_max = 0.0f;
     493             :     double      threshold;
     494             : 
     495             :     Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
     496             : 
     497             :     /* Select appropriate threshold */
     498       48504 :     threshold = (flags & WORD_SIMILARITY_STRICT) ?
     499       24252 :         strict_word_similarity_threshold :
     500             :         word_similarity_threshold;
     501             : 
     502             :     /*
     503             :      * Consider first trigram as initial lower bound for strict word
     504             :      * similarity, or initialize it later with first trigram present for plain
     505             :      * word similarity.
     506             :      */
     507       24252 :     lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
     508             : 
     509             :     /* Memorise last position of each trigram */
     510       24252 :     lastpos = (int *) palloc(sizeof(int) * len);
     511       24252 :     memset(lastpos, -1, sizeof(int) * len);
     512             : 
     513      367290 :     for (i = 0; i < len2; i++)
     514             :     {
     515             :         /* Get index of next trigram */
     516      346606 :         int         trgindex = trg2indexes[i];
     517             : 
     518             :         /* Update last position of this trigram */
     519      346606 :         if (lower >= 0 || found[trgindex])
     520             :         {
     521      271578 :             if (lastpos[trgindex] < 0)
     522             :             {
     523      267874 :                 ulen2++;
     524      267874 :                 if (found[trgindex])
     525       61512 :                     count++;
     526             :             }
     527      271578 :             lastpos[trgindex] = i;
     528             :         }
     529             : 
     530             :         /*
     531             :          * Adjust upper bound if trigram is upper bound of word for strict
     532             :          * word similarity, or if trigram is present in required substring for
     533             :          * plain word similarity
     534             :          */
     535      500690 :         if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
     536      154084 :             : found[trgindex])
     537             :         {
     538             :             int         prev_lower,
     539             :                         tmp_ulen2,
     540             :                         tmp_lower,
     541             :                         tmp_count;
     542             : 
     543       51270 :             upper = i;
     544       51270 :             if (lower == -1)
     545             :             {
     546        9390 :                 lower = i;
     547        9390 :                 ulen2 = 1;
     548             :             }
     549             : 
     550       51270 :             smlr_cur = CALCSML(count, ulen1, ulen2);
     551             : 
     552             :             /* Also try to adjust lower bound for greater similarity */
     553       51270 :             tmp_count = count;
     554       51270 :             tmp_ulen2 = ulen2;
     555       51270 :             prev_lower = lower;
     556      417176 :             for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
     557             :             {
     558             :                 float       smlr_tmp;
     559             :                 int         tmp_trgindex;
     560             : 
     561             :                 /*
     562             :                  * Adjust lower bound only if trigram is lower bound of word
     563             :                  * for strict word similarity, or consider every trigram as
     564             :                  * lower bound for plain word similarity.
     565             :                  */
     566      369474 :                 if (!(flags & WORD_SIMILARITY_STRICT)
     567      290360 :                     || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
     568             :                 {
     569      119378 :                     smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
     570      119378 :                     if (smlr_tmp > smlr_cur)
     571             :                     {
     572        7022 :                         smlr_cur = smlr_tmp;
     573        7022 :                         ulen2 = tmp_ulen2;
     574        7022 :                         lower = tmp_lower;
     575        7022 :                         count = tmp_count;
     576             :                     }
     577             : 
     578             :                     /*
     579             :                      * If we only check that word similarity is greater than
     580             :                      * threshold we do not need to calculate a maximum
     581             :                      * similarity.
     582             :                      */
     583      119378 :                     if ((flags & WORD_SIMILARITY_CHECK_ONLY)
     584       74228 :                         && smlr_cur >= threshold)
     585        3568 :                         break;
     586             :                 }
     587             : 
                     :                 /*
                     :                  * The trigram at tmp_lower drops out of the candidate extent;
                     :                  * if this was its last occurrence, the temporary counters
                     :                  * shrink accordingly.
                     :                  */
     588      365906 :                 tmp_trgindex = trg2indexes[tmp_lower];
     589      365906 :                 if (lastpos[tmp_trgindex] == tmp_lower)
     590             :                 {
     591      361388 :                     tmp_ulen2--;
     592      361388 :                     if (found[tmp_trgindex])
     593       93158 :                         tmp_count--;
     594             :                 }
     595             :             }
     596             : 
     597       51270 :             smlr_max = Max(smlr_max, smlr_cur);
     598             : 
     599             :             /*
     600             :              * if we only check that word similarity is greater than threshold
     601             :              * we do not need to calculate a maximum similarity.
     602             :              */
     603       51270 :             if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
     604        3568 :                 break;
     605             : 
                     :             /* Forget positions that now lie before the adjusted lower bound */
     606       81198 :             for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
     607             :             {
     608             :                 int         tmp_trgindex;
     609             : 
     610       33496 :                 tmp_trgindex = trg2indexes[tmp_lower];
     611       33496 :                 if (lastpos[tmp_trgindex] == tmp_lower)
     612       32000 :                     lastpos[tmp_trgindex] = -1;
     613             :             }
     614             :         }
     615             :     }
     616             : 
     617       24252 :     pfree(lastpos);
     618             : 
     619       24252 :     return smlr_max;
     620             : }
     621             : 
     622             : /*
     623             :  * Calculate word similarity.
     624             :  * This function prepares two arrays: "trg2indexes" and "found". These arrays
     625             :  * are then used to calculate word similarity using iterate_word_similarity().
     626             :  *
     627             :  * "trg2indexes" is array which stores indexes of the array "found".
     628             :  * In other words:
     629             :  * trg2indexes[j] = i;
     630             :  * found[i] = true (or false);
     631             :  * If found[i] == true then there is trigram trg2[j] in array "trg1".
     632             :  * If found[i] == false then there is not trigram trg2[j] in array "trg1".
     633             :  *
     634             :  * str1: search pattern string, of length slen1 bytes.
     635             :  * str2: text in which we are looking for a word, of length slen2 bytes.
     636             :  * flags: set of boolean flags parametrizing similarity calculation.
     637             :  *
     638             :  * Returns word similarity.
     639             :  */
     640             : static float4
     641       24252 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
     642             :                      uint8 flags)
     643             : {
     644             :     bool       *found;
     645             :     pos_trgm   *ptrg;
     646             :     trgm       *trg1;
     647             :     trgm       *trg2;
     648             :     int         len1,
     649             :                 len2,
     650             :                 len,
     651             :                 i,
     652             :                 j,
     653             :                 ulen1;
     654             :     int        *trg2indexes;
     655             :     float4      result;
     656             :     TrgmBound  *bounds;
     657             : 
     658       24252 :     protect_out_of_mem(slen1 + slen2);
     659             : 
     660             :     /* Make positional trigrams */
     661       24252 :     trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
     662       24252 :     trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
     663       24252 :     if (flags & WORD_SIMILARITY_STRICT)
     664       13324 :         bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
     665             :     else
     666       10928 :         bounds = NULL;
     667             : 
     668       24252 :     len1 = generate_trgm_only(trg1, str1, slen1, NULL);
     669       24252 :     len2 = generate_trgm_only(trg2, str2, slen2, bounds);
     670             : 
     671       24252 :     ptrg = make_positional_trgm(trg1, len1, trg2, len2);
     672       24252 :     len = len1 + len2;
     673       24252 :     qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
     674             : 
     675       24252 :     pfree(trg1);
     676       24252 :     pfree(trg2);
     677             : 
     678             :     /*
     679             :      * Merge positional trigrams array: enumerate each trigram and find its
     680             :      * presence in required word.
     681             :      */
     682       24252 :     trg2indexes = (int *) palloc(sizeof(int) * len2);
     683       24252 :     found = (bool *) palloc0(sizeof(bool) * len);
     684             : 
     685       24252 :     ulen1 = 0;
     686       24252 :     j = 0;
     687      601906 :     for (i = 0; i < len; i++)
     688             :     {
     689      577654 :         if (i > 0)
     690             :         {
     691      553402 :             int         cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
     692             : 
     693      553402 :             if (cmp != 0)
     694             :             {
     695      485002 :                 if (found[j])
     696      202278 :                     ulen1++;
     697      485002 :                 j++;
     698             :             }
     699             :         }
     700             : 
     701      577654 :         if (ptrg[i].index >= 0)
     702             :         {
     703      360178 :             trg2indexes[ptrg[i].index] = j;
     704             :         }
     705             :         else
     706             :         {
     707      217476 :             found[j] = true;
     708             :         }
     709             :     }
     710       24252 :     if (found[j])
     711       15198 :         ulen1++;
     712             : 
     713             :     /* Run iterative procedure to find maximum similarity with word */
     714       24252 :     result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
     715             :                                      flags, bounds);
     716             : 
     717       24252 :     pfree(trg2indexes);
     718       24252 :     pfree(found);
     719       24252 :     pfree(ptrg);
     720             : 
     721       24252 :     return result;
     722             : }
     723             : 
     724             : 
     725             : /*
     726             :  * Extract the next non-wildcard part of a search string, i.e. a word bounded
     727             :  * by '_' or '%' meta-characters, non-word characters or string end.
     728             :  *
     729             :  * str: source string, of length lenstr bytes (need not be null-terminated)
     730             :  * buf: where to return the substring (must be long enough)
     731             :  * *bytelen: receives byte length of the found substring
     732             :  * *charlen: receives character length of the found substring
     733             :  *
     734             :  * Returns pointer to end+1 of the found substring in the source string.
     735             :  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
     736             :  *
     737             :  * If the found word is bounded by non-word characters or string boundaries
     738             :  * then this function will include corresponding padding spaces into buf.
     739             :  */
     740             : static const char *
     741         110 : get_wildcard_part(const char *str, int lenstr,
     742             :                   char *buf, int *bytelen, int *charlen)
     743             : {
     744         110 :     const char *beginword = str;
     745             :     const char *endword;
     746         110 :     char       *s = buf;    /* write cursor into buf */
     747         110 :     bool        in_leading_wildcard_meta = false;
     748         110 :     bool        in_trailing_wildcard_meta = false;
     749         110 :     bool        in_escape = false;
     750             :     int         clen;   /* byte length of the character at endword */
     751             : 
     752             :     /*
     753             :      * Find the first word character, remembering whether preceding character
     754             :      * was wildcard meta-character.  Note that the in_escape state persists
     755             :      * from this loop to the next one, since we may exit at a word character
     756             :      * that is in_escape.
     757             :      */
     758         336 :     while (beginword - str < lenstr)
     759             :     {
     760         180 :         if (in_escape)
     761             :         {
     762           6 :             if (ISWORDCHR(beginword))   /* an escaped word character starts the word */
     763             :                 break;
     764           0 :             in_escape = false;  /* escaped non-word character: skip over it */
     765           0 :             in_leading_wildcard_meta = false;
     766             :         }
     767             :         else
     768             :         {
     769         174 :             if (ISESCAPECHAR(beginword))
     770           6 :                 in_escape = true;
     771         168 :             else if (ISWILDCARDCHAR(beginword))
     772          80 :                 in_leading_wildcard_meta = true;
     773          88 :             else if (ISWORDCHR(beginword))
     774             :                 break;
     775             :             else
     776          30 :                 in_leading_wildcard_meta = false;
     777             :         }
     778         116 :         beginword += pg_mblen(beginword);
     779             :     }
     780             : 
     781             :     /*
     782             :      * Handle string end.
     783             :      */
     784         110 :     if (beginword - str >= lenstr)
     785          46 :         return NULL;    /* no word character left in the input */
     786             : 
     787             :     /*
     788             :      * Add left padding spaces if preceding character wasn't wildcard
     789             :      * meta-character.
     790             :      */
     791          64 :     *charlen = 0;
     792          64 :     if (!in_leading_wildcard_meta)
     793             :     {
     794             :         if (LPADDING > 0)
     795             :         {
     796          30 :             *s++ = ' ';
     797          30 :             (*charlen)++;
     798             :             if (LPADDING > 1)
     799             :             {
     800          30 :                 *s++ = ' ';
     801          30 :                 (*charlen)++;
     802             :             }
     803             :         }
     804             :     }
     805             : 
     806             :     /*
     807             :      * Copy data into buf until wildcard meta-character, non-word character or
     808             :      * string boundary.  Strip escapes during copy.
     809             :      */
     810          64 :     endword = beginword;
     811         296 :     while (endword - str < lenstr)
     812             :     {
     813         232 :         clen = pg_mblen(endword);
     814         232 :         if (in_escape)
     815             :         {
     816           6 :             if (ISWORDCHR(endword))
     817             :             {
     818           6 :                 memcpy(s, endword, clen);
     819           6 :                 (*charlen)++;
     820           6 :                 s += clen;
     821             :             }
     822             :             else
     823             :             {
     824             :                 /*
     825             :                  * Back up endword to the escape character when stopping at an
     826             :                  * escaped char, so that subsequent get_wildcard_part will
     827             :                  * restart from the escape character.  We assume here that
     828             :                  * escape chars are single-byte.
     829             :                  */
     830           0 :                 endword--;
     831           0 :                 break;
     832             :             }
     833           6 :             in_escape = false;
     834             :         }
     835             :         else
     836             :         {
     837         226 :             if (ISESCAPECHAR(endword))
     838           0 :                 in_escape = true;   /* the escape character itself is not copied */
     839         226 :             else if (ISWILDCARDCHAR(endword))
     840             :             {
     841          46 :                 in_trailing_wildcard_meta = true;
     842          46 :                 break;
     843             :             }
     844         180 :             else if (ISWORDCHR(endword))
     845             :             {
     846         162 :                 memcpy(s, endword, clen);
     847         162 :                 (*charlen)++;
     848         162 :                 s += clen;
     849             :             }
     850             :             else
     851             :                 break;  /* unescaped non-word character ends the word */
     852             :         }
     853         168 :         endword += clen;
     854             :     }
     855             : 
     856             :     /*
     857             :      * Add right padding spaces if next character isn't wildcard
     858             :      * meta-character.
     859             :      */
     860          64 :     if (!in_trailing_wildcard_meta)
     861             :     {
     862             :         if (RPADDING > 0)
     863             :         {
     864          18 :             *s++ = ' ';
     865          18 :             (*charlen)++;
     866             :             if (RPADDING > 1)
     867             :             {
     868             :                 *s++ = ' ';
     869             :                 (*charlen)++;
     870             :             }
     871             :         }
     872             :     }
     873             : 
     874          64 :     *bytelen = s - buf; /* bytes written to buf, padding included */
     875          64 :     return endword;
     876             : }
     877             : 
     878             : /*
     879             :  * Generates trigrams for wildcard search string.
     880             :  *
     881             :  * Returns array of trigrams that must occur in any string that matches the
     882             :  * wildcard string.  For example, given pattern "a%bcd%" the trigrams
     883             :  * " a", "bcd" would be extracted.
     884             :  */
     885             : TRGM *
     886          46 : generate_wildcard_trgm(const char *str, int slen)
     887             : {
     888             :     TRGM       *trg;
     889             :     char       *buf,
     890             :                *buf2;
     891             :     trgm       *tptr;
     892             :     int         len,
     893             :                 charlen,
     894             :                 bytelen;
     895             :     const char *eword;
     896             : 
     897          46 :     protect_out_of_mem(slen);
     898             : 
     899          46 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
     900          46 :     trg->flag = ARRKEY;
     901          46 :     SET_VARSIZE(trg, TRGMHDRSIZE);  /* header-only size, used as-is by the early returns below */
     902             : 
     903          46 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
     904           0 :         return trg;
     905             : 
     906          46 :     tptr = GETARR(trg);
     907             : 
     908             :     /* Allocate a buffer for blank-padded, but not yet case-folded, words */
     909          46 :     buf = palloc(sizeof(char) * (slen + 4));    /* get_wildcard_part adds at most 2 + 2 padding spaces */
     910             : 
     911             :     /*
     912             :      * Extract trigrams from each substring extracted by get_wildcard_part.
     913             :      */
     914          46 :     eword = str;
     915         156 :     while ((eword = get_wildcard_part(eword, slen - (eword - str),
     916             :                                       buf, &bytelen, &charlen)) != NULL)
     917             :     {
     918             : #ifdef IGNORECASE
     919          64 :         buf2 = lowerstr_with_len(buf, bytelen);
     920          64 :         bytelen = strlen(buf2); /* recomputed: presumably lowering can change the byte length */
     921             : #else
     922             :         buf2 = buf;
     923             : #endif
     924             : 
     925             :         /*
     926             :          * count trigrams
     927             :          */
     928          64 :         tptr = make_trigrams(tptr, buf2, bytelen, charlen);
     929             : 
     930             : #ifdef IGNORECASE
     931          64 :         pfree(buf2);
     932             : #endif
     933             :     }
     934             : 
     935          46 :     pfree(buf);
     936             : 
     937          46 :     if ((len = tptr - GETARR(trg)) == 0)    /* no trigrams were produced */
     938           0 :         return trg;
     939             : 
     940             :     /*
     941             :      * Make trigrams unique.
     942             :      */
     943          46 :     if (len > 1)
     944             :     {
     945          18 :         qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
     946          18 :         len = unique_array(GETARR(trg), len);
     947             :     }
     948             : 
     949          46 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     950             : 
     951          46 :     return trg;
     952             : }
     953             : /* Pack the three bytes at ptr into a uint32, first byte in the highest position. */
     954             : uint32
     955       69000 : trgm2int(trgm *ptr)
     956             : {
     957       69000 :     uint32      val = 0;
     958             : 
     959       69000 :     val |= *(((unsigned char *) ptr));
     960       69000 :     val <<= 8;
     961       69000 :     val |= *(((unsigned char *) ptr) + 1);
     962       69000 :     val <<= 8;
     963       69000 :     val |= *(((unsigned char *) ptr) + 2); /* val is now byte0 << 16 | byte1 << 8 | byte2 */
     964             : 
     965       69000 :     return val;
     966             : }
     967             : /* Return the trigrams generated from the text argument as an array of text items. */
     968             : Datum
     969          14 : show_trgm(PG_FUNCTION_ARGS)
     970             : {
     971          14 :     text       *in = PG_GETARG_TEXT_PP(0);
     972             :     TRGM       *trg;
     973             :     Datum      *d;
     974             :     ArrayType  *a;
     975             :     trgm       *ptr;
     976             :     int         i;
     977             : 
     978          14 :     trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
     979          14 :     d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));    /* one slot per trigram, plus one spare */
     980             : 
     981          88 :     for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
     982             :     {
     983          74 :         text       *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
     984             : 
     985          74 :         if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
     986             :         {
     987           0 :             snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));   /* shown as "0x" plus hex digits */
     988           0 :             SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
     989             :         }
     990             :         else
     991             :         {
     992          74 :             SET_VARSIZE(item, VARHDRSZ + 3);    /* payload is exactly 3 bytes */
     993          74 :             CPTRGM(VARDATA(item), ptr);
     994             :         }
     995          74 :         d[i] = PointerGetDatum(item);
     996             :     }
     997             : 
     998          14 :     a = construct_array(
     999             :                         d,
    1000          14 :                         ARRNELEM(trg),
    1001             :                         TEXTOID,
    1002             :                         -1,
    1003             :                         false,
    1004             :                         'i'
    1005             :         );
    1006             : 
    1007          88 :     for (i = 0; i < ARRNELEM(trg); i++) /* the per-item text values are released once the array exists */
    1008          74 :         pfree(DatumGetPointer(d[i]));
    1009             : 
    1010          14 :     pfree(d);
    1011          14 :     pfree(trg);
    1012          14 :     PG_FREE_IF_COPY(in, 0);
    1013             : 
    1014          14 :     PG_RETURN_POINTER(a);
    1015             : }
    1016             : /* Count trigrams common to trg1 and trg2 (merge-style walk; assumes both arrays are sorted) and feed the count to CALCSML. */
    1017             : float4
    1018      128810 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
    1019             : {
    1020             :     trgm       *ptr1,
    1021             :                *ptr2;
    1022      128810 :     int         count = 0;  /* number of trigrams found in both arrays */
    1023             :     int         len1,
    1024             :                 len2;
    1025             : 
    1026      128810 :     ptr1 = GETARR(trg1);
    1027      128810 :     ptr2 = GETARR(trg2);
    1028             : 
    1029      128810 :     len1 = ARRNELEM(trg1);
    1030      128810 :     len2 = ARRNELEM(trg2);
    1031             : 
    1032             :     /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
    1033      128810 :     if (len1 <= 0 || len2 <= 0)
    1034           2 :         return (float4) 0.0;
    1035             : 
    1036     1713170 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
    1037             :     {
    1038     1455554 :         int         res = CMPTRGM(ptr1, ptr2);
    1039             : 
    1040     1455554 :         if (res < 0)
    1041      314464 :             ptr1++; /* advance whichever side holds the smaller trigram */
    1042     1141090 :         else if (res > 0)
    1043      377188 :             ptr2++;
    1044             :         else
    1045             :         {
    1046      763902 :             ptr1++;
    1047      763902 :             ptr2++;
    1048      763902 :             count++;
    1049             :         }
    1050             :     }
    1051             : 
    1052             :     /*
    1053             :      * If inexact then len2 is equal to count, because we don't know actual
    1054             :      * length of second string in inexact search and we can assume that count
    1055             :      * is a lower bound of len2.
    1056             :      */
    1057      128808 :     return CALCSML(count, len1, inexact ? count : len2);
    1058             : }
    1059             : 
    1060             : 
    1061             : /*
    1062             :  * Returns whether trg2 contains all trigrams in trg1.
    1063             :  * This relies on the trigram arrays being sorted.
    1064             :  */
    1065             : bool
    1066          56 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
    1067             : {
    1068             :     trgm       *ptr1,
    1069             :                *ptr2;
    1070             :     int         len1,
    1071             :                 len2;
    1072             : 
    1073          56 :     ptr1 = GETARR(trg1);
    1074          56 :     ptr2 = GETARR(trg2);
    1075             : 
    1076          56 :     len1 = ARRNELEM(trg1);
    1077          56 :     len2 = ARRNELEM(trg2);
    1078             : 
    1079         356 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
    1080             :     {
    1081         278 :         int         res = CMPTRGM(ptr1, ptr2);
    1082             : 
    1083         278 :         if (res < 0)
    1084          34 :             return false;   /* trg1's trigram sorts before everything left in trg2, so it is missing */
    1085         244 :         else if (res > 0)
    1086         206 :             ptr2++;
    1087             :         else
    1088             :         {
    1089          38 :             ptr1++;
    1090          38 :             ptr2++;
    1091             :         }
    1092             :     }
    1093          22 :     if (ptr1 - GETARR(trg1) < len1) /* trg2 ran out before all of trg1 was matched */
    1094           8 :         return false;
    1095             :     else
    1096          14 :         return true;
    1097             : }
    1098             : 
    1099             : /*
    1100             :  * Return a palloc'd boolean array showing, for each trigram in "query",
    1101             :  * whether it is present in the trigram array "key".
    1102             :  * This relies on the "key" array being sorted, but "query" need not be.
    1103             :  */
    1104             : bool *
    1105        2120 : trgm_presence_map(TRGM *query, TRGM *key)
    1106             : {
    1107             :     bool       *result;
    1108        2120 :     trgm       *ptrq = GETARR(query),
    1109        2120 :                *ptrk = GETARR(key);
    1110        2120 :     int         lenq = ARRNELEM(query),
    1111        2120 :                 lenk = ARRNELEM(key),
    1112             :                 i;
    1113             : 
    1114        2120 :     result = (bool *) palloc0(lenq * sizeof(bool)); /* zero-filled: entries stay false unless a match is found */
    1115             : 
    1116             :     /* for each query trigram, do a binary search in the key array */
    1117      507248 :     for (i = 0; i < lenq; i++)
    1118             :     {
    1119      505128 :         int         lo = 0;
    1120      505128 :         int         hi = lenk;  /* search covers the half-open range [lo, hi) */
    1121             : 
    1122     2877658 :         while (lo < hi)
    1123             :         {
    1124     1875468 :             int         mid = (lo + hi) / 2;
    1125     1875468 :             int         res = CMPTRGM(ptrq, ptrk + mid);
    1126             : 
    1127     1875468 :             if (res < 0)
    1128      783390 :                 hi = mid;
    1129     1092078 :             else if (res > 0)
    1130     1084012 :                 lo = mid + 1;
    1131             :             else
    1132             :             {
    1133        8066 :                 result[i] = true;
    1134        8066 :                 break;
    1135             :             }
    1136             :         }
    1137      505128 :         ptrq++; /* move on to the next query trigram */
    1138             :     }
    1139             : 
    1140        2120 :     return result;
    1141             : }
    1142             : /* Return (as float4) cnt_sml() of the trigram arrays generated from the two text arguments. */
    1143             : Datum
    1144       60678 : similarity(PG_FUNCTION_ARGS)
    1145             : {
    1146       60678 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1147       60678 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1148             :     TRGM       *trg1,
    1149             :                *trg2;
    1150             :     float4      res;
    1151             : 
    1152       60678 :     trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
    1153       60678 :     trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
    1154             : 
    1155       60678 :     res = cnt_sml(trg1, trg2, false);   /* inexact = false: trg2's real length is used */
    1156             : 
    1157       60678 :     pfree(trg1);
    1158       60678 :     pfree(trg2);
    1159       60678 :     PG_FREE_IF_COPY(in1, 0);
    1160       60678 :     PG_FREE_IF_COPY(in2, 1);
    1161             : 
    1162       60678 :     PG_RETURN_FLOAT4(res);
    1163             : }
    1164             : /* Return (as float4) calc_word_similarity(arg 0, arg 1) with no flags set. */
    1165             : Datum
    1166        1804 : word_similarity(PG_FUNCTION_ARGS)
    1167             : {
    1168        1804 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1169        1804 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1170             :     float4      res;
    1171             : 
    1172        7216 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1173        7216 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1174             :                                0);  /* no flags */
    1175             : 
    1176        1804 :     PG_FREE_IF_COPY(in1, 0);
    1177        1804 :     PG_FREE_IF_COPY(in2, 1);
    1178        1804 :     PG_RETURN_FLOAT4(res);
    1179             : }
    1180             : /* Return (as float4) calc_word_similarity(arg 0, arg 1) with WORD_SIMILARITY_STRICT set. */
    1181             : Datum
    1182        1764 : strict_word_similarity(PG_FUNCTION_ARGS)
    1183             : {
    1184        1764 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1185        1764 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1186             :     float4      res;
    1187             : 
    1188        7056 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1189        7056 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1190             :                                WORD_SIMILARITY_STRICT);
    1191             : 
    1192        1764 :     PG_FREE_IF_COPY(in1, 0);
    1193        1764 :     PG_FREE_IF_COPY(in2, 1);
    1194        1764 :     PG_RETURN_FLOAT4(res);
    1195             : }
    1196             : /* Return (as float4) 1.0 minus similarity() of the two arguments. */
    1197             : Datum
    1198        2004 : similarity_dist(PG_FUNCTION_ARGS)
    1199             : {
    1200        2004 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
    1201             :                                                          PG_GETARG_DATUM(0),
    1202             :                                                          PG_GETARG_DATUM(1)));
    1203             : 
    1204        2004 :     PG_RETURN_FLOAT4(1.0 - res);    /* distance = 1 - similarity */
    1205             : }
    1206             : /* Return (as bool) whether similarity() of the two arguments is >= similarity_threshold. */
    1207             : Datum
    1208       12000 : similarity_op(PG_FUNCTION_ARGS)
    1209             : {
    1210       12000 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
    1211             :                                                          PG_GETARG_DATUM(0),
    1212             :                                                          PG_GETARG_DATUM(1)));
    1213             : 
    1214       12000 :     PG_RETURN_BOOL(res >= similarity_threshold);
    1215             : }
    1216             : /* Return (as bool) whether calc_word_similarity(arg 0, arg 1) is >= word_similarity_threshold. */
    1217             : Datum
    1218        3794 : word_similarity_op(PG_FUNCTION_ARGS)
    1219             : {
    1220        3794 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1221        3794 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1222             :     float4      res;
    1223             : 
    1224       15176 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1225       15176 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1226             :                                WORD_SIMILARITY_CHECK_ONLY); /* only the threshold test below uses res */
    1227             : 
    1228        3794 :     PG_FREE_IF_COPY(in1, 0);
    1229        3794 :     PG_FREE_IF_COPY(in2, 1);
    1230        3794 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
    1231             : }
    1232             : /* Same test as word_similarity_op, but with the arguments swapped: arg 1 is passed as str1 and arg 0 as str2. */
    1233             : Datum
    1234        3902 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
    1235             : {
    1236        3902 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1237        3902 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1238             :     float4      res;
    1239             : 
    1240       15608 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1241       15608 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1242             :                                WORD_SIMILARITY_CHECK_ONLY); /* only the threshold test below uses res */
    1243             : 
    1244        3902 :     PG_FREE_IF_COPY(in1, 0);
    1245        3902 :     PG_FREE_IF_COPY(in2, 1);
    1246        3902 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
    1247             : }
    1248             : /* Return (as float4) 1.0 minus calc_word_similarity(arg 0, arg 1) with no flags set. */
    1249             : Datum
    1250           0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
    1251             : {
    1252           0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1253           0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1254             :     float4      res;
    1255             : 
    1256           0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1257           0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1258             :                                0);  /* no flags */
    1259             : 
    1260           0 :     PG_FREE_IF_COPY(in1, 0);
    1261           0 :     PG_FREE_IF_COPY(in2, 1);
    1262           0 :     PG_RETURN_FLOAT4(1.0 - res);    /* distance = 1 - word similarity */
    1263             : }
    1264             : /* Same distance as word_similarity_dist_op, but with the arguments swapped: arg 1 is passed as str1 and arg 0 as str2. */
    1265             : Datum
    1266        1428 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
    1267             : {
    1268        1428 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1269        1428 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1270             :     float4      res;
    1271             : 
    1272        5712 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1273        5712 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1274             :                                0);  /* no flags */
    1275             : 
    1276        1428 :     PG_FREE_IF_COPY(in1, 0);
    1277        1428 :     PG_FREE_IF_COPY(in2, 1);
    1278        1428 :     PG_RETURN_FLOAT4(1.0 - res);    /* distance = 1 - word similarity */
    1279             : }
    1280             : /* Return (as bool) whether strict-mode calc_word_similarity(arg 0, arg 1) is >= strict_word_similarity_threshold. */
    1281             : Datum
    1282        4898 : strict_word_similarity_op(PG_FUNCTION_ARGS)
    1283             : {
    1284        4898 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1285        4898 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1286             :     float4      res;
    1287             : 
    1288       19592 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1289       19592 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1290             :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
    1291             : 
    1292        4898 :     PG_FREE_IF_COPY(in1, 0);
    1293        4898 :     PG_FREE_IF_COPY(in2, 1);
    1294        4898 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
    1295             : }
    1296             : /* Commutator: true when calc_word_similarity(arg 1, arg 0) reaches strict_word_similarity_threshold. */
    1297             : Datum
    1298        5222 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
    1299             : {
    1300        5222 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1301        5222 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1302             :     float4      res;
    1303             :     /* Note the swapped order: in2 first, in1 second; CHECK_ONLY and STRICT flags are set. */
    1304       20888 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1305       20888 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1306             :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
    1307             :     /* Release any argument copies, then compare against the GUC threshold (>=). */
    1308        5222 :     PG_FREE_IF_COPY(in1, 0);
    1309        5222 :     PG_FREE_IF_COPY(in2, 1);
    1310        5222 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
    1311             : }
    1312             : /* Strict distance operator: returns 1.0 - calc_word_similarity(arg 0, arg 1) with only the STRICT flag set. */
    1313             : Datum
    1314           0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
    1315             : {
    1316           0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1317           0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1318             :     float4      res;
    1319             :     /* Arguments are forwarded in call order; WORD_SIMILARITY_CHECK_ONLY is not set here. */
    1320           0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1321           0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1322             :                                WORD_SIMILARITY_STRICT);
    1323             :     /* Release any argument copies, then turn the similarity into a distance. */
    1324           0 :     PG_FREE_IF_COPY(in1, 0);
    1325           0 :     PG_FREE_IF_COPY(in2, 1);
    1326           0 :     PG_RETURN_FLOAT4(1.0 - res);
    1327             : }
    1328             : /* Commutator of the strict distance operator: returns 1.0 - calc_word_similarity(arg 1, arg 0), STRICT flag only. */
    1329             : Datum
    1330        1440 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
    1331             : {
    1332        1440 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1333        1440 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1334             :     float4      res;
    1335             :     /* Note the swapped order: in2 first, in1 second; WORD_SIMILARITY_CHECK_ONLY is not set here. */
    1336        5760 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1337        5760 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1338             :                                WORD_SIMILARITY_STRICT);
    1339             :     /* Release any argument copies, then turn the similarity into a distance. */
    1340        1440 :     PG_FREE_IF_COPY(in1, 0);
    1341        1440 :     PG_FREE_IF_COPY(in2, 1);
    1342        1440 :     PG_RETURN_FLOAT4(1.0 - res);
    1343             : }

Generated by: LCOV version 1.13