LCOV - code coverage report
Current view: top level - contrib/pg_trgm - trgm_op.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 445 486 91.6 %
Date: 2025-01-18 04:15:08 Functions: 50 52 96.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * contrib/pg_trgm/trgm_op.c
       3             :  */
       4             : #include "postgres.h"
       5             : 
       6             : #include <ctype.h>
       7             : 
       8             : #include "catalog/pg_collation_d.h"
       9             : #include "catalog/pg_type.h"
      10             : #include "common/int.h"
      11             : #include "lib/qunique.h"
      12             : #include "miscadmin.h"
      13             : #include "trgm.h"
      14             : #include "tsearch/ts_locale.h"
      15             : #include "utils/formatting.h"
      16             : #include "utils/guc.h"
      17             : #include "utils/lsyscache.h"
      18             : #include "utils/memutils.h"
      19             : #include "utils/pg_crc.h"
      20             : 
      21           6 : PG_MODULE_MAGIC;
      22             : 
      23             : /* GUC variables */
      24             : double      similarity_threshold = 0.3f;
      25             : double      word_similarity_threshold = 0.6f;
      26             : double      strict_word_similarity_threshold = 0.5f;
      27             : 
      28           4 : PG_FUNCTION_INFO_V1(set_limit);
      29           4 : PG_FUNCTION_INFO_V1(show_limit);
      30           4 : PG_FUNCTION_INFO_V1(show_trgm);
      31           4 : PG_FUNCTION_INFO_V1(similarity);
      32           4 : PG_FUNCTION_INFO_V1(word_similarity);
      33           4 : PG_FUNCTION_INFO_V1(strict_word_similarity);
      34           4 : PG_FUNCTION_INFO_V1(similarity_dist);
      35           4 : PG_FUNCTION_INFO_V1(similarity_op);
      36           4 : PG_FUNCTION_INFO_V1(word_similarity_op);
      37           4 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
      38           2 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
      39           4 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
      40           4 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
      41           4 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
      42           2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
      43           4 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
      44             : 
      45             : /* Trigram with position */
      46             : typedef struct
      47             : {
      48             :     trgm        trg;
      49             :     int         index;
      50             : } pos_trgm;
      51             : 
      52             : /* Trigram bound type */
      53             : typedef uint8 TrgmBound;
      54             : #define TRGM_BOUND_LEFT             0x01    /* trigram is left bound of word */
      55             : #define TRGM_BOUND_RIGHT            0x02    /* trigram is right bound of word */
      56             : 
      57             : /* Word similarity flags */
      58             : #define WORD_SIMILARITY_CHECK_ONLY  0x01    /* only check existence of similar
      59             :                                              * search pattern in text */
      60             : #define WORD_SIMILARITY_STRICT      0x02    /* force bounds of extent to match
      61             :                                              * word bounds */
      62             : 
      63             : /*
      64             :  * Module load callback
      65             :  */
      66             : void
      67           6 : _PG_init(void)
      68             : {
      69             :     /* Define custom GUC variables. */
      70           6 :     DefineCustomRealVariable("pg_trgm.similarity_threshold",
      71             :                              "Sets the threshold used by the % operator.",
      72             :                              "Valid range is 0.0 .. 1.0.",
      73             :                              &similarity_threshold,
      74             :                              0.3f,
      75             :                              0.0,
      76             :                              1.0,
      77             :                              PGC_USERSET,
      78             :                              0,
      79             :                              NULL,
      80             :                              NULL,
      81             :                              NULL);
      82           6 :     DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
      83             :                              "Sets the threshold used by the <% operator.",
      84             :                              "Valid range is 0.0 .. 1.0.",
      85             :                              &word_similarity_threshold,
      86             :                              0.6f,
      87             :                              0.0,
      88             :                              1.0,
      89             :                              PGC_USERSET,
      90             :                              0,
      91             :                              NULL,
      92             :                              NULL,
      93             :                              NULL);
      94           6 :     DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
      95             :                              "Sets the threshold used by the <<% operator.",
      96             :                              "Valid range is 0.0 .. 1.0.",
      97             :                              &strict_word_similarity_threshold,
      98             :                              0.5f,
      99             :                              0.0,
     100             :                              1.0,
     101             :                              PGC_USERSET,
     102             :                              0,
     103             :                              NULL,
     104             :                              NULL,
     105             :                              NULL);
     106             : 
     107           6 :     MarkGUCPrefixReserved("pg_trgm");
     108           6 : }
     109             : 
     110             : /*
     111             :  * Deprecated function.
     112             :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
     113             :  */
     114             : Datum
     115           4 : set_limit(PG_FUNCTION_ARGS)
     116             : {
     117           4 :     float4      nlimit = PG_GETARG_FLOAT4(0);
     118             :     char       *nlimit_str;
     119             :     Oid         func_out_oid;
     120             :     bool        is_varlena;
     121             : 
     122           4 :     getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
     123             : 
     124           4 :     nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
     125             : 
     126           4 :     SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
     127             :                     PGC_USERSET, PGC_S_SESSION);
     128             : 
     129           4 :     PG_RETURN_FLOAT4(similarity_threshold);
     130             : }
     131             : 
     132             : 
     133             : /*
     134             :  * Get similarity threshold for given index scan strategy number.
     135             :  */
     136             : double
     137       85720 : index_strategy_get_limit(StrategyNumber strategy)
     138             : {
     139       85720 :     switch (strategy)
     140             :     {
     141       64112 :         case SimilarityStrategyNumber:
     142       64112 :             return similarity_threshold;
     143        9644 :         case WordSimilarityStrategyNumber:
     144        9644 :             return word_similarity_threshold;
     145       11964 :         case StrictWordSimilarityStrategyNumber:
     146       11964 :             return strict_word_similarity_threshold;
     147           0 :         default:
     148           0 :             elog(ERROR, "unrecognized strategy number: %d", strategy);
     149             :             break;
     150             :     }
     151             : 
     152             :     return 0.0;                 /* keep compiler quiet */
     153             : }
     154             : 
     155             : /*
     156             :  * Deprecated function.
     157             :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
     158             :  */
     159             : Datum
     160       40000 : show_limit(PG_FUNCTION_ARGS)
     161             : {
     162       40000 :     PG_RETURN_FLOAT4(similarity_threshold);
     163             : }
     164             : 
     165             : static int
     166     6374440 : comp_trgm(const void *a, const void *b)
     167             : {
     168     6374440 :     return CMPTRGM(a, b);
     169             : }
     170             : 
     171             : /*
     172             :  * Finds the first word in the string and returns a pointer to it;
     173             :  * *endword is set to point to the character just after the word.
     174             :  */
     175             : static char *
     176      482822 : find_word(char *str, int lenstr, char **endword, int *charlen)
     177             : {
     178      482822 :     char       *beginword = str;
     179             : 
     180      510150 :     while (beginword - str < lenstr && !ISWORDCHR(beginword))
     181       27328 :         beginword += pg_mblen(beginword);
     182             : 
     183      482822 :     if (beginword - str >= lenstr)
     184      228160 :         return NULL;
     185             : 
     186      254662 :     *endword = beginword;
     187      254662 :     *charlen = 0;
     188     2199358 :     while (*endword - str < lenstr && ISWORDCHR(*endword))
     189             :     {
     190     1944696 :         *endword += pg_mblen(*endword);
     191     1944696 :         (*charlen)++;
     192             :     }
     193             : 
     194      254662 :     return beginword;
     195             : }
     196             : 
     197             : /*
     198             :  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
     199             :  * which is always exactly three bytes.  If we have three single-byte
     200             :  * characters, we just use them as-is; otherwise we form a hash value.
     201             :  */
     202             : void
     203        2918 : compact_trigram(trgm *tptr, char *str, int bytelen)
     204             : {
     205        2918 :     if (bytelen == 3)
     206             :     {
     207        2918 :         CPTRGM(tptr, str);
     208             :     }
     209             :     else
     210             :     {
     211             :         pg_crc32    crc;
     212             : 
     213           0 :         INIT_LEGACY_CRC32(crc);
     214           0 :         COMP_LEGACY_CRC32(crc, str, bytelen);
     215           0 :         FIN_LEGACY_CRC32(crc);
     216             : 
     217             :         /*
     218             :          * use only 3 upper bytes from crc; hopefully that is good enough hashing
     219             :          */
     220           0 :         CPTRGM(tptr, &crc);
     221             :     }
     222        2918 : }
     223             : 
     224             : /*
     225             :  * Adds trigrams from words (already padded).
     226             :  */
     227             : static trgm *
     228      254790 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
     229             : {
     230      254790 :     char       *ptr = str;
     231             : 
     232      254790 :     if (charlen < 3)
     233          54 :         return tptr;
     234             : 
     235      254736 :     if (bytelen > charlen)
     236             :     {
     237             :         /* Find multibyte character boundaries and apply compact_trigram */
     238           0 :         int         lenfirst = pg_mblen(str),
     239           0 :                     lenmiddle = pg_mblen(str + lenfirst),
     240           0 :                     lenlast = pg_mblen(str + lenfirst + lenmiddle);
     241             : 
     242           0 :         while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
     243             :         {
     244           0 :             compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
     245             : 
     246           0 :             ptr += lenfirst;
     247           0 :             tptr++;
     248             : 
     249           0 :             lenfirst = lenmiddle;
     250           0 :             lenmiddle = lenlast;
     251           0 :             lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
     252             :         }
     253             :     }
     254             :     else
     255             :     {
     256             :         /* Fast path when there are no multibyte characters */
     257             :         Assert(bytelen == charlen);
     258             : 
     259     2454276 :         while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
     260             :         {
     261     2199540 :             CPTRGM(tptr, ptr);
     262     2199540 :             ptr++;
     263     2199540 :             tptr++;
     264             :         }
     265             :     }
     266             : 
     267      254736 :     return tptr;
     268             : }
     269             : 
     270             : /*
     271             :  * Make array of trigrams without sorting and removing duplicate items.
     272             :  *
     273             :  * trg: where to return the array of trigrams.
     274             :  * str: source string, of length slen bytes.
     275             :  * bounds: where to return bounds of trigrams (if needed).
     276             :  *
     277             :  * Returns length of the generated array.
     278             :  */
     279             : static int
     280      228162 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
     281             : {
     282             :     trgm       *tptr;
     283             :     char       *buf;
     284             :     int         charlen,
     285             :                 bytelen;
     286             :     char       *bword,
     287             :                *eword;
     288             : 
     289      228162 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
     290           2 :         return 0;
     291             : 
     292      228160 :     tptr = trg;
     293             : 
     294             :     /* Allocate a buffer for case-folded, blank-padded words */
     295      228160 :     buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
     296             : 
     297             :     if (LPADDING > 0)
     298             :     {
     299      228160 :         *buf = ' ';
     300             :         if (LPADDING > 1)
     301      228160 :             *(buf + 1) = ' ';
     302             :     }
     303             : 
     304      228160 :     eword = str;
     305      482822 :     while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
     306             :     {
     307             : #ifdef IGNORECASE
     308      254662 :         bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
     309      254662 :         bytelen = strlen(bword);
     310             : #else
     311             :         bytelen = eword - bword;
     312             : #endif
     313             : 
     314      254662 :         memcpy(buf + LPADDING, bword, bytelen);
     315             : 
     316             : #ifdef IGNORECASE
     317      254662 :         pfree(bword);
     318             : #endif
     319             : 
     320      254662 :         buf[LPADDING + bytelen] = ' ';
     321      254662 :         buf[LPADDING + bytelen + 1] = ' ';
     322             : 
     323             :         /* Calculate trigrams marking their bounds if needed */
     324      254662 :         if (bounds)
     325       24796 :             bounds[tptr - trg] |= TRGM_BOUND_LEFT;
     326      254662 :         tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
     327             :                              charlen + LPADDING + RPADDING);
     328      254662 :         if (bounds)
     329       24796 :             bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
     330             :     }
     331             : 
     332      228160 :     pfree(buf);
     333             : 
     334      228160 :     return tptr - trg;
     335             : }
     336             : 
     337             : /*
     338             :  * Guard against possible overflow in the palloc requests below.  (We
     339             :  * don't worry about the additive constants, since palloc can detect
     340             :  * requests that are a little above MaxAllocSize --- we just need to
     341             :  * prevent integer overflow in the multiplications.)
     342             :  */
     343             : static void
     344      204020 : protect_out_of_mem(int slen)
     345             : {
     346      204020 :     if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
     347      204020 :         (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
     348           0 :         ereport(ERROR,
     349             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     350             :                  errmsg("out of memory")));
     351      204020 : }
     352             : 
     353             : /*
     354             :  * Make array of trigrams with sorting and removing duplicate items.
     355             :  *
     356             :  * str: source string, of length slen bytes.
     357             :  *
     358             :  * Returns the sorted array of unique trigrams.
     359             :  */
     360             : TRGM *
     361      179658 : generate_trgm(char *str, int slen)
     362             : {
     363             :     TRGM       *trg;
     364             :     int         len;
     365             : 
     366      179658 :     protect_out_of_mem(slen);
     367             : 
     368      179658 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
     369      179658 :     trg->flag = ARRKEY;
     370             : 
     371      179658 :     len = generate_trgm_only(GETARR(trg), str, slen, NULL);
     372      179658 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     373             : 
     374      179658 :     if (len == 0)
     375           8 :         return trg;
     376             : 
     377             :     /*
     378             :      * Make trigrams unique.
     379             :      */
     380      179650 :     if (len > 1)
     381             :     {
     382      179650 :         qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
     383      179650 :         len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
     384             :     }
     385             : 
     386      179650 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     387             : 
     388      179650 :     return trg;
     389             : }
     390             : 
     391             : /*
     392             :  * Make array of positional trigrams from two trigram arrays trg1 and trg2.
     393             :  *
     394             :  * trg1: trigram array of search pattern, of length len1. trg1 is the required
     395             :  *       word, whose positions don't matter and are replaced with -1.
     396             :  * trg2: trigram array of text, of length len2. trg2 is the haystack in which
     397             :  *       we search, so its positions have to be stored.
     398             :  *
     399             :  * Returns concatenated trigram array.
     400             :  */
     401             : static pos_trgm *
     402       24252 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
     403             : {
     404             :     pos_trgm   *result;
     405             :     int         i,
     406       24252 :                 len = len1 + len2;
     407             : 
     408       24252 :     result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
     409             : 
     410      241728 :     for (i = 0; i < len1; i++)
     411             :     {
     412      217476 :         memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
     413      217476 :         result[i].index = -1;
     414             :     }
     415             : 
     416      384434 :     for (i = 0; i < len2; i++)
     417             :     {
     418      360182 :         memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
     419      360182 :         result[i + len1].index = i;
     420             :     }
     421             : 
     422       24252 :     return result;
     423             : }
     424             : 
     425             : /*
     426             :  * Compare position trigrams: compare trigrams first and position second.
     427             :  */
     428             : static int
     429     2615470 : comp_ptrgm(const void *v1, const void *v2)
     430             : {
     431     2615470 :     const pos_trgm *p1 = (const pos_trgm *) v1;
     432     2615470 :     const pos_trgm *p2 = (const pos_trgm *) v2;
     433             :     int         cmp;
     434             : 
     435     2615470 :     cmp = CMPTRGM(p1->trg, p2->trg);
     436     2615470 :     if (cmp != 0)
     437     2536054 :         return cmp;
     438             : 
     439       79416 :     return pg_cmp_s32(p1->index, p2->index);
     440             : }
     441             : 
     442             : /*
     443             :  * Iterative search function which calculates maximum similarity with word in
     444             :  * the string. Maximum similarity is calculated only if the flag
     445             :  * WORD_SIMILARITY_CHECK_ONLY isn't set.
     446             :  *
     447             :  * trg2indexes: array which stores indexes of the array "found".
     448             :  * found: array which stores true or false values.
     449             :  * ulen1: count of unique trigrams of array "trg1".
     450             :  * len2: length of array "trg2" and array "trg2indexes".
     451             :  * len: length of the array "found".
     452             :  * flags: set of boolean flags parameterizing similarity calculation.
     453             :  * bounds: whether each trigram is left/right bound of word.
     454             :  *
     455             :  * Returns word similarity.
     456             :  */
     457             : static float4
     458       24252 : iterate_word_similarity(int *trg2indexes,
     459             :                         bool *found,
     460             :                         int ulen1,
     461             :                         int len2,
     462             :                         int len,
     463             :                         uint8 flags,
     464             :                         TrgmBound *bounds)
     465             : {
     466             :     int        *lastpos,
     467             :                 i,
     468       24252 :                 ulen2 = 0,
     469       24252 :                 count = 0,
     470       24252 :                 upper = -1,
     471             :                 lower;
     472             :     float4      smlr_cur,
     473       24252 :                 smlr_max = 0.0f;
     474             :     double      threshold;
     475             : 
     476             :     Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
     477             : 
     478             :     /* Select appropriate threshold */
     479       48504 :     threshold = (flags & WORD_SIMILARITY_STRICT) ?
     480       24252 :         strict_word_similarity_threshold :
     481             :         word_similarity_threshold;
     482             : 
     483             :     /*
     484             :      * Consider first trigram as initial lower bound for strict word
     485             :      * similarity, or initialize it later with first trigram present for plain
     486             :      * word similarity.
     487             :      */
     488       24252 :     lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
     489             : 
     490             :     /* Memorise last position of each trigram */
     491       24252 :     lastpos = (int *) palloc(sizeof(int) * len);
     492       24252 :     memset(lastpos, -1, sizeof(int) * len);
     493             : 
     494      367294 :     for (i = 0; i < len2; i++)
     495             :     {
     496             :         int         trgindex;
     497             : 
     498      346610 :         CHECK_FOR_INTERRUPTS();
     499             : 
     500             :         /* Get index of next trigram */
     501      346610 :         trgindex = trg2indexes[i];
     502             : 
     503             :         /* Update last position of this trigram */
     504      346610 :         if (lower >= 0 || found[trgindex])
     505             :         {
     506      271594 :             if (lastpos[trgindex] < 0)
     507             :             {
     508      267886 :                 ulen2++;
     509      267886 :                 if (found[trgindex])
     510       61512 :                     count++;
     511             :             }
     512      271594 :             lastpos[trgindex] = i;
     513             :         }
     514             : 
     515             :         /*
     516             :          * Adjust upper bound if trigram is upper bound of word for strict
     517             :          * word similarity, or if trigram is present in required substring for
     518             :          * plain word similarity
     519             :          */
     520      500704 :         if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
     521      154094 :             : found[trgindex])
     522             :         {
     523             :             int         prev_lower,
     524             :                         tmp_ulen2,
     525             :                         tmp_lower,
     526             :                         tmp_count;
     527             : 
     528       51274 :             upper = i;
     529       51274 :             if (lower == -1)
     530             :             {
     531        9390 :                 lower = i;
     532        9390 :                 ulen2 = 1;
     533             :             }
     534             : 
     535       51274 :             smlr_cur = CALCSML(count, ulen1, ulen2);
     536             : 
     537             :             /* Also try to adjust lower bound for greater similarity */
     538       51274 :             tmp_count = count;
     539       51274 :             tmp_ulen2 = ulen2;
     540       51274 :             prev_lower = lower;
     541      417206 :             for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
     542             :             {
     543             :                 float       smlr_tmp;
     544             :                 int         tmp_trgindex;
     545             : 
     546             :                 /*
     547             :                  * Adjust lower bound only if trigram is lower bound of word
     548             :                  * for strict word similarity, or consider every trigram as
     549             :                  * lower bound for plain word similarity.
     550             :                  */
     551      369500 :                 if (!(flags & WORD_SIMILARITY_STRICT)
     552      290354 :                     || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
     553             :                 {
     554      119410 :                     smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
     555      119410 :                     if (smlr_tmp > smlr_cur)
     556             :                     {
     557        7040 :                         smlr_cur = smlr_tmp;
     558        7040 :                         ulen2 = tmp_ulen2;
     559        7040 :                         lower = tmp_lower;
     560        7040 :                         count = tmp_count;
     561             :                     }
     562             : 
     563             :                     /*
     564             :                      * If we only check that word similarity is greater than
     565             :                      * threshold we do not need to calculate a maximum
     566             :                      * similarity.
     567             :                      */
     568      119410 :                     if ((flags & WORD_SIMILARITY_CHECK_ONLY)
     569       74228 :                         && smlr_cur >= threshold)
     570        3568 :                         break;
     571             :                 }
     572             : 
     573      365932 :                 tmp_trgindex = trg2indexes[tmp_lower];
     574      365932 :                 if (lastpos[tmp_trgindex] == tmp_lower)
     575             :                 {
     576      361408 :                     tmp_ulen2--;
     577      361408 :                     if (found[tmp_trgindex])
     578       93168 :                         tmp_count--;
     579             :                 }
     580             :             }
     581             : 
     582       51274 :             smlr_max = Max(smlr_max, smlr_cur);
     583             : 
     584             :             /*
     585             :              * If we only check whether word similarity reaches the threshold,
     586             :              * we do not need to calculate the maximum similarity.
     587             :              */
     588       51274 :             if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
     589        3568 :                 break;
     590             : 
     591       81224 :             for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
     592             :             {
     593             :                 int         tmp_trgindex;
     594             : 
     595       33518 :                 tmp_trgindex = trg2indexes[tmp_lower];
     596       33518 :                 if (lastpos[tmp_trgindex] == tmp_lower)
     597       32018 :                     lastpos[tmp_trgindex] = -1;
     598             :             }
     599             :         }
     600             :     }
     601             : 
     602       24252 :     pfree(lastpos);
     603             : 
     604       24252 :     return smlr_max;
     605             : }
     606             : 
     607             : /*
     608             :  * Calculate word similarity.
     609             :  * This function prepares two arrays: "trg2indexes" and "found". These arrays
     610             :  * are then used to calculate word similarity using iterate_word_similarity().
     611             :  *
     612             :  * "trg2indexes" is an array that stores indexes into the array "found".
     613             :  * In other words:
     614             :  * trg2indexes[j] = i;
     615             :  * found[i] = true (or false);
     616             :  * If found[i] == true then trigram trg2[j] is present in array "trg1".
     617             :  * If found[i] == false then trigram trg2[j] is not present in array "trg1".
     618             :  *
     619             :  * str1: search pattern string, of length slen1 bytes.
     620             :  * str2: text in which we are looking for a word, of length slen2 bytes.
     621             :  * flags: set of boolean flags parameterizing similarity calculation.
     622             :  *
     623             :  * Returns word similarity.
     624             :  */
     625             : static float4
     626       24252 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
     627             :                      uint8 flags)
     628             : {
     629             :     bool       *found;
     630             :     pos_trgm   *ptrg;
     631             :     trgm       *trg1;
     632             :     trgm       *trg2;
     633             :     int         len1,
     634             :                 len2,
     635             :                 len,
     636             :                 i,
     637             :                 j,
     638             :                 ulen1;
     639             :     int        *trg2indexes;
     640             :     float4      result;
     641             :     TrgmBound  *bounds;
     642             : 
     643       24252 :     protect_out_of_mem(slen1 + slen2);
     644             : 
     645             :     /* Make positional trigrams */
     646       24252 :     trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
     647       24252 :     trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
     648       24252 :     if (flags & WORD_SIMILARITY_STRICT)
     649       13324 :         bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
     650             :     else
     651       10928 :         bounds = NULL;
     652             : 
     653       24252 :     len1 = generate_trgm_only(trg1, str1, slen1, NULL);
     654       24252 :     len2 = generate_trgm_only(trg2, str2, slen2, bounds);
     655             : 
     656       24252 :     ptrg = make_positional_trgm(trg1, len1, trg2, len2);
     657       24252 :     len = len1 + len2;
     658       24252 :     qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
     659             : 
     660       24252 :     pfree(trg1);
     661       24252 :     pfree(trg2);
     662             : 
     663             :     /*
     664             :      * Merge positional trigrams array: enumerate each trigram and find its
     665             :      * presence in required word.
     666             :      */
     667       24252 :     trg2indexes = (int *) palloc(sizeof(int) * len2);
     668       24252 :     found = (bool *) palloc0(sizeof(bool) * len);
     669             : 
     670       24252 :     ulen1 = 0;
     671       24252 :     j = 0;
     672      601910 :     for (i = 0; i < len; i++)
     673             :     {
     674      577658 :         if (i > 0)
     675             :         {
     676      553406 :             int         cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
     677             : 
     678      553406 :             if (cmp != 0)
     679             :             {
     680      485002 :                 if (found[j])
     681      202278 :                     ulen1++;
     682      485002 :                 j++;
     683             :             }
     684             :         }
     685             : 
     686      577658 :         if (ptrg[i].index >= 0)
     687             :         {
     688      360182 :             trg2indexes[ptrg[i].index] = j;
     689             :         }
     690             :         else
     691             :         {
     692      217476 :             found[j] = true;
     693             :         }
     694             :     }
     695       24252 :     if (found[j])
     696       15198 :         ulen1++;
     697             : 
     698             :     /* Run iterative procedure to find maximum similarity with word */
     699       24252 :     result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
     700             :                                      flags, bounds);
     701             : 
     702       24252 :     pfree(trg2indexes);
     703       24252 :     pfree(found);
     704       24252 :     pfree(ptrg);
     705             : 
     706       24252 :     return result;
     707             : }
     708             : 
     709             : 
     710             : /*
     711             :  * Extract the next non-wildcard part of a search string, i.e. a word bounded
     712             :  * by '_' or '%' meta-characters, non-word characters or string end.
     713             :  *
     714             :  * str: source string, of length lenstr bytes (need not be null-terminated)
     715             :  * buf: where to return the substring (must be long enough)
     716             :  * *bytelen: receives byte length of the found substring
     717             :  * *charlen: receives character length of the found substring
     718             :  *
     719             :  * Returns pointer to end+1 of the found substring in the source string.
     720             :  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
     721             :  *
     722             :  * If the found word is bounded by non-word characters or string boundaries
     723             :  * then this function will include corresponding padding spaces into buf.
     724             :  */
     725             : static const char *
     726         238 : get_wildcard_part(const char *str, int lenstr,
     727             :                   char *buf, int *bytelen, int *charlen)
     728             : {
     729         238 :     const char *beginword = str;
     730             :     const char *endword;
     731         238 :     char       *s = buf;    /* write cursor into buf */
     732         238 :     bool        in_leading_wildcard_meta = false;
     733         238 :     bool        in_trailing_wildcard_meta = false;
     734         238 :     bool        in_escape = false;
     735             :     int         clen;
     736             : 
     737             :     /*
     738             :      * Find the first word character, remembering whether preceding character
     739             :      * was wildcard meta-character.  Note that the in_escape state persists
     740             :      * from this loop to the next one, since we may exit at a word character
     741             :      * that is in_escape.
     742             :      */
     743         482 :     while (beginword - str < lenstr)
     744             :     {
     745         372 :         if (in_escape)
     746             :         {
     747           6 :             if (ISWORDCHR(beginword))
     748           6 :                 break;
     749           0 :             in_escape = false;  /* escaped non-word character: drop the escape state ... */
     750           0 :             in_leading_wildcard_meta = false;   /* ... and it does not count as a wildcard */
     751             :         }
     752             :         else
     753             :         {
     754         366 :             if (ISESCAPECHAR(beginword))
     755           6 :                 in_escape = true;
     756         360 :             else if (ISWILDCARDCHAR(beginword))
     757         208 :                 in_leading_wildcard_meta = true;
     758         152 :             else if (ISWORDCHR(beginword))
     759         122 :                 break;
     760             :             else
     761          30 :                 in_leading_wildcard_meta = false;
     762             :         }
     763         244 :         beginword += pg_mblen(beginword);   /* step over one character */
     764             :     }
     765             : 
     766             :     /*
     767             :      * Handle string end.
     768             :      */
     769         238 :     if (beginword - str >= lenstr)  /* ran off the end without finding a word character */
     770         110 :         return NULL;
     771             : 
     772             :     /*
     773             :      * Add left padding spaces if preceding character wasn't wildcard
     774             :      * meta-character.
     775             :      */
     776         128 :     *charlen = 0;
     777         128 :     if (!in_leading_wildcard_meta)
     778             :     {
     779             :         if (LPADDING > 0)
     780             :         {
     781          30 :             *s++ = ' ';
     782          30 :             (*charlen)++;
     783             :             if (LPADDING > 1)
     784             :             {
     785          30 :                 *s++ = ' ';
     786          30 :                 (*charlen)++;
     787             :             }
     788             :         }
     789             :     }
     790             : 
     791             :     /*
     792             :      * Copy data into buf until wildcard meta-character, non-word character or
     793             :      * string boundary.  Strip escapes during copy.
     794             :      */
     795         128 :     endword = beginword;
     796         488 :     while (endword - str < lenstr)
     797             :     {
     798         488 :         clen = pg_mblen(endword);   /* used below as the byte count to copy and to advance */
     799         488 :         if (in_escape)
     800             :         {
     801           6 :             if (ISWORDCHR(endword))
     802             :             {
     803           6 :                 memcpy(s, endword, clen);
     804           6 :                 (*charlen)++;
     805           6 :                 s += clen;
     806             :             }
     807             :             else
     808             :             {
     809             :                 /*
     810             :                  * Back up endword to the escape character when stopping at an
     811             :                  * escaped char, so that subsequent get_wildcard_part will
     812             :                  * restart from the escape character.  We assume here that
     813             :                  * escape chars are single-byte.
     814             :                  */
     815           0 :                 endword--;
     816           0 :                 break;
     817             :             }
     818           6 :             in_escape = false;
     819             :         }
     820             :         else
     821             :         {
     822         482 :             if (ISESCAPECHAR(endword))
     823           0 :                 in_escape = true;   /* the escape character itself is not copied */
     824         482 :             else if (ISWILDCARDCHAR(endword))
     825             :             {
     826         110 :                 in_trailing_wildcard_meta = true;
     827         110 :                 break;
     828             :             }
     829         372 :             else if (ISWORDCHR(endword))
     830             :             {
     831         354 :                 memcpy(s, endword, clen);
     832         354 :                 (*charlen)++;
     833         354 :                 s += clen;
     834             :             }
     835             :             else
     836          18 :                 break;      /* unescaped non-word character ends the word */
     837             :         }
     838         360 :         endword += clen;
     839             :     }
     840             : 
     841             :     /*
     842             :      * Add right padding spaces if next character isn't wildcard
     843             :      * meta-character.
     844             :      */
     845         128 :     if (!in_trailing_wildcard_meta)
     846             :     {
     847             :         if (RPADDING > 0)
     848             :         {
     849          18 :             *s++ = ' ';
     850          18 :             (*charlen)++;
     851             :             if (RPADDING > 1)
     852             :             {
     853             :                 *s++ = ' ';
     854             :                 (*charlen)++;
     855             :             }
     856             :         }
     857             :     }
     858             : 
     859         128 :     *bytelen = s - buf;     /* bytes written to buf, padding included */
     860         128 :     return endword;
     861             : }
     862             : 
     863             : /*
     864             :  * Generates trigrams for wildcard search string.
     865             :  *
     866             :  * Returns array of trigrams that must occur in any string that matches the
     867             :  * wildcard string.  For example, given pattern "a%bcd%" the trigrams
     868             :  * " a", "bcd" would be extracted.
     869             :  */
     870             : TRGM *
     871         110 : generate_wildcard_trgm(const char *str, int slen)
     872             : {
     873             :     TRGM       *trg;
     874             :     char       *buf,
     875             :                *buf2;
     876             :     trgm       *tptr;       /* write cursor into trg's trigram array */
     877             :     int         len,
     878             :                 charlen,
     879             :                 bytelen;
     880             :     const char *eword;
     881             : 
     882         110 :     protect_out_of_mem(slen);
     883             : 
     884         110 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);     /* header plus (slen / 2 + 1) * 3 trigram slots */
     885         110 :     trg->flag = ARRKEY;
     886         110 :     SET_VARSIZE(trg, TRGMHDRSIZE);  /* header-only size for now; enlarged at the end if trigrams were found */
     887             : 
     888         110 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
     889           0 :         return trg;     /* returned with the header-only size set above */
     890             : 
     891         110 :     tptr = GETARR(trg);
     892             : 
     893             :     /* Allocate a buffer for blank-padded, but not yet case-folded, words */
     894         110 :     buf = palloc(sizeof(char) * (slen + 4));
     895             : 
     896             :     /*
     897             :      * Extract trigrams from each substring extracted by get_wildcard_part.
     898             :      */
     899         110 :     eword = str;
     900         238 :     while ((eword = get_wildcard_part(eword, slen - (eword - str),     /* resume where the previous word ended; pass remaining length */
     901             :                                       buf, &bytelen, &charlen)) != NULL)
     902             :     {
     903             : #ifdef IGNORECASE
     904         128 :         buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
     905         128 :         bytelen = strlen(buf2);     /* byte length is re-measured on the lowercased copy */
     906             : #else
     907             :         buf2 = buf;
     908             : #endif
     909             : 
     910             :         /*
     911             :          * count trigrams
     912             :          */
     913         128 :         tptr = make_trigrams(tptr, buf2, bytelen, charlen);     /* result becomes the new write cursor */
     914             : 
     915             : #ifdef IGNORECASE
     916         128 :         pfree(buf2);
     917             : #endif
     918             :     }
     919             : 
     920         110 :     pfree(buf);
     921             : 
     922         110 :     if ((len = tptr - GETARR(trg)) == 0)    /* len = number of trigrams written */
     923          48 :         return trg;
     924             : 
     925             :     /*
     926             :      * Make trigrams unique.
     927             :      */
     928          62 :     if (len > 1)
     929             :     {
     930          34 :         qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
     931          34 :         len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);   /* len becomes the count after de-duplication */
     932             :     }
     933             : 
     934          62 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));  /* final size for len trigrams */
     935             : 
     936          62 :     return trg;
     937             : }
     938             : 
     939             : uint32              /* the trigram's three bytes packed into the low 24 bits */
     940       69546 : trgm2int(trgm *ptr)
     941             : {
     942       69546 :     uint32      val = 0;
     943             : 
     944       69546 :     val |= *(((unsigned char *) ptr));      /* first byte ends up in bits 16..23 */
     945       69546 :     val <<= 8;
     946       69546 :     val |= *(((unsigned char *) ptr) + 1);  /* second byte ends up in bits 8..15 */
     947       69546 :     val <<= 8;
     948       69546 :     val |= *(((unsigned char *) ptr) + 2);  /* third byte stays in bits 0..7 */
     949             : 
     950       69546 :     return val;
     951             : }
     952             : 
     953             : Datum               /* SQL-callable: returns a text[] with one element per trigram of the input text */
     954          14 : show_trgm(PG_FUNCTION_ARGS)
     955             : {
     956          14 :     text       *in = PG_GETARG_TEXT_PP(0);
     957             :     TRGM       *trg;
     958             :     Datum      *d;
     959             :     ArrayType  *a;
     960             :     trgm       *ptr;
     961             :     int         i;
     962             : 
     963          14 :     trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
     964          14 :     d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));     /* one slot per trigram, plus one spare */
     965             : 
     966          88 :     for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
     967             :     {
     968          74 :         text       *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));     /* large enough for either output form below */
     969             : 
     970          74 :         if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
     971             :         {
     972           0 :             snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));   /* multibyte encoding and ISPRINTABLETRGM fails: emit hex form */
     973           0 :             SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
     974             :         }
     975             :         else
     976             :         {
     977          74 :             SET_VARSIZE(item, VARHDRSZ + 3);    /* payload is exactly 3 bytes */
     978          74 :             CPTRGM(VARDATA(item), ptr);
     979             :         }
     980          74 :         d[i] = PointerGetDatum(item);
     981             :     }
     982             : 
     983          14 :     a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
     984             : 
     985          88 :     for (i = 0; i < ARRNELEM(trg); i++)
     986          74 :         pfree(DatumGetPointer(d[i]));   /* NOTE(review): assumes construct_array_builtin copied the items -- confirm */
     987             : 
     988          14 :     pfree(d);
     989          14 :     pfree(trg);
     990          14 :     PG_FREE_IF_COPY(in, 0);
     991             : 
     992          14 :     PG_RETURN_POINTER(a);
     993             : }
     994             : 
     995             : float4              /* CALCSML() over the number of trigrams common to trg1 and trg2 and the array lengths */
     996      136904 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
     997             : {
     998             :     trgm       *ptr1,
     999             :                *ptr2;
    1000      136904 :     int         count = 0;  /* trigrams that compared equal in both arrays */
    1001             :     int         len1,
    1002             :                 len2;
    1003             : 
    1004      136904 :     ptr1 = GETARR(trg1);
    1005      136904 :     ptr2 = GETARR(trg2);
    1006             : 
    1007      136904 :     len1 = ARRNELEM(trg1);
    1008      136904 :     len2 = ARRNELEM(trg2);
    1009             : 
    1010             :     /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
    1011      136904 :     if (len1 <= 0 || len2 <= 0)
    1012           2 :         return (float4) 0.0;
    1013             : 
    1014     1741320 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)   /* lock-step walk; NOTE(review): appears to assume both arrays are sorted in CMPTRGM order */
    1015             :     {
    1016     1604418 :         int         res = CMPTRGM(ptr1, ptr2);
    1017             : 
    1018     1604418 :         if (res < 0)
    1019      360056 :             ptr1++;         /* advance whichever side compares smaller */
    1020     1244362 :         else if (res > 0)
    1021      421496 :             ptr2++;
    1022             :         else
    1023             :         {
    1024      822866 :             ptr1++;
    1025      822866 :             ptr2++;
    1026      822866 :             count++;
    1027             :         }
    1028             :     }
    1029             : 
    1030             :     /*
    1031             :      * If inexact then len2 is equal to count, because we don't know actual
    1032             :      * length of second string in inexact search and we can assume that count
    1033             :      * is a lower bound of len2.
    1034             :      */
    1035      136902 :     return CALCSML(count, len1, inexact ? count : len2);
    1036             : }
    1037             : 
    1038             : 
    1039             : /*
    1040             :  * Returns whether trg2 contains all trigrams in trg1.
    1041             :  * This relies on the trigram arrays being sorted.
    1042             :  */
    1043             : bool
    1044         380 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
    1045             : {
    1046             :     trgm       *ptr1,
    1047             :                *ptr2;
    1048             :     int         len1,
    1049             :                 len2;
    1050             : 
    1051         380 :     ptr1 = GETARR(trg1);
    1052         380 :     ptr2 = GETARR(trg2);
    1053             : 
    1054         380 :     len1 = ARRNELEM(trg1);
    1055         380 :     len2 = ARRNELEM(trg2);
    1056             : 
    1057        1244 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
    1058             :     {
    1059        1198 :         int         res = CMPTRGM(ptr1, ptr2);
    1060             : 
    1061        1198 :         if (res < 0)
    1062         334 :             return false;   /* trg2 has already moved past *ptr1, so it is missing */
    1063         864 :         else if (res > 0)
    1064         640 :             ptr2++;         /* skip a trg2 entry smaller than *ptr1 */
    1065             :         else
    1066             :         {
    1067         224 :             ptr1++;
    1068         224 :             ptr2++;
    1069             :         }
    1070             :     }
    1071          46 :     if (ptr1 - GETARR(trg1) < len1)     /* trg2 ran out while trg1 entries remain unmatched */
    1072           8 :         return false;
    1073             :     else
    1074          38 :         return true;
    1075             : }
    1076             : 
    1077             : /*
    1078             :  * Return a palloc'd boolean array showing, for each trigram in "query",
    1079             :  * whether it is present in the trigram array "key".
    1080             :  * This relies on the "key" array being sorted, but "query" need not be.
    1081             :  */
    1082             : bool *
    1083        4300 : trgm_presence_map(TRGM *query, TRGM *key)
    1084             : {
    1085             :     bool       *result;
    1086        4300 :     trgm       *ptrq = GETARR(query),
    1087        4300 :                *ptrk = GETARR(key);
    1088        4300 :     int         lenq = ARRNELEM(query),
    1089        4300 :                 lenk = ARRNELEM(key),
    1090             :                 i;
    1091             : 
    1092        4300 :     result = (bool *) palloc0(lenq * sizeof(bool));     /* zero-filled: entries stay false unless a match is found */
    1093             : 
    1094             :     /* for each query trigram, do a binary search in the key array */
    1095     1015120 :     for (i = 0; i < lenq; i++)
    1096             :     {
    1097     1010820 :         int         lo = 0;
    1098     1010820 :         int         hi = lenk;      /* search range is the half-open interval [lo, hi) */
    1099             : 
    1100     4747306 :         while (lo < hi)
    1101             :         {
    1102     3752564 :             int         mid = (lo + hi) / 2;
    1103     3752564 :             int         res = CMPTRGM(ptrq, ptrk + mid);
    1104             : 
    1105     3752564 :             if (res < 0)
    1106     1568164 :                 hi = mid;
    1107     2184400 :             else if (res > 0)
    1108     2168322 :                 lo = mid + 1;
    1109             :             else
    1110             :             {
    1111       16078 :                 result[i] = true;
    1112       16078 :                 break;
    1113             :             }
    1114             :         }
    1115     1010820 :         ptrq++;     /* ptrq advances in step with i */
    1116             :     }
    1117             : 
    1118        4300 :     return result;
    1119             : }
    1120             : 
    1121             : Datum               /* SQL-callable: float4 similarity of two text arguments */
    1122       62904 : similarity(PG_FUNCTION_ARGS)
    1123             : {
    1124       62904 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1125       62904 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1126             :     TRGM       *trg1,
    1127             :                *trg2;
    1128             :     float4      res;
    1129             : 
    1130       62904 :     trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
    1131       62904 :     trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
    1132             : 
    1133       62904 :     res = cnt_sml(trg1, trg2, false);   /* inexact = false: use trg2's real length */
    1134             : 
    1135       62904 :     pfree(trg1);
    1136       62904 :     pfree(trg2);
    1137       62904 :     PG_FREE_IF_COPY(in1, 0);
    1138       62904 :     PG_FREE_IF_COPY(in2, 1);
    1139             : 
    1140       62904 :     PG_RETURN_FLOAT4(res);
    1141             : }
    1142             : 
    1143             : Datum               /* SQL-callable: float4 result of calc_word_similarity(arg0, arg1) with no flags */
    1144        1804 : word_similarity(PG_FUNCTION_ARGS)
    1145             : {
    1146        1804 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1147        1804 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1148             :     float4      res;
    1149             : 
    1150        3608 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1151        3608 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1152             :                                0);  /* no flags: neither strict nor check-only */
    1153             : 
    1154        1804 :     PG_FREE_IF_COPY(in1, 0);
    1155        1804 :     PG_FREE_IF_COPY(in2, 1);
    1156        1804 :     PG_RETURN_FLOAT4(res);
    1157             : }
    1158             : 
    1159             : Datum               /* SQL-callable: float4 result of calc_word_similarity(arg0, arg1) in strict mode */
    1160        1764 : strict_word_similarity(PG_FUNCTION_ARGS)
    1161             : {
    1162        1764 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1163        1764 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1164             :     float4      res;
    1165             : 
    1166        3528 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1167        3528 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1168             :                                WORD_SIMILARITY_STRICT);     /* strict flag makes the callee allocate and use word bounds */
    1169             : 
    1170        1764 :     PG_FREE_IF_COPY(in1, 0);
    1171        1764 :     PG_FREE_IF_COPY(in2, 1);
    1172        1764 :     PG_RETURN_FLOAT4(res);
    1173             : }
    1174             : 
    1175             : Datum               /* SQL-callable: distance form, 1.0 minus similarity(arg0, arg1) */
    1176        2008 : similarity_dist(PG_FUNCTION_ARGS)
    1177             : {
    1178        2008 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,    /* reuse similarity() on the untouched argument datums */
    1179             :                                                          PG_GETARG_DATUM(0),
    1180             :                                                          PG_GETARG_DATUM(1)));
    1181             : 
    1182        2008 :     PG_RETURN_FLOAT4(1.0 - res);
    1183             : }
    1184             : 
    1185             : Datum               /* SQL-callable: true when similarity(arg0, arg1) reaches similarity_threshold */
    1186       12000 : similarity_op(PG_FUNCTION_ARGS)
    1187             : {
    1188       12000 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,    /* reuse similarity() on the untouched argument datums */
    1189             :                                                          PG_GETARG_DATUM(0),
    1190             :                                                          PG_GETARG_DATUM(1)));
    1191             : 
    1192       12000 :     PG_RETURN_BOOL(res >= similarity_threshold);    /* threshold is inclusive */
    1193             : }
    1194             : 
    1195             : Datum               /* SQL-callable: true when word similarity of (arg0, arg1) reaches word_similarity_threshold */
    1196        3848 : word_similarity_op(PG_FUNCTION_ARGS)
    1197             : {
    1198        3848 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1199        3848 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1200             :     float4      res;
    1201             : 
    1202        7696 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1203        7696 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1204             :                                WORD_SIMILARITY_CHECK_ONLY);     /* only a threshold test is needed, so the search may stop early */
    1205             : 
    1206        3848 :     PG_FREE_IF_COPY(in1, 0);
    1207        3848 :     PG_FREE_IF_COPY(in2, 1);
    1208        3848 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
    1209             : }
    1210             : 
    1211             : Datum               /* SQL-callable: same test as word_similarity_op but with the two arguments swapped */
    1212        3848 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
    1213             : {
    1214        3848 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1215        3848 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1216             :     float4      res;
    1217             : 
    1218        7696 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),   /* in2 is passed as str1 (the search pattern) */
    1219        7696 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1220             :                                WORD_SIMILARITY_CHECK_ONLY);     /* only a threshold test is needed, so the search may stop early */
    1221             : 
    1222        3848 :     PG_FREE_IF_COPY(in1, 0);
    1223        3848 :     PG_FREE_IF_COPY(in2, 1);
    1224        3848 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
    1225             : }
    1226             : 
    1227             : Datum               /* SQL-callable: distance form, 1.0 minus word similarity of (arg0, arg1) */
    1228           0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
    1229             : {
    1230           0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1231           0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1232             :     float4      res;
    1233             : 
    1234           0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1235           0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1236             :                                0);  /* no flags: the actual similarity value is needed, not just a threshold test */
    1237             : 
    1238           0 :     PG_FREE_IF_COPY(in1, 0);
    1239           0 :     PG_FREE_IF_COPY(in2, 1);
    1240           0 :     PG_RETURN_FLOAT4(1.0 - res);
    1241             : }
    1242             : 
    1243             : Datum               /* SQL-callable: same distance as word_similarity_dist_op but with the two arguments swapped */
    1244        1428 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
    1245             : {
    1246        1428 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1247        1428 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1248             :     float4      res;
    1249             : 
    1250        2856 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),   /* in2 is passed as str1 (the search pattern) */
    1251        2856 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1252             :                                0);  /* no flags: the actual similarity value is needed, not just a threshold test */
    1253             : 
    1254        1428 :     PG_FREE_IF_COPY(in1, 0);
    1255        1428 :     PG_FREE_IF_COPY(in2, 1);
    1256        1428 :     PG_RETURN_FLOAT4(1.0 - res);
    1257             : }
    1258             : 
    1259             : Datum               /* SQL-callable: true when strict word similarity of (arg0, arg1) reaches strict_word_similarity_threshold */
    1260        5060 : strict_word_similarity_op(PG_FUNCTION_ARGS)
    1261             : {
    1262        5060 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1263        5060 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1264             :     float4      res;
    1265             : 
    1266       10120 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1267       10120 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1268             :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);    /* strict mode combined with the threshold-only shortcut */
    1269             : 
    1270        5060 :     PG_FREE_IF_COPY(in1, 0);
    1271        5060 :     PG_FREE_IF_COPY(in2, 1);
    1272        5060 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
    1273             : }
    1274             : 
    /*
     * strict_word_similarity_commutator_op
     *
     * Boolean test with the operands in commuted order: the second SQL
     * argument is passed to calc_word_similarity() first and the first SQL
     * argument second, with WORD_SIMILARITY_CHECK_ONLY and
     * WORD_SIMILARITY_STRICT set.  Returns true when the result is at least
     * strict_word_similarity_threshold (a GUC variable).
     *
     * NOTE(review): with WORD_SIMILARITY_CHECK_ONLY the value returned by
     * calc_word_similarity() is presumably only meaningful for this threshold
     * comparison -- confirm against calc_word_similarity().
     */
    1275             : Datum
    1276        5060 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
    1277             : {
    1278        5060 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1279        5060 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1280             :     float4      res;
    1281             : 
                           /* note the swapped order: in2 first, then in1 */
    1282       10120 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1283       10120 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1284             :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
    1285             : 
                           /* release any copies made while fetching the arguments */
    1286        5060 :     PG_FREE_IF_COPY(in1, 0);
    1287        5060 :     PG_FREE_IF_COPY(in2, 1);
    1288        5060 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
    1289             : }
    1290             : 
    /*
     * strict_word_similarity_dist_op
     *
     * Distance form of strict word similarity: calls calc_word_similarity()
     * on the two text arguments in their given order with only
     * WORD_SIMILARITY_STRICT set (no WORD_SIMILARITY_CHECK_ONLY), and returns
     * 1.0 minus that value as a float4.
     */
    1291             : Datum
    1292           0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
    1293             : {
    1294           0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1295           0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1296             :     float4      res;
    1297             : 
    1298           0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1299           0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1300             :                                WORD_SIMILARITY_STRICT);
    1301             : 
                           /* release any copies made while fetching the arguments */
    1302           0 :     PG_FREE_IF_COPY(in1, 0);
    1303           0 :     PG_FREE_IF_COPY(in2, 1);
    1304           0 :     PG_RETURN_FLOAT4(1.0 - res);
    1305             : }
    1306             : 
    /*
     * strict_word_similarity_dist_commutator_op
     *
     * Distance form of strict word similarity with the operands in commuted
     * order: the second SQL argument is passed to calc_word_similarity()
     * first and the first SQL argument second, with only
     * WORD_SIMILARITY_STRICT set.  Returns 1.0 minus that value as a float4.
     */
    1307             : Datum
    1308        1440 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
    1309             : {
    1310        1440 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1311        1440 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1312             :     float4      res;
    1313             : 
                           /* note the swapped order: in2 first, then in1 */
    1314        2880 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1315        2880 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1316             :                                WORD_SIMILARITY_STRICT);
    1317             : 
                           /* release any copies made while fetching the arguments */
    1318        1440 :     PG_FREE_IF_COPY(in1, 0);
    1319        1440 :     PG_FREE_IF_COPY(in2, 1);
    1320        1440 :     PG_RETURN_FLOAT4(1.0 - res);
    1321             : }

Generated by: LCOV version 1.14