Line data Source code
1 : /*
2 : * contrib/pg_trgm/trgm_op.c
3 : */
4 : #include "postgres.h"
5 :
6 : #include <ctype.h>
7 :
8 : #include "catalog/pg_collation_d.h"
9 : #include "catalog/pg_type.h"
10 : #include "common/int.h"
11 : #include "lib/qunique.h"
12 : #include "miscadmin.h"
13 : #include "trgm.h"
14 : #include "tsearch/ts_locale.h"
15 : #include "utils/formatting.h"
16 : #include "utils/guc.h"
17 : #include "utils/lsyscache.h"
18 : #include "utils/memutils.h"
19 : #include "utils/pg_crc.h"
20 :
21 6 : PG_MODULE_MAGIC;
22 :
23 : /* GUC variables: thresholds registered in _PG_init() under the pg_trgm.* names (valid range 0.0 .. 1.0) */
24 : double similarity_threshold = 0.3f; /* used by the % operator */
25 : double word_similarity_threshold = 0.6f; /* used by the <% operator */
26 : double strict_word_similarity_threshold = 0.5f; /* used by the <<% operator */
27 :
28 4 : PG_FUNCTION_INFO_V1(set_limit); /* one PG_FUNCTION_INFO_V1 declaration per function this module exposes with the PG_FUNCTION_ARGS signature */
29 4 : PG_FUNCTION_INFO_V1(show_limit);
30 4 : PG_FUNCTION_INFO_V1(show_trgm);
31 4 : PG_FUNCTION_INFO_V1(similarity);
32 4 : PG_FUNCTION_INFO_V1(word_similarity);
33 4 : PG_FUNCTION_INFO_V1(strict_word_similarity);
34 4 : PG_FUNCTION_INFO_V1(similarity_dist);
35 4 : PG_FUNCTION_INFO_V1(similarity_op);
36 4 : PG_FUNCTION_INFO_V1(word_similarity_op);
37 4 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
38 2 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
39 4 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
40 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
41 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
42 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
43 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
44 :
45 : static int CMPTRGM_CHOOSE(const void *a, const void *b); /* first-call dispatcher, defined below */
46 : int (*CMPTRGM) (const void *a, const void *b) = CMPTRGM_CHOOSE; /* trigram comparator; starts at CMPTRGM_CHOOSE, which replaces this pointer with the signed or unsigned variant on its first call */
47 :
48 : /* Trigram with position, as filled in by make_positional_trgm() */
49 : typedef struct
50 : {
51 : trgm trg; /* the trigram itself */
52 : int index; /* position within the text's trigram array, or -1 for a search-pattern trigram */
53 : } pos_trgm;
54 :
55 : /* Trigram bound type: per-trigram bitmask built from the TRGM_BOUND_* flags below */
56 : typedef uint8 TrgmBound;
57 : #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
58 : #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
59 :
60 : /* Word similarity flags: bits of the "flags" argument of calc_word_similarity() and iterate_word_similarity() */
61 : #define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar
62 : * search pattern in text */
63 : #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
64 : * word bounds */
65 :
66 : /*
67 : * Module load callback
68 : */
69 : void
70 6 : _PG_init(void)
71 : {
72 : /* Define custom GUC variables. */
73 6 : DefineCustomRealVariable("pg_trgm.similarity_threshold",
74 : "Sets the threshold used by the % operator.",
75 : "Valid range is 0.0 .. 1.0.",
76 : &similarity_threshold,
77 : 0.3f,
78 : 0.0,
79 : 1.0,
80 : PGC_USERSET,
81 : 0,
82 : NULL,
83 : NULL,
84 : NULL);
85 6 : DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
86 : "Sets the threshold used by the <% operator.",
87 : "Valid range is 0.0 .. 1.0.",
88 : &word_similarity_threshold,
89 : 0.6f,
90 : 0.0,
91 : 1.0,
92 : PGC_USERSET,
93 : 0,
94 : NULL,
95 : NULL,
96 : NULL);
97 6 : DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
98 : "Sets the threshold used by the <<% operator.",
99 : "Valid range is 0.0 .. 1.0.",
100 : &strict_word_similarity_threshold,
101 : 0.5f,
102 : 0.0,
103 : 1.0,
104 : PGC_USERSET,
105 : 0,
106 : NULL,
107 : NULL,
108 : NULL);
109 :
110 6 : MarkGUCPrefixReserved("pg_trgm");
111 6 : }
112 :
#define CMPCHAR(a,b) ( ((a)==(b)) ? 0 : ( ((a)<(b)) ? -1 : 1 ) )

/*
 * Trigram comparators.  A trigram is compared byte by byte over its three
 * bytes; the two variants differ only in whether each byte is interpreted
 * as "signed char" or as "unsigned char".
 *
 * Returns -1, 0 or 1, decided by the first byte position that differs.
 */
static inline int
CMPTRGM_SIGNED(const void *a, const void *b)
{
#define CMPPCHAR_S(a,b,i)  CMPCHAR( *(((const signed char*)(a))+i), *(((const signed char*)(b))+i) )

	int			pos;

	/* The first two bytes decide the result as soon as one of them differs */
	for (pos = 0; pos < 2; pos++)
	{
		int			order = CMPPCHAR_S(a, b, pos);

		if (order != 0)
			return order;
	}

	/* Otherwise the last byte decides */
	return CMPPCHAR_S(a, b, 2);
}
128 :
/*
 * Compare two trigrams byte by byte, treating each byte as "unsigned char".
 * Returns -1, 0 or 1, decided by the first byte position that differs.
 */
static inline int
CMPTRGM_UNSIGNED(const void *a, const void *b)
{
#define CMPPCHAR_UNS(a,b,i)  CMPCHAR( *(((const unsigned char*)(a))+i), *(((const unsigned char*)(b))+i) )

	int			pos;

	/* The first two bytes decide the result as soon as one of them differs */
	for (pos = 0; pos < 2; pos++)
	{
		int			order = CMPPCHAR_UNS(a, b, pos);

		if (order != 0)
			return order;
	}

	/* Otherwise the last byte decides */
	return CMPPCHAR_UNS(a, b, 2);
}
138 :
139 : /*
140 : * This gets called on the first call. It replaces the function pointer so
141 : * that subsequent calls are routed directly to the chosen implementation.
142 : */
143 : static int
144 6 : CMPTRGM_CHOOSE(const void *a, const void *b)
145 : {
146 6 : if (GetDefaultCharSignedness())
147 6 : CMPTRGM = CMPTRGM_SIGNED;
148 : else
149 0 : CMPTRGM = CMPTRGM_UNSIGNED;
150 :
151 6 : return CMPTRGM(a, b);
152 : }
153 :
154 : /*
155 : * Deprecated function.
156 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
157 : */
158 : Datum
159 4 : set_limit(PG_FUNCTION_ARGS)
160 : {
161 4 : float4 nlimit = PG_GETARG_FLOAT4(0);
162 : char *nlimit_str;
163 : Oid func_out_oid;
164 : bool is_varlena;
165 :
166 4 : getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
167 :
168 4 : nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
169 :
170 4 : SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
171 : PGC_USERSET, PGC_S_SESSION);
172 :
173 4 : PG_RETURN_FLOAT4(similarity_threshold);
174 : }
175 :
176 :
177 : /*
178 : * Get similarity threshold for given index scan strategy number.
179 : */
180 : double
181 86806 : index_strategy_get_limit(StrategyNumber strategy)
182 : {
183 86806 : switch (strategy)
184 : {
185 65198 : case SimilarityStrategyNumber:
186 65198 : return similarity_threshold;
187 9644 : case WordSimilarityStrategyNumber:
188 9644 : return word_similarity_threshold;
189 11964 : case StrictWordSimilarityStrategyNumber:
190 11964 : return strict_word_similarity_threshold;
191 0 : default:
192 0 : elog(ERROR, "unrecognized strategy number: %d", strategy);
193 : break;
194 : }
195 :
196 : return 0.0; /* keep compiler quiet */
197 : }
198 :
199 : /*
200 : * Deprecated function.
201 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
202 : */
203 : Datum
204 40000 : show_limit(PG_FUNCTION_ARGS)
205 : {
206 40000 : PG_RETURN_FLOAT4(similarity_threshold);
207 : }
208 :
209 : static int
210 6374440 : comp_trgm(const void *a, const void *b)
211 : {
212 6374440 : return CMPTRGM(a, b);
213 : }
214 :
215 : /*
216 : * Finds first word in string, returns pointer to the word,
217 : * endword points to the character after word
218 : */
219 : static char *
220 482822 : find_word(char *str, int lenstr, char **endword, int *charlen)
221 : {
222 482822 : char *beginword = str;
223 :
224 510150 : while (beginword - str < lenstr && !ISWORDCHR(beginword))
225 27328 : beginword += pg_mblen(beginword);
226 :
227 482822 : if (beginword - str >= lenstr)
228 228160 : return NULL;
229 :
230 254662 : *endword = beginword;
231 254662 : *charlen = 0;
232 2199348 : while (*endword - str < lenstr && ISWORDCHR(*endword))
233 : {
234 1944686 : *endword += pg_mblen(*endword);
235 1944686 : (*charlen)++;
236 : }
237 :
238 254662 : return beginword;
239 : }
240 :
241 : /*
242 : * Reduce a trigram (three possibly multi-byte characters) to a trgm,
243 : * which is always exactly three bytes. If we have three single-byte
244 : * characters, we just use them as-is; otherwise we form a hash value.
245 : */
246 : void
247 2918 : compact_trigram(trgm *tptr, char *str, int bytelen)
248 : {
249 2918 : if (bytelen == 3)
250 : {
251 2918 : CPTRGM(tptr, str);
252 : }
253 : else
254 : {
255 : pg_crc32 crc;
256 :
257 0 : INIT_LEGACY_CRC32(crc);
258 0 : COMP_LEGACY_CRC32(crc, str, bytelen);
259 0 : FIN_LEGACY_CRC32(crc);
260 :
261 : /*
262 : * use only 3 upper bytes from crc, hope, it's good enough hashing
263 : */
264 0 : CPTRGM(tptr, &crc);
265 : }
266 2918 : }
267 :
268 : /*
269 : * Adds trigrams from words (already padded).
270 : */
271 : static trgm *
272 254790 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
273 : {
274 254790 : char *ptr = str;
275 :
276 254790 : if (charlen < 3)
277 54 : return tptr;
278 :
279 254736 : if (bytelen > charlen)
280 : {
281 : /* Find multibyte character boundaries and apply compact_trigram */
282 0 : int lenfirst = pg_mblen(str),
283 0 : lenmiddle = pg_mblen(str + lenfirst),
284 0 : lenlast = pg_mblen(str + lenfirst + lenmiddle);
285 :
286 0 : while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
287 : {
288 0 : compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
289 :
290 0 : ptr += lenfirst;
291 0 : tptr++;
292 :
293 0 : lenfirst = lenmiddle;
294 0 : lenmiddle = lenlast;
295 0 : lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
296 : }
297 : }
298 : else
299 : {
300 : /* Fast path when there are no multibyte characters */
301 : Assert(bytelen == charlen);
302 :
303 2454266 : while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
304 : {
305 2199530 : CPTRGM(tptr, ptr);
306 2199530 : ptr++;
307 2199530 : tptr++;
308 : }
309 : }
310 :
311 254736 : return tptr;
312 : }
313 :
314 : /*
315 : * Make array of trigrams without sorting and removing duplicate items.
316 : *
317 : * trg: where to return the array of trigrams.
318 : * str: source string, of length slen bytes.
319 : * bounds: where to return bounds of trigrams (if needed).
320 : *
321 : * Returns length of the generated array.
322 : */
323 : static int
324 228162 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
325 : {
326 : trgm *tptr;
327 : char *buf;
328 : int charlen,
329 : bytelen;
330 : char *bword,
331 : *eword;
332 :
333 228162 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
334 2 : return 0;
335 :
336 228160 : tptr = trg;
337 :
338 : /* Allocate a buffer for case-folded, blank-padded words */
339 228160 : buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
340 :
341 : if (LPADDING > 0)
342 : {
343 228160 : *buf = ' ';
344 : if (LPADDING > 1)
345 228160 : *(buf + 1) = ' ';
346 : }
347 :
348 228160 : eword = str;
349 482822 : while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
350 : {
351 : #ifdef IGNORECASE
352 254662 : bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
353 254662 : bytelen = strlen(bword);
354 : #else
355 : bytelen = eword - bword;
356 : #endif
357 :
358 254662 : memcpy(buf + LPADDING, bword, bytelen);
359 :
360 : #ifdef IGNORECASE
361 254662 : pfree(bword);
362 : #endif
363 :
364 254662 : buf[LPADDING + bytelen] = ' ';
365 254662 : buf[LPADDING + bytelen + 1] = ' ';
366 :
367 : /* Calculate trigrams marking their bounds if needed */
368 254662 : if (bounds)
369 24796 : bounds[tptr - trg] |= TRGM_BOUND_LEFT;
370 254662 : tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
371 : charlen + LPADDING + RPADDING);
372 254662 : if (bounds)
373 24796 : bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
374 : }
375 :
376 228160 : pfree(buf);
377 :
378 228160 : return tptr - trg;
379 : }
380 :
381 : /*
382 : * Guard against possible overflow in the palloc requests below. (We
383 : * don't worry about the additive constants, since palloc can detect
384 : * requests that are a little above MaxAllocSize --- we just need to
385 : * prevent integer overflow in the multiplications.)
386 : */
387 : static void
388 204020 : protect_out_of_mem(int slen)
389 : {
390 204020 : if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
391 204020 : (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
392 0 : ereport(ERROR,
393 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
394 : errmsg("out of memory")));
395 204020 : }
396 :
397 : /*
398 : * Make array of trigrams with sorting and removing duplicate items.
399 : *
400 : * str: source string, of length slen bytes.
401 : *
402 : * Returns the sorted array of unique trigrams.
403 : */
404 : TRGM *
405 179658 : generate_trgm(char *str, int slen)
406 : {
407 : TRGM *trg;
408 : int len;
409 :
410 179658 : protect_out_of_mem(slen);
411 :
412 179658 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
413 179658 : trg->flag = ARRKEY;
414 :
415 179658 : len = generate_trgm_only(GETARR(trg), str, slen, NULL);
416 179658 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
417 :
418 179658 : if (len == 0)
419 8 : return trg;
420 :
421 : /*
422 : * Make trigrams unique.
423 : */
424 179650 : if (len > 1)
425 : {
426 179650 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
427 179650 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
428 : }
429 :
430 179650 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
431 :
432 179650 : return trg;
433 : }
434 :
435 : /*
436 : * Make array of positional trigrams from two trigram arrays trg1 and trg2.
437 : *
438 : * trg1: trigram array of search pattern, of length len1. trg1 is required
439 : * word which positions don't matter and replaced with -1.
440 : * trg2: trigram array of text, of length len2. trg2 is haystack where we
441 : * search and have to store its positions.
442 : *
443 : * Returns concatenated trigram array.
444 : */
445 : static pos_trgm *
446 24252 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
447 : {
448 : pos_trgm *result;
449 : int i,
450 24252 : len = len1 + len2;
451 :
452 24252 : result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
453 :
454 241728 : for (i = 0; i < len1; i++)
455 : {
456 217476 : memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
457 217476 : result[i].index = -1;
458 : }
459 :
460 384424 : for (i = 0; i < len2; i++)
461 : {
462 360172 : memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
463 360172 : result[i + len1].index = i;
464 : }
465 :
466 24252 : return result;
467 : }
468 :
469 : /*
470 : * Compare position trigrams: compare trigrams first and position second.
471 : */
472 : static int
473 2615444 : comp_ptrgm(const void *v1, const void *v2)
474 : {
475 2615444 : const pos_trgm *p1 = (const pos_trgm *) v1;
476 2615444 : const pos_trgm *p2 = (const pos_trgm *) v2;
477 : int cmp;
478 :
479 2615444 : cmp = CMPTRGM(p1->trg, p2->trg);
480 2615444 : if (cmp != 0)
481 2536034 : return cmp;
482 :
483 79410 : return pg_cmp_s32(p1->index, p2->index);
484 : }
485 :
486 : /*
487 : * Iterative search function which calculates maximum similarity with word in
488 : * the string. If the flag WORD_SIMILARITY_CHECK_ONLY is set, the search stops
489 : * as soon as the threshold is reached instead of computing the maximum.
490 : *
491 : * trg2indexes: for each trigram of "trg2", its index into the array "found".
492 : * found: array which stores true or false values.
493 : * ulen1: count of unique trigrams of array "trg1".
494 : * len2: length of array "trg2" and array "trg2indexes".
495 : * len: length of the array "found".
496 : * flags: set of boolean flags parameterizing similarity calculation.
497 : * bounds: whether each trigram is left/right bound of word (required for WORD_SIMILARITY_STRICT).
498 : *
499 : * Returns the greatest similarity seen (0 if no extent was evaluated).
500 : */
501 : static float4
502 24252 : iterate_word_similarity(int *trg2indexes,
503 : bool *found,
504 : int ulen1,
505 : int len2,
506 : int len,
507 : uint8 flags,
508 : TrgmBound *bounds)
509 : {
510 : int *lastpos,
511 : i,
512 24252 : ulen2 = 0,
513 24252 : count = 0,
514 24252 : upper = -1,
515 : lower;
516 : float4 smlr_cur,
517 24252 : smlr_max = 0.0f;
518 : double threshold;
519 :
520 : Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
521 :
522 : /* Select appropriate threshold */
523 48504 : threshold = (flags & WORD_SIMILARITY_STRICT) ?
524 24252 : strict_word_similarity_threshold :
525 : word_similarity_threshold;
526 :
527 : /*
528 : * Consider first trigram as initial lower bound for strict word
529 : * similarity, or initialize it later with first trigram present for plain
530 : * word similarity.
531 : */
532 24252 : lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
533 :
534 : /* Memorise last position of each trigram (-1 means no position recorded) */
535 24252 : lastpos = (int *) palloc(sizeof(int) * len);
536 24252 : memset(lastpos, -1, sizeof(int) * len);
537 :
538 367284 : for (i = 0; i < len2; i++)
539 : {
540 : int trgindex;
541 :
542 346600 : CHECK_FOR_INTERRUPTS();
543 :
544 : /* Get index of next trigram */
545 346600 : trgindex = trg2indexes[i];
546 :
547 : /* Update last position of this trigram */
548 346600 : if (lower >= 0 || found[trgindex])
549 : {
550 271584 : if (lastpos[trgindex] < 0)
551 : {
552 267876 : ulen2++;
553 267876 : if (found[trgindex])
554 61512 : count++;
555 : }
556 271584 : lastpos[trgindex] = i;
557 : }
558 :
559 : /*
560 : * Adjust upper bound if trigram is upper bound of word for strict
561 : * word similarity, or if trigram is present in required substring for
562 : * plain word similarity
563 : */
564 500684 : if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
565 154084 : : found[trgindex])
566 : {
567 : int prev_lower,
568 : tmp_ulen2,
569 : tmp_lower,
570 : tmp_count;
571 :
572 51272 : upper = i;
573 51272 : if (lower == -1)
574 : {
575 9390 : lower = i;
576 9390 : ulen2 = 1;
577 : }
578 :
579 51272 : smlr_cur = CALCSML(count, ulen1, ulen2);
580 :
581 : /* Also try to adjust lower bound for greater similarity */
582 51272 : tmp_count = count;
583 51272 : tmp_ulen2 = ulen2;
584 51272 : prev_lower = lower;
585 417180 : for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
586 : {
587 : float smlr_tmp;
588 : int tmp_trgindex;
589 :
590 : /*
591 : * Adjust lower bound only if trigram is lower bound of word
592 : * for strict word similarity, or consider every trigram as
593 : * lower bound for plain word similarity.
594 : */
595 369476 : if (!(flags & WORD_SIMILARITY_STRICT)
596 290346 : || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
597 : {
598 119394 : smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
599 119394 : if (smlr_tmp > smlr_cur)
600 : {
601 7024 : smlr_cur = smlr_tmp;
602 7024 : ulen2 = tmp_ulen2;
603 7024 : lower = tmp_lower;
604 7024 : count = tmp_count;
605 : }
606 :
607 : /*
608 : * If we only check that word similarity is greater than
609 : * threshold we do not need to calculate a maximum
610 : * similarity.
611 : */
612 119394 : if ((flags & WORD_SIMILARITY_CHECK_ONLY)
613 74228 : && smlr_cur >= threshold)
614 3568 : break;
615 : }
616 :
617 365908 : tmp_trgindex = trg2indexes[tmp_lower];
618 365908 : if (lastpos[tmp_trgindex] == tmp_lower) /* this is the last recorded occurrence, so moving past it removes the trigram from the trial extent */
619 : {
620 361384 : tmp_ulen2--;
621 361384 : if (found[tmp_trgindex])
622 93158 : tmp_count--;
623 : }
624 : }
625 :
626 51272 : smlr_max = Max(smlr_max, smlr_cur);
627 :
628 : /*
629 : * If we only check that word similarity is greater than threshold
630 : * we do not need to calculate a maximum similarity.
631 : */
632 51272 : if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
633 3568 : break;
634 :
635 81216 : for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++) /* forget positions that fell below the new lower bound */
636 : {
637 : int tmp_trgindex;
638 :
639 33512 : tmp_trgindex = trg2indexes[tmp_lower];
640 33512 : if (lastpos[tmp_trgindex] == tmp_lower)
641 32014 : lastpos[tmp_trgindex] = -1;
642 : }
643 : }
644 : }
645 :
646 24252 : pfree(lastpos);
647 :
648 24252 : return smlr_max;
649 : }
650 :
651 : /*
652 : * Calculate word similarity.
653 : * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
654 : * are used to calculate word similarity using iterate_word_similarity().
655 : *
656 : * "trg2indexes" is array which stores indexes of the array "found".
657 : * In other words:
658 : * trg2indexes[j] = i;
659 : * found[i] = true (or false);
660 : * If found[i] == true then there is trigram trg2[j] in array "trg1".
661 : * If found[i] == false then there is not trigram trg2[j] in array "trg1".
662 : *
663 : * str1: search pattern string, of length slen1 bytes.
664 : * str2: text in which we are looking for a word, of length slen2 bytes.
665 : * flags: set of boolean flags parameterizing similarity calculation.
666 : *
667 : * Returns word similarity.
668 : */
669 : static float4
670 24252 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
671 : uint8 flags)
672 : {
673 : bool *found;
674 : pos_trgm *ptrg;
675 : trgm *trg1;
676 : trgm *trg2;
677 : int len1,
678 : len2,
679 : len,
680 : i,
681 : j,
682 : ulen1;
683 : int *trg2indexes;
684 : float4 result;
685 : TrgmBound *bounds;
686 :
687 24252 : protect_out_of_mem(slen1 + slen2);
688 :
689 : /* Make positional trigrams */
690 24252 : trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
691 24252 : trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
692 24252 : if (flags & WORD_SIMILARITY_STRICT)
693 13324 : bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
694 : else
695 10928 : bounds = NULL;
696 :
697 24252 : len1 = generate_trgm_only(trg1, str1, slen1, NULL);
698 24252 : len2 = generate_trgm_only(trg2, str2, slen2, bounds);
699 :
700 24252 : ptrg = make_positional_trgm(trg1, len1, trg2, len2);
701 24252 : len = len1 + len2;
702 24252 : qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
703 :
704 24252 : pfree(trg1);
705 24252 : pfree(trg2);
706 :
707 : /*
708 : * Merge positional trigrams array: enumerate each trigram and find its
709 : * presence in required word.
710 : */
711 24252 : trg2indexes = (int *) palloc(sizeof(int) * len2);
712 24252 : found = (bool *) palloc0(sizeof(bool) * len);
713 :
714 24252 : ulen1 = 0;
715 24252 : j = 0;
716 601900 : for (i = 0; i < len; i++)
717 : {
718 577648 : if (i > 0)
719 : {
720 553396 : int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
721 :
722 553396 : if (cmp != 0)
723 : {
724 484992 : if (found[j])
725 202274 : ulen1++;
726 484992 : j++;
727 : }
728 : }
729 :
730 577648 : if (ptrg[i].index >= 0)
731 : {
732 360172 : trg2indexes[ptrg[i].index] = j;
733 : }
734 : else
735 : {
736 217476 : found[j] = true;
737 : }
738 : }
739 24252 : if (found[j])
740 15202 : ulen1++;
741 :
742 : /* Run iterative procedure to find maximum similarity with word */
743 24252 : result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
744 : flags, bounds);
745 :
746 24252 : pfree(trg2indexes);
747 24252 : pfree(found);
748 24252 : pfree(ptrg);
749 :
750 24252 : return result;
751 : }
752 :
753 :
754 : /*
755 : * Extract the next non-wildcard part of a search string, i.e. a word bounded
756 : * by '_' or '%' meta-characters, non-word characters or string end.
757 : *
758 : * str: source string, of length lenstr bytes (need not be null-terminated)
759 : * buf: where to return the substring, with escape characters stripped (must be long enough)
760 : * *bytelen: receives byte length of the found substring, padding included
761 : * *charlen: receives character length of the found substring, padding included
762 : *
763 : * Returns pointer to end+1 of the found substring in the source string.
764 : * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
765 : *
766 : * If the found word is bounded by non-word characters or string boundaries
767 : * then this function will include corresponding padding spaces into buf.
768 : */
769 : static const char *
770 238 : get_wildcard_part(const char *str, int lenstr,
771 : char *buf, int *bytelen, int *charlen)
772 : {
773 238 : const char *beginword = str;
774 : const char *endword;
775 238 : char *s = buf;
776 238 : bool in_leading_wildcard_meta = false;
777 238 : bool in_trailing_wildcard_meta = false;
778 238 : bool in_escape = false;
779 : int clen;
780 :
781 : /*
782 : * Find the first word character, remembering whether preceding character
783 : * was wildcard meta-character. Note that the in_escape state persists
784 : * from this loop to the next one, since we may exit at a word character
785 : * that is in_escape.
786 : */
787 482 : while (beginword - str < lenstr)
788 : {
789 372 : if (in_escape)
790 : {
791 6 : if (ISWORDCHR(beginword))
792 6 : break;
793 0 : in_escape = false;
794 0 : in_leading_wildcard_meta = false;
795 : }
796 : else
797 : {
798 366 : if (ISESCAPECHAR(beginword))
799 6 : in_escape = true;
800 360 : else if (ISWILDCARDCHAR(beginword))
801 208 : in_leading_wildcard_meta = true;
802 152 : else if (ISWORDCHR(beginword))
803 122 : break;
804 : else
805 30 : in_leading_wildcard_meta = false;
806 : }
807 244 : beginword += pg_mblen(beginword);
808 : }
809 :
810 : /*
811 : * Handle string end: no word character was found.
812 : */
813 238 : if (beginword - str >= lenstr)
814 110 : return NULL;
815 :
816 : /*
817 : * Add left padding spaces if preceding character wasn't wildcard
818 : * meta-character.
819 : */
820 128 : *charlen = 0;
821 128 : if (!in_leading_wildcard_meta)
822 : {
823 : if (LPADDING > 0)
824 : {
825 30 : *s++ = ' ';
826 30 : (*charlen)++;
827 : if (LPADDING > 1)
828 : {
829 30 : *s++ = ' ';
830 30 : (*charlen)++;
831 : }
832 : }
833 : }
834 :
835 : /*
836 : * Copy data into buf until wildcard meta-character, non-word character or
837 : * string boundary. Strip escapes during copy.
838 : */
839 128 : endword = beginword;
840 488 : while (endword - str < lenstr)
841 : {
842 488 : clen = pg_mblen(endword);
843 488 : if (in_escape)
844 : {
845 6 : if (ISWORDCHR(endword))
846 : {
847 6 : memcpy(s, endword, clen);
848 6 : (*charlen)++;
849 6 : s += clen;
850 : }
851 : else
852 : {
853 : /*
854 : * Back up endword to the escape character when stopping at an
855 : * escaped char, so that subsequent get_wildcard_part will
856 : * restart from the escape character. We assume here that
857 : * escape chars are single-byte.
858 : */
859 0 : endword--;
860 0 : break;
861 : }
862 6 : in_escape = false;
863 : }
864 : else
865 : {
866 482 : if (ISESCAPECHAR(endword))
867 0 : in_escape = true;
868 482 : else if (ISWILDCARDCHAR(endword))
869 : {
870 110 : in_trailing_wildcard_meta = true;
871 110 : break;
872 : }
873 372 : else if (ISWORDCHR(endword))
874 : {
875 354 : memcpy(s, endword, clen);
876 354 : (*charlen)++;
877 354 : s += clen;
878 : }
879 : else
880 18 : break;
881 : }
882 360 : endword += clen;
883 : }
884 :
885 : /*
886 : * Add right padding spaces if next character isn't wildcard
887 : * meta-character.
888 : */
889 128 : if (!in_trailing_wildcard_meta)
890 : {
891 : if (RPADDING > 0)
892 : {
893 18 : *s++ = ' ';
894 18 : (*charlen)++;
895 : if (RPADDING > 1)
896 : {
897 : *s++ = ' ';
898 : (*charlen)++;
899 : }
900 : }
901 : }
902 :
903 128 : *bytelen = s - buf; /* bytes written to buf, padding included */
904 128 : return endword;
905 : }
906 :
907 : /*
908 : * Generates trigrams for wildcard search string.
909 : *
910 : * Returns array of trigrams that must occur in any string that matches the
911 : * wildcard string. For example, given pattern "a%bcd%" the trigrams
912 : * " a", "bcd" would be extracted.
913 : */
914 : TRGM *
915 110 : generate_wildcard_trgm(const char *str, int slen)
916 : {
917 : TRGM *trg;
918 : char *buf,
919 : *buf2;
920 : trgm *tptr;
921 : int len,
922 : charlen,
923 : bytelen;
924 : const char *eword;
925 :
926 110 : protect_out_of_mem(slen);
927 :
928 110 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
929 110 : trg->flag = ARRKEY;
930 110 : SET_VARSIZE(trg, TRGMHDRSIZE);
931 :
932 110 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
933 0 : return trg;
934 :
935 110 : tptr = GETARR(trg);
936 :
937 : /* Allocate a buffer for blank-padded, but not yet case-folded, words */
938 110 : buf = palloc(sizeof(char) * (slen + 4));
939 :
940 : /*
941 : * Extract trigrams from each substring extracted by get_wildcard_part.
942 : */
943 110 : eword = str;
944 238 : while ((eword = get_wildcard_part(eword, slen - (eword - str),
945 : buf, &bytelen, &charlen)) != NULL)
946 : {
947 : #ifdef IGNORECASE
948 128 : buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
949 128 : bytelen = strlen(buf2);
950 : #else
951 : buf2 = buf;
952 : #endif
953 :
954 : /*
955 : * count trigrams
956 : */
957 128 : tptr = make_trigrams(tptr, buf2, bytelen, charlen);
958 :
959 : #ifdef IGNORECASE
960 128 : pfree(buf2);
961 : #endif
962 : }
963 :
964 110 : pfree(buf);
965 :
966 110 : if ((len = tptr - GETARR(trg)) == 0)
967 48 : return trg;
968 :
969 : /*
970 : * Make trigrams unique.
971 : */
972 62 : if (len > 1)
973 : {
974 34 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
975 34 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
976 : }
977 :
978 62 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
979 :
980 62 : return trg;
981 : }
982 :
983 : uint32
984 69546 : trgm2int(trgm *ptr)
985 : {
986 69546 : uint32 val = 0;
987 :
988 69546 : val |= *(((unsigned char *) ptr));
989 69546 : val <<= 8;
990 69546 : val |= *(((unsigned char *) ptr) + 1);
991 69546 : val <<= 8;
992 69546 : val |= *(((unsigned char *) ptr) + 2);
993 :
994 69546 : return val;
995 : }
996 :
997 : Datum
998 14 : show_trgm(PG_FUNCTION_ARGS)
999 : {
1000 14 : text *in = PG_GETARG_TEXT_PP(0);
1001 : TRGM *trg;
1002 : Datum *d;
1003 : ArrayType *a;
1004 : trgm *ptr;
1005 : int i;
1006 :
1007 14 : trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
1008 14 : d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
1009 :
1010 88 : for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
1011 : {
1012 74 : text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
1013 :
1014 74 : if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
1015 : {
1016 0 : snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
1017 0 : SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
1018 : }
1019 : else
1020 : {
1021 74 : SET_VARSIZE(item, VARHDRSZ + 3);
1022 74 : CPTRGM(VARDATA(item), ptr);
1023 : }
1024 74 : d[i] = PointerGetDatum(item);
1025 : }
1026 :
1027 14 : a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
1028 :
1029 88 : for (i = 0; i < ARRNELEM(trg); i++)
1030 74 : pfree(DatumGetPointer(d[i]));
1031 :
1032 14 : pfree(d);
1033 14 : pfree(trg);
1034 14 : PG_FREE_IF_COPY(in, 0);
1035 :
1036 14 : PG_RETURN_POINTER(a);
1037 : }
1038 :
1039 : float4
1040 138214 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
1041 : {
1042 : trgm *ptr1,
1043 : *ptr2;
1044 138214 : int count = 0;
1045 : int len1,
1046 : len2;
1047 :
1048 138214 : ptr1 = GETARR(trg1);
1049 138214 : ptr2 = GETARR(trg2);
1050 :
1051 138214 : len1 = ARRNELEM(trg1);
1052 138214 : len2 = ARRNELEM(trg2);
1053 :
1054 : /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1055 138214 : if (len1 <= 0 || len2 <= 0)
1056 2 : return (float4) 0.0;
1057 :
1058 1762582 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1059 : {
1060 1624370 : int res = CMPTRGM(ptr1, ptr2);
1061 :
1062 1624370 : if (res < 0)
1063 369688 : ptr1++;
1064 1254682 : else if (res > 0)
1065 430438 : ptr2++;
1066 : else
1067 : {
1068 824244 : ptr1++;
1069 824244 : ptr2++;
1070 824244 : count++;
1071 : }
1072 : }
1073 :
1074 : /*
1075 : * If inexact then len2 is equal to count, because we don't know actual
1076 : * length of second string in inexact search and we can assume that count
1077 : * is a lower bound of len2.
1078 : */
1079 138212 : return CALCSML(count, len1, inexact ? count : len2);
1080 : }
1081 :
1082 :
1083 : /*
1084 : * Returns whether trg2 contains all trigrams in trg1.
1085 : * This relies on the trigram arrays being sorted.
1086 : */
1087 : bool
1088 380 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
1089 : {
1090 : trgm *ptr1,
1091 : *ptr2;
1092 : int len1,
1093 : len2;
1094 :
1095 380 : ptr1 = GETARR(trg1);
1096 380 : ptr2 = GETARR(trg2);
1097 :
1098 380 : len1 = ARRNELEM(trg1);
1099 380 : len2 = ARRNELEM(trg2);
1100 :
1101 1244 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1102 : {
1103 1198 : int res = CMPTRGM(ptr1, ptr2);
1104 :
1105 1198 : if (res < 0)
1106 334 : return false;
1107 864 : else if (res > 0)
1108 640 : ptr2++;
1109 : else
1110 : {
1111 224 : ptr1++;
1112 224 : ptr2++;
1113 : }
1114 : }
1115 46 : if (ptr1 - GETARR(trg1) < len1)
1116 8 : return false;
1117 : else
1118 38 : return true;
1119 : }
1120 :
1121 : /*
1122 : * Return a palloc'd boolean array showing, for each trigram in "query",
1123 : * whether it is present in the trigram array "key".
1124 : * This relies on the "key" array being sorted, but "query" need not be.
1125 : */
1126 : bool *
1127 4300 : trgm_presence_map(TRGM *query, TRGM *key)
1128 : {
1129 : bool *result;
1130 4300 : trgm *ptrq = GETARR(query),
1131 4300 : *ptrk = GETARR(key);
1132 4300 : int lenq = ARRNELEM(query),
1133 4300 : lenk = ARRNELEM(key),
1134 : i;
1135 :
1136 4300 : result = (bool *) palloc0(lenq * sizeof(bool));
1137 :
1138 : /* for each query trigram, do a binary search in the key array */
1139 1015120 : for (i = 0; i < lenq; i++)
1140 : {
1141 1010820 : int lo = 0;
1142 1010820 : int hi = lenk;
1143 :
1144 4747306 : while (lo < hi)
1145 : {
1146 3752564 : int mid = (lo + hi) / 2;
1147 3752564 : int res = CMPTRGM(ptrq, ptrk + mid);
1148 :
1149 3752564 : if (res < 0)
1150 1568164 : hi = mid;
1151 2184400 : else if (res > 0)
1152 2168322 : lo = mid + 1;
1153 : else
1154 : {
1155 16078 : result[i] = true;
1156 16078 : break;
1157 : }
1158 : }
1159 1010820 : ptrq++;
1160 : }
1161 :
1162 4300 : return result;
1163 : }
1164 :
1165 : Datum
1166 62904 : similarity(PG_FUNCTION_ARGS)
1167 : {
1168 62904 : text *in1 = PG_GETARG_TEXT_PP(0);
1169 62904 : text *in2 = PG_GETARG_TEXT_PP(1);
1170 : TRGM *trg1,
1171 : *trg2;
1172 : float4 res;
1173 :
1174 62904 : trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1175 62904 : trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1176 :
1177 62904 : res = cnt_sml(trg1, trg2, false);
1178 :
1179 62904 : pfree(trg1);
1180 62904 : pfree(trg2);
1181 62904 : PG_FREE_IF_COPY(in1, 0);
1182 62904 : PG_FREE_IF_COPY(in2, 1);
1183 :
1184 62904 : PG_RETURN_FLOAT4(res);
1185 : }
1186 :
1187 : Datum
1188 1804 : word_similarity(PG_FUNCTION_ARGS)
1189 : {
1190 1804 : text *in1 = PG_GETARG_TEXT_PP(0);
1191 1804 : text *in2 = PG_GETARG_TEXT_PP(1);
1192 : float4 res;
1193 :
1194 3608 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1195 3608 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1196 : 0);
1197 :
1198 1804 : PG_FREE_IF_COPY(in1, 0);
1199 1804 : PG_FREE_IF_COPY(in2, 1);
1200 1804 : PG_RETURN_FLOAT4(res);
1201 : }
1202 :
1203 : Datum
1204 1764 : strict_word_similarity(PG_FUNCTION_ARGS)
1205 : {
1206 1764 : text *in1 = PG_GETARG_TEXT_PP(0);
1207 1764 : text *in2 = PG_GETARG_TEXT_PP(1);
1208 : float4 res;
1209 :
1210 3528 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1211 3528 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1212 : WORD_SIMILARITY_STRICT);
1213 :
1214 1764 : PG_FREE_IF_COPY(in1, 0);
1215 1764 : PG_FREE_IF_COPY(in2, 1);
1216 1764 : PG_RETURN_FLOAT4(res);
1217 : }
1218 :
1219 : Datum
1220 2008 : similarity_dist(PG_FUNCTION_ARGS)
1221 : {
1222 2008 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1223 : PG_GETARG_DATUM(0),
1224 : PG_GETARG_DATUM(1)));
1225 :
1226 2008 : PG_RETURN_FLOAT4(1.0 - res);
1227 : }
1228 :
1229 : Datum
1230 12000 : similarity_op(PG_FUNCTION_ARGS)
1231 : {
1232 12000 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1233 : PG_GETARG_DATUM(0),
1234 : PG_GETARG_DATUM(1)));
1235 :
1236 12000 : PG_RETURN_BOOL(res >= similarity_threshold);
1237 : }
1238 :
1239 : Datum
1240 3848 : word_similarity_op(PG_FUNCTION_ARGS)
1241 : {
1242 3848 : text *in1 = PG_GETARG_TEXT_PP(0);
1243 3848 : text *in2 = PG_GETARG_TEXT_PP(1);
1244 : float4 res;
1245 :
1246 7696 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1247 7696 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1248 : WORD_SIMILARITY_CHECK_ONLY);
1249 :
1250 3848 : PG_FREE_IF_COPY(in1, 0);
1251 3848 : PG_FREE_IF_COPY(in2, 1);
1252 3848 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1253 : }
1254 :
1255 : Datum
1256 3848 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
1257 : {
1258 3848 : text *in1 = PG_GETARG_TEXT_PP(0);
1259 3848 : text *in2 = PG_GETARG_TEXT_PP(1);
1260 : float4 res;
1261 :
1262 7696 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1263 7696 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1264 : WORD_SIMILARITY_CHECK_ONLY);
1265 :
1266 3848 : PG_FREE_IF_COPY(in1, 0);
1267 3848 : PG_FREE_IF_COPY(in2, 1);
1268 3848 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1269 : }
1270 :
1271 : Datum
1272 0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
1273 : {
1274 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1275 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1276 : float4 res;
1277 :
1278 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1279 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1280 : 0);
1281 :
1282 0 : PG_FREE_IF_COPY(in1, 0);
1283 0 : PG_FREE_IF_COPY(in2, 1);
1284 0 : PG_RETURN_FLOAT4(1.0 - res);
1285 : }
1286 :
1287 : Datum
1288 1428 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1289 : {
1290 1428 : text *in1 = PG_GETARG_TEXT_PP(0);
1291 1428 : text *in2 = PG_GETARG_TEXT_PP(1);
1292 : float4 res;
1293 :
1294 2856 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1295 2856 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1296 : 0);
1297 :
1298 1428 : PG_FREE_IF_COPY(in1, 0);
1299 1428 : PG_FREE_IF_COPY(in2, 1);
1300 1428 : PG_RETURN_FLOAT4(1.0 - res);
1301 : }
1302 :
1303 : Datum
1304 5060 : strict_word_similarity_op(PG_FUNCTION_ARGS)
1305 : {
1306 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1307 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1308 : float4 res;
1309 :
1310 10120 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1311 10120 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1312 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1313 :
1314 5060 : PG_FREE_IF_COPY(in1, 0);
1315 5060 : PG_FREE_IF_COPY(in2, 1);
1316 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1317 : }
1318 :
1319 : Datum
1320 5060 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1321 : {
1322 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1323 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1324 : float4 res;
1325 :
1326 10120 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1327 10120 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1328 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1329 :
1330 5060 : PG_FREE_IF_COPY(in1, 0);
1331 5060 : PG_FREE_IF_COPY(in2, 1);
1332 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1333 : }
1334 :
1335 : Datum
1336 0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1337 : {
1338 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1339 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1340 : float4 res;
1341 :
1342 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1343 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1344 : WORD_SIMILARITY_STRICT);
1345 :
1346 0 : PG_FREE_IF_COPY(in1, 0);
1347 0 : PG_FREE_IF_COPY(in2, 1);
1348 0 : PG_RETURN_FLOAT4(1.0 - res);
1349 : }
1350 :
1351 : Datum
1352 1440 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1353 : {
1354 1440 : text *in1 = PG_GETARG_TEXT_PP(0);
1355 1440 : text *in2 = PG_GETARG_TEXT_PP(1);
1356 : float4 res;
1357 :
1358 2880 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1359 2880 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1360 : WORD_SIMILARITY_STRICT);
1361 :
1362 1440 : PG_FREE_IF_COPY(in1, 0);
1363 1440 : PG_FREE_IF_COPY(in2, 1);
1364 1440 : PG_RETURN_FLOAT4(1.0 - res);
1365 : }
|