Line data Source code
1 : /*
2 : * contrib/pg_trgm/trgm_op.c
3 : */
4 : #include "postgres.h"
5 :
6 : #include <ctype.h>
7 :
8 : #include "catalog/pg_collation_d.h"
9 : #include "catalog/pg_type.h"
10 : #include "common/int.h"
11 : #include "lib/qunique.h"
12 : #include "miscadmin.h"
13 : #include "trgm.h"
14 : #include "tsearch/ts_locale.h"
15 : #include "utils/formatting.h"
16 : #include "utils/guc.h"
17 : #include "utils/lsyscache.h"
18 : #include "utils/memutils.h"
19 : #include "utils/pg_crc.h"
20 :
21 6 : PG_MODULE_MAGIC_EXT(
22 : .name = "pg_trgm",
23 : .version = PG_VERSION
24 : );
25 :
26 : /* GUC variables */
27 : double similarity_threshold = 0.3f;
28 : double word_similarity_threshold = 0.6f;
29 : double strict_word_similarity_threshold = 0.5f;
30 :
31 4 : PG_FUNCTION_INFO_V1(set_limit);
32 4 : PG_FUNCTION_INFO_V1(show_limit);
33 4 : PG_FUNCTION_INFO_V1(show_trgm);
34 4 : PG_FUNCTION_INFO_V1(similarity);
35 4 : PG_FUNCTION_INFO_V1(word_similarity);
36 4 : PG_FUNCTION_INFO_V1(strict_word_similarity);
37 4 : PG_FUNCTION_INFO_V1(similarity_dist);
38 4 : PG_FUNCTION_INFO_V1(similarity_op);
39 4 : PG_FUNCTION_INFO_V1(word_similarity_op);
40 4 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
41 2 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
42 4 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
43 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
44 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
45 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
46 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
47 :
48 : static int CMPTRGM_CHOOSE(const void *a, const void *b);
49 : int (*CMPTRGM) (const void *a, const void *b) = CMPTRGM_CHOOSE;
50 :
51 : /* Trigram with position; element type of the array built by make_positional_trgm() */
52 : typedef struct
53 : {
54 : trgm trg; /* the trigram itself */
55 : int index; /* position in the text trigram array, or -1 for a search-pattern trigram */
56 : } pos_trgm;
57 :
58 : /* Trigram bound type */
59 : typedef uint8 TrgmBound;
60 : #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
61 : #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
62 :
63 : /* Word similarity flags */
64 : #define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar
65 : * search pattern in text */
66 : #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
67 : * word bounds */
68 :
69 : /*
70 : * Module load callback
71 : */
72 : void
73 6 : _PG_init(void)
74 : {
75 : /* Define custom GUC variables. */
76 6 : DefineCustomRealVariable("pg_trgm.similarity_threshold",
77 : "Sets the threshold used by the % operator.",
78 : "Valid range is 0.0 .. 1.0.",
79 : &similarity_threshold,
80 : 0.3f,
81 : 0.0,
82 : 1.0,
83 : PGC_USERSET,
84 : 0,
85 : NULL,
86 : NULL,
87 : NULL);
88 6 : DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
89 : "Sets the threshold used by the <% operator.",
90 : "Valid range is 0.0 .. 1.0.",
91 : &word_similarity_threshold,
92 : 0.6f,
93 : 0.0,
94 : 1.0,
95 : PGC_USERSET,
96 : 0,
97 : NULL,
98 : NULL,
99 : NULL);
100 6 : DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
101 : "Sets the threshold used by the <<% operator.",
102 : "Valid range is 0.0 .. 1.0.",
103 : &strict_word_similarity_threshold,
104 : 0.5f,
105 : 0.0,
106 : 1.0,
107 : PGC_USERSET,
108 : 0,
109 : NULL,
110 : NULL,
111 : NULL);
112 :
113 6 : MarkGUCPrefixReserved("pg_trgm");
114 6 : }
115 :
116 : #define CMPCHAR(a,b) ( ((a)==(b)) ? 0 : ( ((a)<(b)) ? -1 : 1 ) )
117 :
/*
 * Compare two trgms byte by byte, treating each of the three bytes as
 * "signed char".  Returns -1, 0 or 1; the first differing byte decides.
 */
static inline int
CMPTRGM_SIGNED(const void *a, const void *b)
{
	const signed char *lhs = (const signed char *) a;
	const signed char *rhs = (const signed char *) b;
	int			pos;

	for (pos = 0; pos < 3; pos++)
	{
		if (lhs[pos] != rhs[pos])
			return (lhs[pos] < rhs[pos]) ? -1 : 1;
	}

	return 0;
}
131 :
/*
 * Compare two trgms byte by byte, treating each of the three bytes as
 * "unsigned char".  Returns -1, 0 or 1; the first differing byte decides.
 */
static inline int
CMPTRGM_UNSIGNED(const void *a, const void *b)
{
	const unsigned char *lhs = (const unsigned char *) a;
	const unsigned char *rhs = (const unsigned char *) b;
	int			pos;

	for (pos = 0; pos < 3; pos++)
	{
		if (lhs[pos] != rhs[pos])
			return (lhs[pos] < rhs[pos]) ? -1 : 1;
	}

	return 0;
}
141 :
142 : /*
143 : * This gets called on the first call. It replaces the function pointer so
144 : * that subsequent calls are routed directly to the chosen implementation.
145 : */
146 : static int
147 6 : CMPTRGM_CHOOSE(const void *a, const void *b)
148 : {
149 6 : if (GetDefaultCharSignedness())
150 6 : CMPTRGM = CMPTRGM_SIGNED;
151 : else
152 0 : CMPTRGM = CMPTRGM_UNSIGNED;
153 :
154 6 : return CMPTRGM(a, b);
155 : }
156 :
157 : /*
158 : * Deprecated function.
159 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
160 : */
161 : Datum
162 4 : set_limit(PG_FUNCTION_ARGS)
163 : {
164 4 : float4 nlimit = PG_GETARG_FLOAT4(0);
165 : char *nlimit_str;
166 : Oid func_out_oid;
167 : bool is_varlena;
168 :
169 4 : getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
170 :
171 4 : nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
172 :
173 4 : SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
174 : PGC_USERSET, PGC_S_SESSION);
175 :
176 4 : PG_RETURN_FLOAT4(similarity_threshold);
177 : }
178 :
179 :
180 : /*
181 : * Get similarity threshold for given index scan strategy number.
182 : */
183 : double
184 86954 : index_strategy_get_limit(StrategyNumber strategy)
185 : {
186 86954 : switch (strategy)
187 : {
188 65354 : case SimilarityStrategyNumber:
189 65354 : return similarity_threshold;
190 9636 : case WordSimilarityStrategyNumber:
191 9636 : return word_similarity_threshold;
192 11964 : case StrictWordSimilarityStrategyNumber:
193 11964 : return strict_word_similarity_threshold;
194 0 : default:
195 0 : elog(ERROR, "unrecognized strategy number: %d", strategy);
196 : break;
197 : }
198 :
199 : return 0.0; /* keep compiler quiet */
200 : }
201 :
202 : /*
203 : * Deprecated function.
204 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
205 : */
206 : Datum
207 40000 : show_limit(PG_FUNCTION_ARGS)
208 : {
209 40000 : PG_RETURN_FLOAT4(similarity_threshold);
210 : }
211 :
212 : static int
213 6374440 : comp_trgm(const void *a, const void *b)
214 : {
215 6374440 : return CMPTRGM(a, b);
216 : }
217 :
/*
 * Locate the first word (run of ISWORDCHR characters) within the first
 * lenstr bytes of str.
 *
 * On success, returns a pointer to the start of the word, sets *endword to
 * the byte just past it, and sets *charlen to the number of characters in it.
 * Returns NULL, leaving the outputs untouched, if there is no word.
 */
static char *
find_word(char *str, int lenstr, char **endword, int *charlen)
{
	char	   *cursor = str;
	char	   *word_start;
	int			nchars = 0;

	/* Skip anything that is not a word character. */
	while (cursor - str < lenstr && !ISWORDCHR(cursor))
		cursor += pg_mblen(cursor);

	if (cursor - str >= lenstr)
		return NULL;

	/* Walk over the word itself, one (possibly multibyte) character at a time. */
	word_start = cursor;
	while (cursor - str < lenstr && ISWORDCHR(cursor))
	{
		cursor += pg_mblen(cursor);
		nchars++;
	}

	*endword = cursor;
	*charlen = nchars;

	return word_start;
}
243 :
244 : /*
245 : * Reduce a trigram (three possibly multi-byte characters) to a trgm,
246 : * which is always exactly three bytes. If we have three single-byte
247 : * characters, we just use them as-is; otherwise we form a hash value.
248 : */
249 : void
250 2918 : compact_trigram(trgm *tptr, char *str, int bytelen)
251 : {
252 2918 : if (bytelen == 3)
253 : {
254 2918 : CPTRGM(tptr, str);
255 : }
256 : else
257 : {
258 : pg_crc32 crc;
259 :
260 0 : INIT_LEGACY_CRC32(crc);
261 0 : COMP_LEGACY_CRC32(crc, str, bytelen);
262 0 : FIN_LEGACY_CRC32(crc);
263 :
264 : /*
265 : * use only 3 upper bytes from crc, hope, it's good enough hashing
266 : */
267 0 : CPTRGM(tptr, &crc);
268 : }
269 2918 : }
270 :
271 : /*
272 : * Adds trigrams from words (already padded).
273 : */
274 : static trgm *
275 254788 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
276 : {
277 254788 : char *ptr = str;
278 :
279 254788 : if (charlen < 3)
280 54 : return tptr;
281 :
282 254734 : if (bytelen > charlen)
283 : {
284 : /* Find multibyte character boundaries and apply compact_trigram */
285 0 : int lenfirst = pg_mblen(str),
286 0 : lenmiddle = pg_mblen(str + lenfirst),
287 0 : lenlast = pg_mblen(str + lenfirst + lenmiddle);
288 :
289 0 : while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
290 : {
291 0 : compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
292 :
293 0 : ptr += lenfirst;
294 0 : tptr++;
295 :
296 0 : lenfirst = lenmiddle;
297 0 : lenmiddle = lenlast;
298 0 : lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
299 : }
300 : }
301 : else
302 : {
303 : /* Fast path when there are no multibyte characters */
304 : Assert(bytelen == charlen);
305 :
306 2454266 : while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
307 : {
308 2199532 : CPTRGM(tptr, ptr);
309 2199532 : ptr++;
310 2199532 : tptr++;
311 : }
312 : }
313 :
314 254734 : return tptr;
315 : }
316 :
317 : /*
318 : * Make array of trigrams without sorting and removing duplicate items.
 *
 * Each word found by find_word() is copied into a scratch buffer between
 * blank padding (lower-cased first when IGNORECASE is defined) and handed
 * to make_trigrams(), which appends to the output array.
319 : *
320 : * trg: where to return the array of trigrams. The caller allocates it;
 *      callers in this file size it as (slen / 2 + 1) * 3 entries.
321 : * str: source string, of length slen bytes.
322 : * bounds: where to return bounds of trigrams (if needed), or NULL. Flags are
 *      OR-ed into the entries, so the caller must pass a zeroed array.
323 : *
324 : * Returns length of the generated array.
325 : */
326 : static int
327 228162 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
328 : {
329 : trgm *tptr;
330 : char *buf;
331 : int charlen,
332 : bytelen;
333 : char *bword,
334 : *eword;
335 :
336 228162 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
337 2 : return 0;
338 :
339 228160 : tptr = trg;
340 :
341 : /* Allocate a buffer for case-folded, blank-padded words */
342 228160 : buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
343 :
344 : if (LPADDING > 0)
345 : {
346 228160 : *buf = ' ';
347 : if (LPADDING > 1)
348 228160 : *(buf + 1) = ' ';
349 : }
350 :
351 228160 : eword = str; /* find_word() advances eword past each word it returns */
352 482820 : while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
353 : {
354 : #ifdef IGNORECASE
355 254660 : bword = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID); /* separately allocated copy, freed below */
356 254660 : bytelen = strlen(bword); /* re-measured rather than assumed equal to eword - bword */
357 : #else
358 : bytelen = eword - bword;
359 : #endif
360 :
361 254660 : memcpy(buf + LPADDING, bword, bytelen);
362 :
363 : #ifdef IGNORECASE
364 254660 : pfree(bword);
365 : #endif
366 :
367 254660 : buf[LPADDING + bytelen] = ' '; /* two trailing blanks are always stored ... */
368 254660 : buf[LPADDING + bytelen + 1] = ' '; /* ... but only RPADDING of them are counted below */
369 :
370 : /* Calculate trigrams marking their bounds if needed */
 /*
  * NOTE(review): charlen was counted by find_word() on the original word,
  * before any case folding; confirm folding cannot change the character
  * count, since make_trigrams() uses it to pick its code path.
  */
371 254660 : if (bounds)
372 24796 : bounds[tptr - trg] |= TRGM_BOUND_LEFT; /* first trigram of this word */
373 254660 : tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
374 : charlen + LPADDING + RPADDING);
375 254660 : if (bounds)
376 24796 : bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT; /* last trigram of this word */
377 : }
378 :
379 228160 : pfree(buf);
380 :
381 228160 : return tptr - trg;
382 : }
383 :
384 : /*
385 : * Guard against possible overflow in the palloc requests below. (We
386 : * don't worry about the additive constants, since palloc can detect
387 : * requests that are a little above MaxAllocSize --- we just need to
388 : * prevent integer overflow in the multiplications.)
389 : */
390 : static void
391 204020 : protect_out_of_mem(int slen)
392 : {
393 204020 : if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
394 204020 : (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
395 0 : ereport(ERROR,
396 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
397 : errmsg("out of memory")));
398 204020 : }
399 :
400 : /*
401 : * Make array of trigrams with sorting and removing duplicate items.
402 : *
403 : * str: source string, of length slen bytes.
404 : *
405 : * Returns the sorted array of unique trigrams.
406 : */
407 : TRGM *
408 179658 : generate_trgm(char *str, int slen)
409 : {
410 : TRGM *trg;
411 : int len;
412 :
413 179658 : protect_out_of_mem(slen);
414 :
415 179658 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
416 179658 : trg->flag = ARRKEY;
417 :
418 179658 : len = generate_trgm_only(GETARR(trg), str, slen, NULL);
419 179658 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
420 :
421 179658 : if (len == 0)
422 8 : return trg;
423 :
424 : /*
425 : * Make trigrams unique.
426 : */
427 179650 : if (len > 1)
428 : {
429 179650 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
430 179650 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
431 : }
432 :
433 179650 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
434 :
435 179650 : return trg;
436 : }
437 :
438 : /*
439 : * Make array of positional trigrams from two trigram arrays trg1 and trg2.
440 : *
441 : * trg1: trigram array of search pattern, of length len1. trg1 is required
442 : * word which positions don't matter and replaced with -1.
443 : * trg2: trigram array of text, of length len2. trg2 is haystack where we
444 : * search and have to store its positions.
445 : *
446 : * Returns concatenated trigram array.
447 : */
448 : static pos_trgm *
449 24252 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
450 : {
451 : pos_trgm *result;
452 : int i,
453 24252 : len = len1 + len2;
454 :
455 24252 : result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
456 :
457 241728 : for (i = 0; i < len1; i++)
458 : {
459 217476 : memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
460 217476 : result[i].index = -1;
461 : }
462 :
463 384426 : for (i = 0; i < len2; i++)
464 : {
465 360174 : memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
466 360174 : result[i + len1].index = i;
467 : }
468 :
469 24252 : return result;
470 : }
471 :
472 : /*
473 : * Compare position trigrams: compare trigrams first and position second.
474 : */
475 : static int
476 2615418 : comp_ptrgm(const void *v1, const void *v2)
477 : {
478 2615418 : const pos_trgm *p1 = (const pos_trgm *) v1;
479 2615418 : const pos_trgm *p2 = (const pos_trgm *) v2;
480 : int cmp;
481 :
482 2615418 : cmp = CMPTRGM(p1->trg, p2->trg);
483 2615418 : if (cmp != 0)
484 2536014 : return cmp;
485 :
486 79404 : return pg_cmp_s32(p1->index, p2->index);
487 : }
488 :
489 : /*
490 : * Iterative search function which calculates maximum similarity with word in
491 : * the string. The maximum similarity is calculated only if the flag
492 : * WORD_SIMILARITY_CHECK_ONLY isn't set; with that flag the search stops as
 * soon as the selected threshold is reached.
 *
 * The outer loop walks the text trigrams in order and maintains a window
 * [lower, upper] of positions; ulen2 counts the distinct trigrams currently
 * in the window and count those of them that are also marked in "found".
 * Whenever the upper bound is moved, the inner loop tries shrinking the
 * window from the left to see whether that gives a higher similarity.
493 : *
494 : * trg2indexes: array which stores indexes of the array "found".
495 : * found: array which stores true or false values.
496 : * ulen1: count of unique trigrams of array "trg1".
497 : * len2: length of array "trg2" and array "trg2indexes".
498 : * len: length of the array "found".
499 : * flags: set of boolean flags parameterizing similarity calculation.
500 : * bounds: whether each trigram is left/right bound of word. Must be non-NULL
 *      when WORD_SIMILARITY_STRICT is set (asserted below).
501 : *
502 : * Returns word similarity.
503 : */
504 : static float4
505 24252 : iterate_word_similarity(int *trg2indexes,
506 : bool *found,
507 : int ulen1,
508 : int len2,
509 : int len,
510 : uint8 flags,
511 : TrgmBound *bounds)
512 : {
513 : int *lastpos,
514 : i,
515 24252 : ulen2 = 0,
516 24252 : count = 0,
517 24252 : upper = -1,
518 : lower;
519 : float4 smlr_cur,
520 24252 : smlr_max = 0.0f;
521 : double threshold;
522 :
523 : Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
524 :
525 : /* Select appropriate threshold */
526 48504 : threshold = (flags & WORD_SIMILARITY_STRICT) ?
527 24252 : strict_word_similarity_threshold :
528 : word_similarity_threshold;
529 :
530 : /*
531 : * Consider first trigram as initial lower bound for strict word
532 : * similarity, or initialize it later with first trigram present for plain
533 : * word similarity.
534 : */
535 24252 : lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
536 :
537 : /* Memorise last position of each trigram */
538 24252 : lastpos = (int *) palloc(sizeof(int) * len);
539 24252 : memset(lastpos, -1, sizeof(int) * len); /* every slot starts negative, meaning "not in window" */
540 :
541 367286 : for (i = 0; i < len2; i++)
542 : {
543 : int trgindex;
544 :
545 346602 : CHECK_FOR_INTERRUPTS();
546 :
547 : /* Get index of next trigram */
548 346602 : trgindex = trg2indexes[i];
549 :
550 : /* Update last position of this trigram */
551 346602 : if (lower >= 0 || found[trgindex])
552 : {
553 271586 : if (lastpos[trgindex] < 0)
554 : {
555 267882 : ulen2++; /* first occurrence inside the current window */
556 267882 : if (found[trgindex])
557 61512 : count++;
558 : }
559 271586 : lastpos[trgindex] = i;
560 : }
561 :
562 : /*
563 : * Adjust upper bound if trigram is upper bound of word for strict
564 : * word similarity, or if trigram is present in required substring for
565 : * plain word similarity
566 : */
567 500688 : if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
568 154086 : : found[trgindex])
569 : {
570 : int prev_lower,
571 : tmp_ulen2,
572 : tmp_lower,
573 : tmp_count;
574 :
575 51270 : upper = i;
576 51270 : if (lower == -1)
577 : {
578 9390 : lower = i; /* plain mode: window opens at the first matching trigram */
579 9390 : ulen2 = 1;
580 : }
581 :
582 51270 : smlr_cur = CALCSML(count, ulen1, ulen2);
583 :
584 : /* Also try to adjust lower bound for greater similarity */
585 51270 : tmp_count = count;
586 51270 : tmp_ulen2 = ulen2;
587 51270 : prev_lower = lower; /* remembered so dropped positions can be cleared from lastpos below */
588 417186 : for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
589 : {
590 : float smlr_tmp;
591 : int tmp_trgindex;
592 :
593 : /*
594 : * Adjust lower bound only if trigram is lower bound of word
595 : * for strict word similarity, or consider every trigram as
596 : * lower bound for plain word similarity.
597 : */
598 369484 : if (!(flags & WORD_SIMILARITY_STRICT)
599 290354 : || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
600 : {
601 119394 : smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
602 119394 : if (smlr_tmp > smlr_cur)
603 : {
604 7022 : smlr_cur = smlr_tmp;
605 7022 : ulen2 = tmp_ulen2;
606 7022 : lower = tmp_lower;
607 7022 : count = tmp_count;
608 : }
609 :
610 : /*
611 : * If we only check that word similarity is greater than
612 : * threshold we do not need to calculate a maximum
613 : * similarity.
614 : */
615 119394 : if ((flags & WORD_SIMILARITY_CHECK_ONLY)
616 74228 : && smlr_cur >= threshold)
617 3568 : break;
618 : }
619 :
 /*
  * Account for dropping position tmp_lower from the candidate window:
  * it only reduces the counters if it is the last occurrence of its
  * trigram seen so far.
  */
620 365916 : tmp_trgindex = trg2indexes[tmp_lower];
621 365916 : if (lastpos[tmp_trgindex] == tmp_lower)
622 : {
623 361398 : tmp_ulen2--;
624 361398 : if (found[tmp_trgindex])
625 93160 : tmp_count--;
626 : }
627 : }
628 :
629 51270 : smlr_max = Max(smlr_max, smlr_cur);
630 :
631 : /*
632 : * if we only check that word similarity is greater than threshold
633 : * we do not need to calculate a maximum similarity.
634 : */
635 51270 : if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
636 3568 : break;
637 :
 /* Forget positions that fell out of the window when lower advanced */
638 81198 : for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
639 : {
640 : int tmp_trgindex;
641 :
642 33496 : tmp_trgindex = trg2indexes[tmp_lower];
643 33496 : if (lastpos[tmp_trgindex] == tmp_lower)
644 32000 : lastpos[tmp_trgindex] = -1;
645 : }
646 : }
647 : }
648 :
649 24252 : pfree(lastpos);
650 :
651 24252 : return smlr_max;
652 : }
653 :
654 : /*
655 : * Calculate word similarity.
656 : * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
657 : * are used to calculate word similarity using iterate_word_similarity().
658 : *
659 : * "trg2indexes" is array which stores indexes of the array "found".
660 : * In other words:
661 : * trg2indexes[j] = i;
662 : * found[i] = true (or false);
663 : * If found[i] == true then there is trigram trg2[j] in array "trg1".
664 : * If found[i] == false then there is not trigram trg2[j] in array "trg1".
665 : *
666 : * str1: search pattern string, of length slen1 bytes.
667 : * str2: text in which we are looking for a word, of length slen2 bytes.
668 : * flags: set of boolean flags parameterizing similarity calculation.
669 : *
670 : * Returns word similarity.
671 : */
672 : static float4
673 24252 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
674 : uint8 flags)
675 : {
676 : bool *found;
677 : pos_trgm *ptrg;
678 : trgm *trg1;
679 : trgm *trg2;
680 : int len1,
681 : len2,
682 : len,
683 : i,
684 : j,
685 : ulen1;
686 : int *trg2indexes;
687 : float4 result;
688 : TrgmBound *bounds;
689 :
690 24252 : protect_out_of_mem(slen1 + slen2);
691 :
692 : /* Make positional trigrams */
693 24252 : trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
694 24252 : trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
695 24252 : if (flags & WORD_SIMILARITY_STRICT)
696 13324 : bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
697 : else
698 10928 : bounds = NULL;
699 :
700 24252 : len1 = generate_trgm_only(trg1, str1, slen1, NULL);
701 24252 : len2 = generate_trgm_only(trg2, str2, slen2, bounds);
702 :
703 24252 : ptrg = make_positional_trgm(trg1, len1, trg2, len2);
704 24252 : len = len1 + len2;
705 24252 : qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
706 :
707 24252 : pfree(trg1);
708 24252 : pfree(trg2);
709 :
710 : /*
711 : * Merge positional trigrams array: enumerate each trigram and find its
712 : * presence in required word.
713 : */
714 24252 : trg2indexes = (int *) palloc(sizeof(int) * len2);
715 24252 : found = (bool *) palloc0(sizeof(bool) * len);
716 :
717 24252 : ulen1 = 0;
718 24252 : j = 0;
719 601902 : for (i = 0; i < len; i++)
720 : {
721 577650 : if (i > 0)
722 : {
723 553398 : int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
724 :
725 553398 : if (cmp != 0)
726 : {
727 484998 : if (found[j])
728 202276 : ulen1++;
729 484998 : j++;
730 : }
731 : }
732 :
733 577650 : if (ptrg[i].index >= 0)
734 : {
735 360174 : trg2indexes[ptrg[i].index] = j;
736 : }
737 : else
738 : {
739 217476 : found[j] = true;
740 : }
741 : }
742 24252 : if (found[j])
743 15200 : ulen1++;
744 :
745 : /* Run iterative procedure to find maximum similarity with word */
746 24252 : result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
747 : flags, bounds);
748 :
749 24252 : pfree(trg2indexes);
750 24252 : pfree(found);
751 24252 : pfree(ptrg);
752 :
753 24252 : return result;
754 : }
755 :
756 :
757 : /*
758 : * Extract the next non-wildcard part of a search string, i.e. a word bounded
759 : * by '_' or '%' meta-characters, non-word characters or string end.
760 : *
761 : * str: source string, of length lenstr bytes (need not be null-terminated)
762 : * buf: where to return the substring (must be long enough)
763 : * *bytelen: receives byte length of the found substring
764 : * *charlen: receives character length of the found substring
765 : *
766 : * Returns pointer to end+1 of the found substring in the source string.
767 : * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
768 : *
769 : * If the found word is bounded by non-word characters or string boundaries
770 : * then this function will include corresponding padding spaces into buf.
 * The padding spaces are included in both *bytelen and *charlen. Escape
 * characters are not copied into buf.
771 : */
772 : static const char *
773 238 : get_wildcard_part(const char *str, int lenstr,
774 : char *buf, int *bytelen, int *charlen)
775 : {
776 238 : const char *beginword = str;
777 : const char *endword;
778 238 : char *s = buf; /* write cursor into buf */
779 238 : bool in_leading_wildcard_meta = false;
780 238 : bool in_trailing_wildcard_meta = false;
781 238 : bool in_escape = false; /* previous character was an unescaped escape char */
782 : int clen;
783 :
784 : /*
785 : * Find the first word character, remembering whether preceding character
786 : * was wildcard meta-character. Note that the in_escape state persists
787 : * from this loop to the next one, since we may exit at a word character
788 : * that is in_escape.
789 : */
790 482 : while (beginword - str < lenstr)
791 : {
792 372 : if (in_escape)
793 : {
794 6 : if (ISWORDCHR(beginword))
795 6 : break;
796 0 : in_escape = false; /* escaped non-word character: skip it like any other separator */
797 0 : in_leading_wildcard_meta = false;
798 : }
799 : else
800 : {
801 366 : if (ISESCAPECHAR(beginword))
802 6 : in_escape = true;
803 360 : else if (ISWILDCARDCHAR(beginword))
804 208 : in_leading_wildcard_meta = true;
805 152 : else if (ISWORDCHR(beginword))
806 122 : break;
807 : else
808 30 : in_leading_wildcard_meta = false;
809 : }
810 244 : beginword += pg_mblen(beginword);
811 : }
812 :
813 : /*
814 : * Handle string end.
815 : */
816 238 : if (beginword - str >= lenstr)
817 110 : return NULL;
818 :
819 : /*
820 : * Add left padding spaces if preceding character wasn't wildcard
821 : * meta-character.
822 : */
823 128 : *charlen = 0;
824 128 : if (!in_leading_wildcard_meta)
825 : {
826 : if (LPADDING > 0)
827 : {
828 30 : *s++ = ' ';
829 30 : (*charlen)++;
830 : if (LPADDING > 1)
831 : {
832 30 : *s++ = ' ';
833 30 : (*charlen)++;
834 : }
835 : }
836 : }
837 :
838 : /*
839 : * Copy data into buf until wildcard meta-character, non-word character or
840 : * string boundary. Strip escapes during copy.
841 : */
842 128 : endword = beginword;
843 488 : while (endword - str < lenstr)
844 : {
845 488 : clen = pg_mblen(endword);
846 488 : if (in_escape)
847 : {
848 6 : if (ISWORDCHR(endword))
849 : {
850 6 : memcpy(s, endword, clen); /* escaped word character is taken literally */
851 6 : (*charlen)++;
852 6 : s += clen;
853 : }
854 : else
855 : {
856 : /*
857 : * Back up endword to the escape character when stopping at an
858 : * escaped char, so that subsequent get_wildcard_part will
859 : * restart from the escape character. We assume here that
860 : * escape chars are single-byte.
861 : */
862 0 : endword--;
863 0 : break;
864 : }
865 6 : in_escape = false;
866 : }
867 : else
868 : {
869 482 : if (ISESCAPECHAR(endword))
870 0 : in_escape = true; /* the escape char itself is not copied */
871 482 : else if (ISWILDCARDCHAR(endword))
872 : {
873 110 : in_trailing_wildcard_meta = true;
874 110 : break;
875 : }
876 372 : else if (ISWORDCHR(endword))
877 : {
878 354 : memcpy(s, endword, clen);
879 354 : (*charlen)++;
880 354 : s += clen;
881 : }
882 : else
883 18 : break;
884 : }
885 360 : endword += clen;
886 : }
887 :
888 : /*
889 : * Add right padding spaces if next character isn't wildcard
890 : * meta-character.
891 : */
892 128 : if (!in_trailing_wildcard_meta)
893 : {
894 : if (RPADDING > 0)
895 : {
896 18 : *s++ = ' ';
897 18 : (*charlen)++;
898 : if (RPADDING > 1)
899 : {
900 : *s++ = ' ';
901 : (*charlen)++;
902 : }
903 : }
904 : }
905 :
906 128 : *bytelen = s - buf;
907 128 : return endword;
908 : }
909 :
910 : /*
911 : * Generates trigrams for wildcard search string.
912 : *
913 : * Returns array of trigrams that must occur in any string that matches the
914 : * wildcard string. For example, given pattern "a%bcd%" the trigrams
915 : * " a", "bcd" would be extracted.
916 : */
917 : TRGM *
918 110 : generate_wildcard_trgm(const char *str, int slen)
919 : {
920 : TRGM *trg;
921 : char *buf,
922 : *buf2;
923 : trgm *tptr;
924 : int len,
925 : charlen,
926 : bytelen;
927 : const char *eword;
928 :
929 110 : protect_out_of_mem(slen);
930 :
931 110 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
932 110 : trg->flag = ARRKEY;
933 110 : SET_VARSIZE(trg, TRGMHDRSIZE);
934 :
935 110 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
936 0 : return trg;
937 :
938 110 : tptr = GETARR(trg);
939 :
940 : /* Allocate a buffer for blank-padded, but not yet case-folded, words */
941 110 : buf = palloc(sizeof(char) * (slen + 4));
942 :
943 : /*
944 : * Extract trigrams from each substring extracted by get_wildcard_part.
945 : */
946 110 : eword = str;
947 238 : while ((eword = get_wildcard_part(eword, slen - (eword - str),
948 : buf, &bytelen, &charlen)) != NULL)
949 : {
950 : #ifdef IGNORECASE
951 128 : buf2 = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
952 128 : bytelen = strlen(buf2);
953 : #else
954 : buf2 = buf;
955 : #endif
956 :
957 : /*
958 : * count trigrams
959 : */
960 128 : tptr = make_trigrams(tptr, buf2, bytelen, charlen);
961 :
962 : #ifdef IGNORECASE
963 128 : pfree(buf2);
964 : #endif
965 : }
966 :
967 110 : pfree(buf);
968 :
969 110 : if ((len = tptr - GETARR(trg)) == 0)
970 48 : return trg;
971 :
972 : /*
973 : * Make trigrams unique.
974 : */
975 62 : if (len > 1)
976 : {
977 34 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
978 34 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
979 : }
980 :
981 62 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
982 :
983 62 : return trg;
984 : }
985 :
986 : uint32
987 69546 : trgm2int(trgm *ptr)
988 : {
989 69546 : uint32 val = 0;
990 :
991 69546 : val |= *(((unsigned char *) ptr));
992 69546 : val <<= 8;
993 69546 : val |= *(((unsigned char *) ptr) + 1);
994 69546 : val <<= 8;
995 69546 : val |= *(((unsigned char *) ptr) + 2);
996 :
997 69546 : return val;
998 : }
999 :
1000 : Datum
1001 14 : show_trgm(PG_FUNCTION_ARGS)
1002 : {
1003 14 : text *in = PG_GETARG_TEXT_PP(0);
1004 : TRGM *trg;
1005 : Datum *d;
1006 : ArrayType *a;
1007 : trgm *ptr;
1008 : int i;
1009 :
1010 14 : trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
1011 14 : d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
1012 :
1013 88 : for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
1014 : {
1015 74 : text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
1016 :
1017 74 : if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
1018 : {
1019 0 : snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
1020 0 : SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
1021 : }
1022 : else
1023 : {
1024 74 : SET_VARSIZE(item, VARHDRSZ + 3);
1025 74 : CPTRGM(VARDATA(item), ptr);
1026 : }
1027 74 : d[i] = PointerGetDatum(item);
1028 : }
1029 :
1030 14 : a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
1031 :
1032 88 : for (i = 0; i < ARRNELEM(trg); i++)
1033 74 : pfree(DatumGetPointer(d[i]));
1034 :
1035 14 : pfree(d);
1036 14 : pfree(trg);
1037 14 : PG_FREE_IF_COPY(in, 0);
1038 :
1039 14 : PG_RETURN_POINTER(a);
1040 : }
1041 :
1042 : float4
1043 138146 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
1044 : {
1045 : trgm *ptr1,
1046 : *ptr2;
1047 138146 : int count = 0;
1048 : int len1,
1049 : len2;
1050 :
1051 138146 : ptr1 = GETARR(trg1);
1052 138146 : ptr2 = GETARR(trg2);
1053 :
1054 138146 : len1 = ARRNELEM(trg1);
1055 138146 : len2 = ARRNELEM(trg2);
1056 :
1057 : /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1058 138146 : if (len1 <= 0 || len2 <= 0)
1059 2 : return (float4) 0.0;
1060 :
1061 1760040 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1062 : {
1063 1621896 : int res = CMPTRGM(ptr1, ptr2);
1064 :
1065 1621896 : if (res < 0)
1066 368810 : ptr1++;
1067 1253086 : else if (res > 0)
1068 430232 : ptr2++;
1069 : else
1070 : {
1071 822854 : ptr1++;
1072 822854 : ptr2++;
1073 822854 : count++;
1074 : }
1075 : }
1076 :
1077 : /*
1078 : * If inexact then len2 is equal to count, because we don't know actual
1079 : * length of second string in inexact search and we can assume that count
1080 : * is a lower bound of len2.
1081 : */
1082 138144 : return CALCSML(count, len1, inexact ? count : len2);
1083 : }
1084 :
1085 :
1086 : /*
1087 : * Returns whether trg2 contains all trigrams in trg1.
1088 : * This relies on the trigram arrays being sorted.
1089 : */
1090 : bool
1091 380 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
1092 : {
1093 : trgm *ptr1,
1094 : *ptr2;
1095 : int len1,
1096 : len2;
1097 :
1098 380 : ptr1 = GETARR(trg1);
1099 380 : ptr2 = GETARR(trg2);
1100 :
1101 380 : len1 = ARRNELEM(trg1);
1102 380 : len2 = ARRNELEM(trg2);
1103 :
1104 1244 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1105 : {
1106 1198 : int res = CMPTRGM(ptr1, ptr2);
1107 :
1108 1198 : if (res < 0)
1109 334 : return false;
1110 864 : else if (res > 0)
1111 640 : ptr2++;
1112 : else
1113 : {
1114 224 : ptr1++;
1115 224 : ptr2++;
1116 : }
1117 : }
1118 46 : if (ptr1 - GETARR(trg1) < len1)
1119 8 : return false;
1120 : else
1121 38 : return true;
1122 : }
1123 :
1124 : /*
1125 : * Return a palloc'd boolean array showing, for each trigram in "query",
1126 : * whether it is present in the trigram array "key".
1127 : * This relies on the "key" array being sorted, but "query" need not be.
1128 : */
1129 : bool *
1130 4300 : trgm_presence_map(TRGM *query, TRGM *key)
1131 : {
1132 : bool *result;
1133 4300 : trgm *ptrq = GETARR(query),
1134 4300 : *ptrk = GETARR(key);
1135 4300 : int lenq = ARRNELEM(query),
1136 4300 : lenk = ARRNELEM(key),
1137 : i;
1138 :
1139 4300 : result = (bool *) palloc0(lenq * sizeof(bool));
1140 :
1141 : /* for each query trigram, do a binary search in the key array */
1142 1015120 : for (i = 0; i < lenq; i++)
1143 : {
1144 1010820 : int lo = 0;
1145 1010820 : int hi = lenk;
1146 :
1147 4747306 : while (lo < hi)
1148 : {
1149 3752564 : int mid = (lo + hi) / 2;
1150 3752564 : int res = CMPTRGM(ptrq, ptrk + mid);
1151 :
1152 3752564 : if (res < 0)
1153 1568164 : hi = mid;
1154 2184400 : else if (res > 0)
1155 2168322 : lo = mid + 1;
1156 : else
1157 : {
1158 16078 : result[i] = true;
1159 16078 : break;
1160 : }
1161 : }
1162 1010820 : ptrq++;
1163 : }
1164 :
1165 4300 : return result;
1166 : }
1167 :
1168 : Datum
1169 62904 : similarity(PG_FUNCTION_ARGS)
1170 : {
1171 62904 : text *in1 = PG_GETARG_TEXT_PP(0);
1172 62904 : text *in2 = PG_GETARG_TEXT_PP(1);
1173 : TRGM *trg1,
1174 : *trg2;
1175 : float4 res;
1176 :
1177 62904 : trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1178 62904 : trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1179 :
1180 62904 : res = cnt_sml(trg1, trg2, false);
1181 :
1182 62904 : pfree(trg1);
1183 62904 : pfree(trg2);
1184 62904 : PG_FREE_IF_COPY(in1, 0);
1185 62904 : PG_FREE_IF_COPY(in2, 1);
1186 :
1187 62904 : PG_RETURN_FLOAT4(res);
1188 : }
1189 :
1190 : Datum
1191 1804 : word_similarity(PG_FUNCTION_ARGS)
1192 : {
1193 1804 : text *in1 = PG_GETARG_TEXT_PP(0);
1194 1804 : text *in2 = PG_GETARG_TEXT_PP(1);
1195 : float4 res;
1196 :
1197 3608 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1198 3608 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1199 : 0);
1200 :
1201 1804 : PG_FREE_IF_COPY(in1, 0);
1202 1804 : PG_FREE_IF_COPY(in2, 1);
1203 1804 : PG_RETURN_FLOAT4(res);
1204 : }
1205 :
1206 : Datum
1207 1764 : strict_word_similarity(PG_FUNCTION_ARGS)
1208 : {
1209 1764 : text *in1 = PG_GETARG_TEXT_PP(0);
1210 1764 : text *in2 = PG_GETARG_TEXT_PP(1);
1211 : float4 res;
1212 :
1213 3528 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1214 3528 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1215 : WORD_SIMILARITY_STRICT);
1216 :
1217 1764 : PG_FREE_IF_COPY(in1, 0);
1218 1764 : PG_FREE_IF_COPY(in2, 1);
1219 1764 : PG_RETURN_FLOAT4(res);
1220 : }
1221 :
1222 : Datum
1223 2008 : similarity_dist(PG_FUNCTION_ARGS)
1224 : {
1225 2008 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1226 : PG_GETARG_DATUM(0),
1227 : PG_GETARG_DATUM(1)));
1228 :
1229 2008 : PG_RETURN_FLOAT4(1.0 - res);
1230 : }
1231 :
1232 : Datum
1233 12000 : similarity_op(PG_FUNCTION_ARGS)
1234 : {
1235 12000 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1236 : PG_GETARG_DATUM(0),
1237 : PG_GETARG_DATUM(1)));
1238 :
1239 12000 : PG_RETURN_BOOL(res >= similarity_threshold);
1240 : }
1241 :
1242 : Datum
1243 3848 : word_similarity_op(PG_FUNCTION_ARGS)
1244 : {
1245 3848 : text *in1 = PG_GETARG_TEXT_PP(0);
1246 3848 : text *in2 = PG_GETARG_TEXT_PP(1);
1247 : float4 res;
1248 :
1249 7696 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1250 7696 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1251 : WORD_SIMILARITY_CHECK_ONLY);
1252 :
1253 3848 : PG_FREE_IF_COPY(in1, 0);
1254 3848 : PG_FREE_IF_COPY(in2, 1);
1255 3848 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1256 : }
1257 :
1258 : Datum
1259 3848 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
1260 : {
1261 3848 : text *in1 = PG_GETARG_TEXT_PP(0);
1262 3848 : text *in2 = PG_GETARG_TEXT_PP(1);
1263 : float4 res;
1264 :
1265 7696 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1266 7696 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1267 : WORD_SIMILARITY_CHECK_ONLY);
1268 :
1269 3848 : PG_FREE_IF_COPY(in1, 0);
1270 3848 : PG_FREE_IF_COPY(in2, 1);
1271 3848 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1272 : }
1273 :
1274 : Datum
1275 0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
1276 : {
1277 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1278 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1279 : float4 res;
1280 :
1281 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1282 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1283 : 0);
1284 :
1285 0 : PG_FREE_IF_COPY(in1, 0);
1286 0 : PG_FREE_IF_COPY(in2, 1);
1287 0 : PG_RETURN_FLOAT4(1.0 - res);
1288 : }
1289 :
1290 : Datum
1291 1428 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1292 : {
1293 1428 : text *in1 = PG_GETARG_TEXT_PP(0);
1294 1428 : text *in2 = PG_GETARG_TEXT_PP(1);
1295 : float4 res;
1296 :
1297 2856 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1298 2856 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1299 : 0);
1300 :
1301 1428 : PG_FREE_IF_COPY(in1, 0);
1302 1428 : PG_FREE_IF_COPY(in2, 1);
1303 1428 : PG_RETURN_FLOAT4(1.0 - res);
1304 : }
1305 :
1306 : Datum
1307 5060 : strict_word_similarity_op(PG_FUNCTION_ARGS)
1308 : {
1309 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1310 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1311 : float4 res;
1312 :
1313 10120 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1314 10120 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1315 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1316 :
1317 5060 : PG_FREE_IF_COPY(in1, 0);
1318 5060 : PG_FREE_IF_COPY(in2, 1);
1319 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1320 : }
1321 :
1322 : Datum
1323 5060 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1324 : {
1325 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1326 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1327 : float4 res;
1328 :
1329 10120 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1330 10120 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1331 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1332 :
1333 5060 : PG_FREE_IF_COPY(in1, 0);
1334 5060 : PG_FREE_IF_COPY(in2, 1);
1335 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1336 : }
1337 :
1338 : Datum
1339 0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1340 : {
1341 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1342 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1343 : float4 res;
1344 :
1345 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1346 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1347 : WORD_SIMILARITY_STRICT);
1348 :
1349 0 : PG_FREE_IF_COPY(in1, 0);
1350 0 : PG_FREE_IF_COPY(in2, 1);
1351 0 : PG_RETURN_FLOAT4(1.0 - res);
1352 : }
1353 :
1354 : Datum
1355 1440 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1356 : {
1357 1440 : text *in1 = PG_GETARG_TEXT_PP(0);
1358 1440 : text *in2 = PG_GETARG_TEXT_PP(1);
1359 : float4 res;
1360 :
1361 2880 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1362 2880 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1363 : WORD_SIMILARITY_STRICT);
1364 :
1365 1440 : PG_FREE_IF_COPY(in1, 0);
1366 1440 : PG_FREE_IF_COPY(in2, 1);
1367 1440 : PG_RETURN_FLOAT4(1.0 - res);
1368 : }
|