Line data Source code
1 : /*
2 : * contrib/pg_trgm/trgm_op.c
3 : */
4 : #include "postgres.h"
5 :
6 : #include <ctype.h>
7 :
8 : #include "catalog/pg_collation_d.h"
9 : #include "catalog/pg_type.h"
10 : #include "common/int.h"
11 : #include "lib/qunique.h"
12 : #include "miscadmin.h"
13 : #include "trgm.h"
14 : #include "tsearch/ts_locale.h"
15 : #include "utils/formatting.h"
16 : #include "utils/guc.h"
17 : #include "utils/lsyscache.h"
18 : #include "utils/memutils.h"
19 : #include "utils/pg_crc.h"
20 :
/* Declare this loadable module's name and version. */
PG_MODULE_MAGIC_EXT(
					.name = "pg_trgm",
					.version = PG_VERSION
);

/*
 * GUC variables
 *
 * These back the pg_trgm.* settings registered in _PG_init(); the
 * initializers here match the boot values passed there.
 */
double		similarity_threshold = 0.3f;
double		word_similarity_threshold = 0.6f;
double		strict_word_similarity_threshold = 0.5f;

/* Function-info records for the module's fmgr-callable entry points */
PG_FUNCTION_INFO_V1(set_limit);
PG_FUNCTION_INFO_V1(show_limit);
PG_FUNCTION_INFO_V1(show_trgm);
PG_FUNCTION_INFO_V1(similarity);
PG_FUNCTION_INFO_V1(word_similarity);
PG_FUNCTION_INFO_V1(strict_word_similarity);
PG_FUNCTION_INFO_V1(similarity_dist);
PG_FUNCTION_INFO_V1(similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);

/*
 * Trigram comparison function pointer.  It initially points at
 * CMPTRGM_CHOOSE, which replaces it with the signed or unsigned comparator
 * the first time it is called.
 */
static int	CMPTRGM_CHOOSE(const void *a, const void *b);
int			(*CMPTRGM) (const void *a, const void *b) = CMPTRGM_CHOOSE;
50 :
/*
 * Trigram with position
 *
 * 'index' is the trigram's position within the text's trigram array, or -1
 * for trigrams that come from the search pattern (see
 * make_positional_trgm()).
 */
typedef struct
{
	trgm		trg;			/* the trigram itself */
	int			index;			/* position in text, or -1 for pattern */
} pos_trgm;
57 :
/*
 * Trigram bound type
 *
 * Bit flags OR-ed into the per-trigram bounds array that
 * generate_trgm_only() fills in when the caller asks for it.
 */
typedef uint8 TrgmBound;
#define TRGM_BOUND_LEFT				0x01	/* trigram is left bound of word */
#define TRGM_BOUND_RIGHT			0x02	/* trigram is right bound of word */

/*
 * Word similarity flags
 *
 * Bit flags passed as 'flags' to calc_word_similarity() and
 * iterate_word_similarity().
 */
#define WORD_SIMILARITY_CHECK_ONLY	0x01	/* only check existence of similar
											 * search pattern in text */
#define WORD_SIMILARITY_STRICT		0x02	/* force bounds of extent to match
											 * word bounds */
68 :
/*
 * A growable array of trigrams
 *
 * The actual array of trigrams is in 'datum'.  Note that the other fields in
 * 'datum', i.e. datum->flags and the varlena length, are not kept up to date
 * when items are added to the growable array.  We merely reserve the space
 * for them here.  You must fill those other fields before using 'datum' as a
 * proper TRGM datum.
 *
 * Use init_trgm_array() to set one up and enlarge_trgm_array() to reserve
 * room before appending; make_trigrams() does the appending and updates
 * 'length'.
 */
typedef struct
{
	TRGM	   *datum;			/* trigram array */
	int			length;			/* number of trigrams in the array */
	int			allocated;		/* allocated size of 'datum' (# of trigrams) */
} growable_trgm_array;
84 :
85 : /*
86 : * Allocate a new growable array.
87 : *
88 : * 'slen' is the size of the source string that we're extracting the trigrams
89 : * from. It is used to choose the initial size of the array.
90 : */
91 : static void
92 232412 : init_trgm_array(growable_trgm_array *arr, int slen)
93 : {
94 : size_t init_size;
95 :
96 : /*
97 : * In the extreme case, the input string consists entirely of one
98 : * character words, like "a b c", where each word is expanded to two
99 : * trigrams. This is not a strict upper bound though, because when
100 : * IGNORECASE is defined, we convert the input string to lowercase before
101 : * extracting the trigrams, which in rare cases can expand one input
102 : * character into multiple characters.
103 : */
104 232412 : init_size = (size_t) slen + 1;
105 :
106 : /*
107 : * Guard against possible overflow in the palloc request. (We don't worry
108 : * about the additive constants, since palloc can detect requests that are
109 : * a little above MaxAllocSize --- we just need to prevent integer
110 : * overflow in the multiplications.)
111 : */
112 232412 : if (init_size > MaxAllocSize / sizeof(trgm))
113 0 : ereport(ERROR,
114 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
115 : errmsg("out of memory")));
116 :
117 232412 : arr->datum = palloc(CALCGTSIZE(ARRKEY, init_size));
118 232412 : arr->allocated = init_size;
119 232412 : arr->length = 0;
120 232412 : }
121 :
122 : /* Make sure the array can hold at least 'needed' more trigrams */
123 : static void
124 256862 : enlarge_trgm_array(growable_trgm_array *arr, int needed)
125 : {
126 256862 : size_t new_needed = (size_t) arr->length + needed;
127 :
128 256862 : if (new_needed > arr->allocated)
129 : {
130 : /* Guard against possible overflow, like in init_trgm_array */
131 0 : if (new_needed > MaxAllocSize / sizeof(trgm))
132 0 : ereport(ERROR,
133 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
134 : errmsg("out of memory")));
135 :
136 0 : arr->datum = repalloc(arr->datum, CALCGTSIZE(ARRKEY, new_needed));
137 0 : arr->allocated = new_needed;
138 : }
139 256862 : }
140 :
141 : /*
142 : * Module load callback
143 : */
144 : void
145 8 : _PG_init(void)
146 : {
147 : /* Define custom GUC variables. */
148 8 : DefineCustomRealVariable("pg_trgm.similarity_threshold",
149 : "Sets the threshold used by the % operator.",
150 : "Valid range is 0.0 .. 1.0.",
151 : &similarity_threshold,
152 : 0.3f,
153 : 0.0,
154 : 1.0,
155 : PGC_USERSET,
156 : 0,
157 : NULL,
158 : NULL,
159 : NULL);
160 8 : DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
161 : "Sets the threshold used by the <% operator.",
162 : "Valid range is 0.0 .. 1.0.",
163 : &word_similarity_threshold,
164 : 0.6f,
165 : 0.0,
166 : 1.0,
167 : PGC_USERSET,
168 : 0,
169 : NULL,
170 : NULL,
171 : NULL);
172 8 : DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
173 : "Sets the threshold used by the <<% operator.",
174 : "Valid range is 0.0 .. 1.0.",
175 : &strict_word_similarity_threshold,
176 : 0.5f,
177 : 0.0,
178 : 1.0,
179 : PGC_USERSET,
180 : 0,
181 : NULL,
182 : NULL,
183 : NULL);
184 :
185 8 : MarkGUCPrefixReserved("pg_trgm");
186 8 : }
187 :
#define CMPCHAR(a,b) ( ((a)==(b)) ? 0 : ( ((a)<(b)) ? -1 : 1 ) )

/*
 * Three-byte trigram comparators.  Both compare byte 0, then byte 1, then
 * byte 2, and return -1, 0 or 1; they differ only in whether each byte is
 * interpreted as "signed char" or "unsigned char".
 */
static inline int
CMPTRGM_SIGNED(const void *a, const void *b)
{
#define CMPPCHAR_S(a,b,i)  CMPCHAR( *(((const signed char*)(a))+i), *(((const signed char*)(b))+i) )

	int			res;

	res = CMPPCHAR_S(a, b, 0);
	if (res != 0)
		return res;

	res = CMPPCHAR_S(a, b, 1);
	if (res != 0)
		return res;

	return CMPPCHAR_S(a, b, 2);
}

static inline int
CMPTRGM_UNSIGNED(const void *a, const void *b)
{
#define CMPPCHAR_UNS(a,b,i)  CMPCHAR( *(((const unsigned char*)(a))+i), *(((const unsigned char*)(b))+i) )

	int			res;

	res = CMPPCHAR_UNS(a, b, 0);
	if (res != 0)
		return res;

	res = CMPPCHAR_UNS(a, b, 1);
	if (res != 0)
		return res;

	return CMPPCHAR_UNS(a, b, 2);
}
213 :
214 : /*
215 : * This gets called on the first call. It replaces the function pointer so
216 : * that subsequent calls are routed directly to the chosen implementation.
217 : */
218 : static int
219 8 : CMPTRGM_CHOOSE(const void *a, const void *b)
220 : {
221 8 : if (GetDefaultCharSignedness())
222 8 : CMPTRGM = CMPTRGM_SIGNED;
223 : else
224 0 : CMPTRGM = CMPTRGM_UNSIGNED;
225 :
226 8 : return CMPTRGM(a, b);
227 : }
228 :
229 : /*
230 : * Deprecated function.
231 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
232 : */
233 : Datum
234 4 : set_limit(PG_FUNCTION_ARGS)
235 : {
236 4 : float4 nlimit = PG_GETARG_FLOAT4(0);
237 : char *nlimit_str;
238 : Oid func_out_oid;
239 : bool is_varlena;
240 :
241 4 : getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
242 :
243 4 : nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
244 :
245 4 : SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
246 : PGC_USERSET, PGC_S_SESSION);
247 :
248 4 : PG_RETURN_FLOAT4(similarity_threshold);
249 : }
250 :
251 :
252 : /*
253 : * Get similarity threshold for given index scan strategy number.
254 : */
255 : double
256 90774 : index_strategy_get_limit(StrategyNumber strategy)
257 : {
258 90774 : switch (strategy)
259 : {
260 65154 : case SimilarityStrategyNumber:
261 65154 : return similarity_threshold;
262 13664 : case WordSimilarityStrategyNumber:
263 13664 : return word_similarity_threshold;
264 11956 : case StrictWordSimilarityStrategyNumber:
265 11956 : return strict_word_similarity_threshold;
266 0 : default:
267 0 : elog(ERROR, "unrecognized strategy number: %d", strategy);
268 : break;
269 : }
270 :
271 : return 0.0; /* keep compiler quiet */
272 : }
273 :
274 : /*
275 : * Deprecated function.
276 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
277 : */
278 : Datum
279 40000 : show_limit(PG_FUNCTION_ARGS)
280 : {
281 40000 : PG_RETURN_FLOAT4(similarity_threshold);
282 : }
283 :
/*
 * Comparator with a plain function address, for use with qsort() and
 * qunique(); it forwards to whatever CMPTRGM currently points at.
 */
static int
comp_trgm(const void *a, const void *b)
{
	int			res = CMPTRGM(a, b);

	return res;
}
289 :
290 : /*
291 : * Finds first word in string, returns pointer to the word,
292 : * endword points to the character after word
293 : */
294 : static char *
295 487068 : find_word(char *str, int lenstr, char **endword)
296 : {
297 487068 : char *beginword = str;
298 487068 : const char *endstr = str + lenstr;
299 :
300 514440 : while (beginword < endstr)
301 : {
302 284160 : int clen = pg_mblen_range(beginword, endstr);
303 :
304 284160 : if (ISWORDCHR(beginword, clen))
305 256788 : break;
306 27372 : beginword += clen;
307 : }
308 :
309 487068 : if (beginword >= endstr)
310 230280 : return NULL;
311 :
312 256788 : *endword = beginword;
313 2224256 : while (*endword < endstr)
314 : {
315 1994164 : int clen = pg_mblen_range(*endword, endstr);
316 :
317 1994164 : if (!ISWORDCHR(*endword, clen))
318 26696 : break;
319 1967468 : *endword += clen;
320 : }
321 :
322 256788 : return beginword;
323 : }
324 :
325 : /*
326 : * Reduce a trigram (three possibly multi-byte characters) to a trgm,
327 : * which is always exactly three bytes. If we have three single-byte
328 : * characters, we just use them as-is; otherwise we form a hash value.
329 : */
330 : void
331 3316 : compact_trigram(trgm *tptr, char *str, int bytelen)
332 : {
333 3316 : if (bytelen == 3)
334 : {
335 2950 : CPTRGM(tptr, str);
336 : }
337 : else
338 : {
339 : pg_crc32 crc;
340 :
341 366 : INIT_LEGACY_CRC32(crc);
342 2680 : COMP_LEGACY_CRC32(crc, str, bytelen);
343 366 : FIN_LEGACY_CRC32(crc);
344 :
345 : /*
346 : * use only 3 upper bytes from crc, hope, it's good enough hashing
347 : */
348 366 : CPTRGM(tptr, &crc);
349 : }
350 3316 : }
351 :
/*
 * Adds trigrams from the word in 'str' (already padded if necessary).
 *
 * dst: growable array the trigrams are appended to; dst->length is updated
 *		to include the new entries.
 * str: the word, 'bytelen' bytes long.
 *
 * Inputs shorter than three bytes produce no trigrams and leave 'dst'
 * untouched.
 */
static void
make_trigrams(growable_trgm_array *dst, char *str, int bytelen)
{
	trgm	   *tptr;
	char	   *ptr = str;

	if (bytelen < 3)
		return;

	/* max number of trigrams = strlen - 2 */
	enlarge_trgm_array(dst, bytelen - 2);
	/* tptr is the write cursor, starting right after the existing entries */
	tptr = GETARR(dst->datum) + dst->length;

	if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
	{
		/* Single-byte encoding: every 3-byte window is a trigram */
		while (ptr < str + bytelen - 2)
		{
			CPTRGM(tptr, ptr);
			ptr++;
			tptr++;
		}
	}
	else
	{
		/* byte lengths of the three characters of the current trigram */
		int			lenfirst,
					lenmiddle,
					lenlast;
		char	   *endptr;

		/*
		 * Fast path as long as there are no multibyte characters
		 */
		if (!IS_HIGHBIT_SET(ptr[0]) && !IS_HIGHBIT_SET(ptr[1]))
		{
			while (!IS_HIGHBIT_SET(ptr[2]))
			{
				CPTRGM(tptr, ptr);
				ptr++;
				tptr++;

				/* Stop once the last full 3-byte window has been emitted */
				if (ptr == str + bytelen - 2)
					goto done;
			}

			/*
			 * The third character has its high bit set; the first two are
			 * known to be single bytes.
			 */
			lenfirst = 1;
			lenmiddle = 1;
			lenlast = pg_mblen_unbounded(ptr + 2);
		}
		else
		{
			/*
			 * Measure the first three characters, giving up if the string
			 * ends before a third character starts.
			 */
			lenfirst = pg_mblen_unbounded(ptr);
			if (ptr + lenfirst >= str + bytelen)
				goto done;
			lenmiddle = pg_mblen_unbounded(ptr + lenfirst);
			if (ptr + lenfirst + lenmiddle >= str + bytelen)
				goto done;
			lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle);
		}

		/*
		 * Slow path to handle any remaining multibyte characters
		 *
		 * As we go, 'ptr' points to the beginning of the current
		 * three-character string and 'endptr' points to just past it.
		 */
		endptr = ptr + lenfirst + lenmiddle + lenlast;
		while (endptr <= str + bytelen)
		{
			compact_trigram(tptr, ptr, endptr - ptr);
			tptr++;

			/* Advance to the next character */
			if (endptr == str + bytelen)
				break;
			/* Slide the three-character window forward by one character */
			ptr += lenfirst;
			lenfirst = lenmiddle;
			lenmiddle = lenlast;
			lenlast = pg_mblen_unbounded(endptr);
			endptr += lenlast;
		}
	}

done:
	/* Publish the new element count derived from the write cursor */
	dst->length = tptr - GETARR(dst->datum);
	Assert(dst->length <= dst->allocated);
}
441 :
442 : /*
443 : * Make array of trigrams without sorting and removing duplicate items.
444 : *
445 : * dst: where to return the array of trigrams.
446 : * str: source string, of length slen bytes.
447 : * bounds_p: where to return bounds of trigrams (if needed).
448 : */
449 : static void
450 232302 : generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bounds_p)
451 : {
452 : size_t buflen;
453 : char *buf;
454 : int bytelen;
455 : char *bword,
456 : *eword;
457 232302 : TrgmBound *bounds = NULL;
458 232302 : int bounds_allocated = 0;
459 :
460 232302 : init_trgm_array(dst, slen);
461 :
462 : /*
463 : * If requested, allocate an array for the bounds, with the same size as
464 : * the trigram array.
465 : */
466 232302 : if (bounds_p)
467 : {
468 13324 : bounds_allocated = dst->allocated;
469 13324 : bounds = *bounds_p = palloc0_array(TrgmBound, bounds_allocated);
470 : }
471 :
472 232302 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
473 2022 : return;
474 :
475 : /*
476 : * Allocate a buffer for case-folded, blank-padded words.
477 : *
478 : * As an initial guess, allocate a buffer large enough to hold the
479 : * original string with padding, which is always enough when compiled with
480 : * !IGNORECASE. If the case-folding produces a string longer than the
481 : * original, we'll grow the buffer.
482 : */
483 230280 : buflen = (size_t) slen + 4;
484 230280 : buf = (char *) palloc(buflen);
485 : if (LPADDING > 0)
486 : {
487 230280 : *buf = ' ';
488 : if (LPADDING > 1)
489 230280 : *(buf + 1) = ' ';
490 : }
491 :
492 230280 : eword = str;
493 487068 : while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
494 : {
495 : int oldlen;
496 :
497 : /* Convert word to lower case before extracting trigrams from it */
498 : #ifdef IGNORECASE
499 : {
500 : char *lowered;
501 :
502 256788 : lowered = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
503 256788 : bytelen = strlen(lowered);
504 :
505 : /* grow the buffer if necessary */
506 256788 : if (bytelen > buflen - 4)
507 : {
508 0 : pfree(buf);
509 0 : buflen = (size_t) bytelen + 4;
510 0 : buf = (char *) palloc(buflen);
511 : if (LPADDING > 0)
512 : {
513 0 : *buf = ' ';
514 : if (LPADDING > 1)
515 0 : *(buf + 1) = ' ';
516 : }
517 : }
518 256788 : memcpy(buf + LPADDING, lowered, bytelen);
519 256788 : pfree(lowered);
520 : }
521 : #else
522 : bytelen = eword - bword;
523 : memcpy(buf + LPADDING, bword, bytelen);
524 : #endif
525 :
526 256788 : buf[LPADDING + bytelen] = ' ';
527 256788 : buf[LPADDING + bytelen + 1] = ' ';
528 :
529 : /* Calculate trigrams marking their bounds if needed */
530 256788 : oldlen = dst->length;
531 256788 : make_trigrams(dst, buf, bytelen + LPADDING + RPADDING);
532 256788 : if (bounds)
533 : {
534 24800 : if (bounds_allocated < dst->length)
535 : {
536 0 : bounds = repalloc0_array(bounds, TrgmBound, bounds_allocated, dst->allocated);
537 0 : bounds_allocated = dst->allocated;
538 : }
539 :
540 24800 : bounds[oldlen] |= TRGM_BOUND_LEFT;
541 24800 : bounds[dst->length - 1] |= TRGM_BOUND_RIGHT;
542 : }
543 : }
544 :
545 230280 : pfree(buf);
546 : }
547 :
548 : /*
549 : * Make array of trigrams with sorting and removing duplicate items.
550 : *
551 : * str: source string, of length slen bytes.
552 : *
553 : * Returns the sorted array of unique trigrams.
554 : */
555 : TRGM *
556 179790 : generate_trgm(char *str, int slen)
557 : {
558 : TRGM *trg;
559 : growable_trgm_array arr;
560 : int len;
561 :
562 179790 : generate_trgm_only(&arr, str, slen, NULL);
563 179790 : len = arr.length;
564 179790 : trg = arr.datum;
565 179790 : trg->flag = ARRKEY;
566 :
567 : /*
568 : * Make trigrams unique.
569 : */
570 179790 : if (len > 1)
571 : {
572 179766 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
573 179766 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
574 : }
575 :
576 179790 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
577 :
578 179790 : return trg;
579 : }
580 :
581 : /*
582 : * Make array of positional trigrams from two trigram arrays trg1 and trg2.
583 : *
584 : * trg1: trigram array of search pattern, of length len1. trg1 is required
585 : * word which positions don't matter and replaced with -1.
586 : * trg2: trigram array of text, of length len2. trg2 is haystack where we
587 : * search and have to store its positions.
588 : *
589 : * Returns concatenated trigram array.
590 : */
591 : static pos_trgm *
592 26256 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
593 : {
594 : pos_trgm *result;
595 : int i,
596 26256 : len = len1 + len2;
597 :
598 26256 : result = palloc_array(pos_trgm, len);
599 :
600 243732 : for (i = 0; i < len1; i++)
601 : {
602 217476 : memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
603 217476 : result[i].index = -1;
604 : }
605 :
606 410478 : for (i = 0; i < len2; i++)
607 : {
608 384222 : memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
609 384222 : result[i + len1].index = i;
610 : }
611 :
612 26256 : return result;
613 : }
614 :
615 : /*
616 : * Compare position trigrams: compare trigrams first and position second.
617 : */
618 : static int
619 2702810 : comp_ptrgm(const void *v1, const void *v2)
620 : {
621 2702810 : const pos_trgm *p1 = (const pos_trgm *) v1;
622 2702810 : const pos_trgm *p2 = (const pos_trgm *) v2;
623 : int cmp;
624 :
625 2702810 : cmp = CMPTRGM(p1->trg, p2->trg);
626 2702810 : if (cmp != 0)
627 2623404 : return cmp;
628 :
629 79406 : return pg_cmp_s32(p1->index, p2->index);
630 : }
631 :
/*
 * Iterative search function which calculates maximum similarity with word in
 * the string.  The maximum similarity is calculated only if the flag
 * WORD_SIMILARITY_CHECK_ONLY isn't set; with that flag the search stops as
 * soon as the threshold is reached.
 *
 * trg2indexes: array which stores indexes of the array "found".
 * found: array which stores true or false values.
 * ulen1: count of unique trigrams of array "trg1".
 * len2: length of array "trg2" and array "trg2indexes".
 * len: length of the array "found".
 * flags: set of boolean flags parameterizing similarity calculation.
 * bounds: whether each trigram is left/right bound of word.  Must be
 *		non-NULL when WORD_SIMILARITY_STRICT is set.
 *
 * Returns word similarity.
 */
static float4
iterate_word_similarity(int *trg2indexes,
						bool *found,
						int ulen1,
						int len2,
						int len,
						uint8 flags,
						TrgmBound *bounds)
{
	int		   *lastpos,
				i,
				ulen2 = 0,		/* unique trigrams in the current extent */
				count = 0,		/* of those, how many are in the pattern */
				upper = -1,		/* last position of the current extent */
				lower;			/* first position, or -1 if not started */
	float4		smlr_cur,
				smlr_max = 0.0f;
	double		threshold;

	Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));

	/* Select appropriate threshold */
	threshold = (flags & WORD_SIMILARITY_STRICT) ?
		strict_word_similarity_threshold :
		word_similarity_threshold;

	/*
	 * Consider first trigram as initial lower bound for strict word
	 * similarity, or initialize it later with first trigram present for plain
	 * word similarity.
	 */
	lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;

	/*
	 * Memorise last position of each trigram.  Entries are indexed like
	 * "found"; -1 means the trigram is not in the current extent.
	 */
	lastpos = palloc_array(int, len);
	memset(lastpos, -1, sizeof(int) * len);

	for (i = 0; i < len2; i++)
	{
		int			trgindex;

		CHECK_FOR_INTERRUPTS();

		/* Get index of next trigram */
		trgindex = trg2indexes[i];

		/* Update last position of this trigram */
		if (lower >= 0 || found[trgindex])
		{
			if (lastpos[trgindex] < 0)
			{
				/* First occurrence inside the extent: count it as unique */
				ulen2++;
				if (found[trgindex])
					count++;
			}
			lastpos[trgindex] = i;
		}

		/*
		 * Adjust upper bound if trigram is upper bound of word for strict
		 * word similarity, or if trigram is present in required substring for
		 * plain word similarity
		 */
		if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
			: found[trgindex])
		{
			int			prev_lower,
						tmp_ulen2,
						tmp_lower,
						tmp_count;

			upper = i;
			if (lower == -1)
			{
				/* Start the extent at this trigram */
				lower = i;
				ulen2 = 1;
			}

			smlr_cur = CALCSML(count, ulen1, ulen2);

			/* Also try to adjust lower bound for greater similarity */
			tmp_count = count;
			tmp_ulen2 = ulen2;
			prev_lower = lower;
			for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
			{
				float		smlr_tmp;
				int			tmp_trgindex;

				/*
				 * Adjust lower bound only if trigram is lower bound of word
				 * for strict word similarity, or consider every trigram as
				 * lower bound for plain word similarity.
				 */
				if (!(flags & WORD_SIMILARITY_STRICT)
					|| (bounds[tmp_lower] & TRGM_BOUND_LEFT))
				{
					smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
					if (smlr_tmp > smlr_cur)
					{
						/* Better extent found: commit the candidate state */
						smlr_cur = smlr_tmp;
						ulen2 = tmp_ulen2;
						lower = tmp_lower;
						count = tmp_count;
					}

					/*
					 * If we only check that word similarity is greater than
					 * threshold we do not need to calculate a maximum
					 * similarity.
					 */
					if ((flags & WORD_SIMILARITY_CHECK_ONLY)
						&& smlr_cur >= threshold)
						break;
				}

				/*
				 * Drop the trigram at tmp_lower from the candidate counts,
				 * but only if this is its last occurrence in the extent.
				 */
				tmp_trgindex = trg2indexes[tmp_lower];
				if (lastpos[tmp_trgindex] == tmp_lower)
				{
					tmp_ulen2--;
					if (found[tmp_trgindex])
						tmp_count--;
				}
			}

			smlr_max = Max(smlr_max, smlr_cur);

			/*
			 * if we only check that word similarity is greater than threshold
			 * we do not need to calculate a maximum similarity.
			 */
			if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
				break;

			/*
			 * Forget the trigrams that fell out of the extent when the lower
			 * bound moved from prev_lower to lower.
			 */
			for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
			{
				int			tmp_trgindex;

				tmp_trgindex = trg2indexes[tmp_lower];
				if (lastpos[tmp_trgindex] == tmp_lower)
					lastpos[tmp_trgindex] = -1;
			}
		}
	}

	pfree(lastpos);

	return smlr_max;
}
796 :
797 : /*
798 : * Calculate word similarity.
799 : * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
800 : * are used to calculate word similarity using iterate_word_similarity().
801 : *
802 : * "trg2indexes" is array which stores indexes of the array "found".
803 : * In other words:
804 : * trg2indexes[j] = i;
805 : * found[i] = true (or false);
806 : * If found[i] == true then there is trigram trg2[j] in array "trg1".
807 : * If found[i] == false then there is not trigram trg2[j] in array "trg1".
808 : *
809 : * str1: search pattern string, of length slen1 bytes.
810 : * str2: text in which we are looking for a word, of length slen2 bytes.
811 : * flags: set of boolean flags parameterizing similarity calculation.
812 : *
813 : * Returns word similarity.
814 : */
815 : static float4
816 26256 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
817 : uint8 flags)
818 : {
819 : bool *found;
820 : pos_trgm *ptrg;
821 : growable_trgm_array trg1;
822 : growable_trgm_array trg2;
823 : int len1,
824 : len2,
825 : len,
826 : i,
827 : j,
828 : ulen1;
829 : int *trg2indexes;
830 : float4 result;
831 26256 : TrgmBound *bounds = NULL;
832 :
833 : /* Make positional trigrams */
834 :
835 26256 : generate_trgm_only(&trg1, str1, slen1, NULL);
836 26256 : len1 = trg1.length;
837 26256 : generate_trgm_only(&trg2, str2, slen2, (flags & WORD_SIMILARITY_STRICT) ? &bounds : NULL);
838 26256 : len2 = trg2.length;
839 :
840 26256 : ptrg = make_positional_trgm(GETARR(trg1.datum), len1, GETARR(trg2.datum), len2);
841 26256 : len = len1 + len2;
842 26256 : qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
843 :
844 26256 : pfree(trg1.datum);
845 26256 : pfree(trg2.datum);
846 :
847 : /*
848 : * Merge positional trigrams array: enumerate each trigram and find its
849 : * presence in required word.
850 : */
851 26256 : trg2indexes = palloc_array(int, len2);
852 26256 : found = palloc0_array(bool, len);
853 :
854 26256 : ulen1 = 0;
855 26256 : j = 0;
856 627954 : for (i = 0; i < len; i++)
857 : {
858 601698 : if (i > 0)
859 : {
860 575442 : int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
861 :
862 575442 : if (cmp != 0)
863 : {
864 507042 : if (found[j])
865 202276 : ulen1++;
866 507042 : j++;
867 : }
868 : }
869 :
870 601698 : if (ptrg[i].index >= 0)
871 : {
872 384222 : trg2indexes[ptrg[i].index] = j;
873 : }
874 : else
875 : {
876 217476 : found[j] = true;
877 : }
878 : }
879 26256 : if (found[j])
880 15200 : ulen1++;
881 :
882 : /* Run iterative procedure to find maximum similarity with word */
883 26256 : result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
884 : flags, bounds);
885 :
886 26256 : pfree(trg2indexes);
887 26256 : pfree(found);
888 26256 : pfree(ptrg);
889 :
890 26256 : return result;
891 : }
892 :
893 :
/*
 * Extract the next non-wildcard part of a search string, i.e. a word bounded
 * by '_' or '%' meta-characters, non-word characters or string end.
 *
 * str: source string, of length lenstr bytes (need not be null-terminated)
 * buf: where to return the substring (must be long enough)
 * *bytelen: receives byte length of the found substring
 *
 * Returns pointer to end+1 of the found substring in the source string.
 * Returns NULL if no word found (in which case buf and *bytelen are not set)
 *
 * If the found word is bounded by non-word characters or string boundaries
 * then this function will include corresponding padding spaces into buf.
 */
static const char *
get_wildcard_part(const char *str, int lenstr,
				  char *buf, int *bytelen)
{
	const char *beginword = str;
	const char *endword;
	const char *endstr = str + lenstr;
	char	   *s = buf;		/* write cursor into buf */
	bool		in_leading_wildcard_meta = false;
	bool		in_trailing_wildcard_meta = false;
	bool		in_escape = false;
	int			clen;

	/*
	 * Find the first word character, remembering whether preceding character
	 * was wildcard meta-character.  Note that the in_escape state persists
	 * from this loop to the next one, since we may exit at a word character
	 * that is in_escape.
	 */
	while (beginword < endstr)
	{
		clen = pg_mblen_range(beginword, endstr);

		if (in_escape)
		{
			if (ISWORDCHR(beginword, clen))
				break;
			/* Escaped non-word character: skip it and reset the state */
			in_escape = false;
			in_leading_wildcard_meta = false;
		}
		else
		{
			if (ISESCAPECHAR(beginword))
				in_escape = true;
			else if (ISWILDCARDCHAR(beginword))
				in_leading_wildcard_meta = true;
			else if (ISWORDCHR(beginword, clen))
				break;
			else
				in_leading_wildcard_meta = false;
		}
		beginword += clen;
	}

	/*
	 * Handle string end.
	 */
	if (beginword - str >= lenstr)
		return NULL;

	/*
	 * Add left padding spaces if preceding character wasn't wildcard
	 * meta-character.
	 */
	if (!in_leading_wildcard_meta)
	{
		if (LPADDING > 0)
		{
			*s++ = ' ';
			if (LPADDING > 1)
				*s++ = ' ';
		}
	}

	/*
	 * Copy data into buf until wildcard meta-character, non-word character or
	 * string boundary.  Strip escapes during copy.
	 */
	endword = beginword;
	while (endword < endstr)
	{
		clen = pg_mblen_range(endword, endstr);
		if (in_escape)
		{
			if (ISWORDCHR(endword, clen))
			{
				memcpy(s, endword, clen);
				s += clen;
			}
			else
			{
				/*
				 * Back up endword to the escape character when stopping at an
				 * escaped char, so that subsequent get_wildcard_part will
				 * restart from the escape character.  We assume here that
				 * escape chars are single-byte.
				 */
				endword--;
				break;
			}
			in_escape = false;
		}
		else
		{
			if (ISESCAPECHAR(endword))
				in_escape = true;
			else if (ISWILDCARDCHAR(endword))
			{
				in_trailing_wildcard_meta = true;
				break;
			}
			else if (ISWORDCHR(endword, clen))
			{
				memcpy(s, endword, clen);
				s += clen;
			}
			else
				break;
		}
		endword += clen;
	}

	/*
	 * Add right padding spaces if next character isn't wildcard
	 * meta-character.
	 */
	if (!in_trailing_wildcard_meta)
	{
		if (RPADDING > 0)
		{
			*s++ = ' ';
			if (RPADDING > 1)
				*s++ = ' ';
		}
	}

	/* Report how many bytes were written, padding included */
	*bytelen = s - buf;
	return endword;
}
1037 :
1038 : /*
1039 : * Generates trigrams for wildcard search string.
1040 : *
1041 : * Returns array of trigrams that must occur in any string that matches the
1042 : * wildcard string. For example, given pattern "a%bcd%" the trigrams
1043 : * " a", "bcd" would be extracted.
1044 : */
1045 : TRGM *
1046 110 : generate_wildcard_trgm(const char *str, int slen)
1047 : {
1048 : TRGM *trg;
1049 : growable_trgm_array arr;
1050 : char *buf;
1051 : int len,
1052 : bytelen;
1053 : const char *eword;
1054 :
1055 110 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
1056 : {
1057 0 : trg = (TRGM *) palloc(TRGMHDRSIZE);
1058 0 : trg->flag = ARRKEY;
1059 0 : SET_VARSIZE(trg, TRGMHDRSIZE);
1060 0 : return trg;
1061 : }
1062 :
1063 110 : init_trgm_array(&arr, slen);
1064 :
1065 : /* Allocate a buffer for blank-padded, but not yet case-folded, words */
1066 110 : buf = palloc_array(char, slen + 4);
1067 :
1068 : /*
1069 : * Extract trigrams from each substring extracted by get_wildcard_part.
1070 : */
1071 110 : eword = str;
1072 238 : while ((eword = get_wildcard_part(eword, slen - (eword - str),
1073 238 : buf, &bytelen)) != NULL)
1074 : {
1075 : char *word;
1076 :
1077 : #ifdef IGNORECASE
1078 128 : word = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
1079 128 : bytelen = strlen(word);
1080 : #else
1081 : word = buf;
1082 : #endif
1083 :
1084 : /*
1085 : * count trigrams
1086 : */
1087 128 : make_trigrams(&arr, word, bytelen);
1088 :
1089 : #ifdef IGNORECASE
1090 128 : pfree(word);
1091 : #endif
1092 : }
1093 :
1094 110 : pfree(buf);
1095 :
1096 : /*
1097 : * Make trigrams unique.
1098 : */
1099 110 : trg = arr.datum;
1100 110 : len = arr.length;
1101 110 : if (len > 1)
1102 : {
1103 34 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
1104 34 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
1105 : }
1106 :
1107 110 : trg->flag = ARRKEY;
1108 110 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
1109 :
1110 110 : return trg;
1111 : }
1112 :
1113 : uint32
1114 69658 : trgm2int(trgm *ptr)
1115 : {
1116 69658 : uint32 val = 0;
1117 :
1118 69658 : val |= *(((unsigned char *) ptr));
1119 69658 : val <<= 8;
1120 69658 : val |= *(((unsigned char *) ptr) + 1);
1121 69658 : val <<= 8;
1122 69658 : val |= *(((unsigned char *) ptr) + 2);
1123 :
1124 69658 : return val;
1125 : }
1126 :
1127 : Datum
1128 14 : show_trgm(PG_FUNCTION_ARGS)
1129 : {
1130 14 : text *in = PG_GETARG_TEXT_PP(0);
1131 : TRGM *trg;
1132 : Datum *d;
1133 : ArrayType *a;
1134 : trgm *ptr;
1135 : int i;
1136 :
1137 14 : trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
1138 14 : d = palloc_array(Datum, 1 + ARRNELEM(trg));
1139 :
1140 88 : for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
1141 : {
1142 74 : text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
1143 :
1144 74 : if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
1145 : {
1146 0 : snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
1147 0 : SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
1148 : }
1149 : else
1150 : {
1151 74 : SET_VARSIZE(item, VARHDRSZ + 3);
1152 74 : CPTRGM(VARDATA(item), ptr);
1153 : }
1154 74 : d[i] = PointerGetDatum(item);
1155 : }
1156 :
1157 14 : a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
1158 :
1159 88 : for (i = 0; i < ARRNELEM(trg); i++)
1160 74 : pfree(DatumGetPointer(d[i]));
1161 :
1162 14 : pfree(d);
1163 14 : pfree(trg);
1164 14 : PG_FREE_IF_COPY(in, 0);
1165 :
1166 14 : PG_RETURN_POINTER(a);
1167 : }
1168 :
1169 : float4
1170 137702 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
1171 : {
1172 : trgm *ptr1,
1173 : *ptr2;
1174 137702 : int count = 0;
1175 : int len1,
1176 : len2;
1177 :
1178 137702 : ptr1 = GETARR(trg1);
1179 137702 : ptr2 = GETARR(trg2);
1180 :
1181 137702 : len1 = ARRNELEM(trg1);
1182 137702 : len2 = ARRNELEM(trg2);
1183 :
1184 : /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1185 137702 : if (len1 <= 0 || len2 <= 0)
1186 2 : return (float4) 0.0;
1187 :
1188 1751832 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1189 : {
1190 1614132 : int res = CMPTRGM(ptr1, ptr2);
1191 :
1192 1614132 : if (res < 0)
1193 365254 : ptr1++;
1194 1248878 : else if (res > 0)
1195 427336 : ptr2++;
1196 : else
1197 : {
1198 821542 : ptr1++;
1199 821542 : ptr2++;
1200 821542 : count++;
1201 : }
1202 : }
1203 :
1204 : /*
1205 : * If inexact then len2 is equal to count, because we don't know actual
1206 : * length of second string in inexact search and we can assume that count
1207 : * is a lower bound of len2.
1208 : */
1209 137700 : return CALCSML(count, len1, inexact ? count : len2);
1210 : }
1211 :
1212 :
1213 : /*
1214 : * Returns whether trg2 contains all trigrams in trg1.
1215 : * This relies on the trigram arrays being sorted.
1216 : */
1217 : bool
1218 380 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
1219 : {
1220 : trgm *ptr1,
1221 : *ptr2;
1222 : int len1,
1223 : len2;
1224 :
1225 380 : ptr1 = GETARR(trg1);
1226 380 : ptr2 = GETARR(trg2);
1227 :
1228 380 : len1 = ARRNELEM(trg1);
1229 380 : len2 = ARRNELEM(trg2);
1230 :
1231 1244 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1232 : {
1233 1198 : int res = CMPTRGM(ptr1, ptr2);
1234 :
1235 1198 : if (res < 0)
1236 334 : return false;
1237 864 : else if (res > 0)
1238 640 : ptr2++;
1239 : else
1240 : {
1241 224 : ptr1++;
1242 224 : ptr2++;
1243 : }
1244 : }
1245 46 : if (ptr1 - GETARR(trg1) < len1)
1246 8 : return false;
1247 : else
1248 38 : return true;
1249 : }
1250 :
1251 : /*
1252 : * Return a palloc'd boolean array showing, for each trigram in "query",
1253 : * whether it is present in the trigram array "key".
1254 : * This relies on the "key" array being sorted, but "query" need not be.
1255 : */
1256 : bool *
1257 4300 : trgm_presence_map(TRGM *query, TRGM *key)
1258 : {
1259 : bool *result;
1260 4300 : trgm *ptrq = GETARR(query),
1261 4300 : *ptrk = GETARR(key);
1262 4300 : int lenq = ARRNELEM(query),
1263 4300 : lenk = ARRNELEM(key),
1264 : i;
1265 :
1266 4300 : result = palloc0_array(bool, lenq);
1267 :
1268 : /* for each query trigram, do a binary search in the key array */
1269 1015120 : for (i = 0; i < lenq; i++)
1270 : {
1271 1010820 : int lo = 0;
1272 1010820 : int hi = lenk;
1273 :
1274 4747306 : while (lo < hi)
1275 : {
1276 3752564 : int mid = (lo + hi) / 2;
1277 3752564 : int res = CMPTRGM(ptrq, ptrk + mid);
1278 :
1279 3752564 : if (res < 0)
1280 1568164 : hi = mid;
1281 2184400 : else if (res > 0)
1282 2168322 : lo = mid + 1;
1283 : else
1284 : {
1285 16078 : result[i] = true;
1286 16078 : break;
1287 : }
1288 : }
1289 1010820 : ptrq++;
1290 : }
1291 :
1292 4300 : return result;
1293 : }
1294 :
1295 : Datum
1296 62904 : similarity(PG_FUNCTION_ARGS)
1297 : {
1298 62904 : text *in1 = PG_GETARG_TEXT_PP(0);
1299 62904 : text *in2 = PG_GETARG_TEXT_PP(1);
1300 : TRGM *trg1,
1301 : *trg2;
1302 : float4 res;
1303 :
1304 62904 : trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1305 62904 : trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1306 :
1307 62904 : res = cnt_sml(trg1, trg2, false);
1308 :
1309 62904 : pfree(trg1);
1310 62904 : pfree(trg2);
1311 62904 : PG_FREE_IF_COPY(in1, 0);
1312 62904 : PG_FREE_IF_COPY(in2, 1);
1313 :
1314 62904 : PG_RETURN_FLOAT4(res);
1315 : }
1316 :
1317 : Datum
1318 1804 : word_similarity(PG_FUNCTION_ARGS)
1319 : {
1320 1804 : text *in1 = PG_GETARG_TEXT_PP(0);
1321 1804 : text *in2 = PG_GETARG_TEXT_PP(1);
1322 : float4 res;
1323 :
1324 1804 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1325 1804 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1326 : 0);
1327 :
1328 1804 : PG_FREE_IF_COPY(in1, 0);
1329 1804 : PG_FREE_IF_COPY(in2, 1);
1330 1804 : PG_RETURN_FLOAT4(res);
1331 : }
1332 :
1333 : Datum
1334 1764 : strict_word_similarity(PG_FUNCTION_ARGS)
1335 : {
1336 1764 : text *in1 = PG_GETARG_TEXT_PP(0);
1337 1764 : text *in2 = PG_GETARG_TEXT_PP(1);
1338 : float4 res;
1339 :
1340 1764 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1341 1764 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1342 : WORD_SIMILARITY_STRICT);
1343 :
1344 1764 : PG_FREE_IF_COPY(in1, 0);
1345 1764 : PG_FREE_IF_COPY(in2, 1);
1346 1764 : PG_RETURN_FLOAT4(res);
1347 : }
1348 :
1349 : Datum
1350 2008 : similarity_dist(PG_FUNCTION_ARGS)
1351 : {
1352 2008 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1353 : PG_GETARG_DATUM(0),
1354 : PG_GETARG_DATUM(1)));
1355 :
1356 2008 : PG_RETURN_FLOAT4(1.0 - res);
1357 : }
1358 :
1359 : Datum
1360 12000 : similarity_op(PG_FUNCTION_ARGS)
1361 : {
1362 12000 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1363 : PG_GETARG_DATUM(0),
1364 : PG_GETARG_DATUM(1)));
1365 :
1366 12000 : PG_RETURN_BOOL(res >= similarity_threshold);
1367 : }
1368 :
1369 : Datum
1370 3848 : word_similarity_op(PG_FUNCTION_ARGS)
1371 : {
1372 3848 : text *in1 = PG_GETARG_TEXT_PP(0);
1373 3848 : text *in2 = PG_GETARG_TEXT_PP(1);
1374 : float4 res;
1375 :
1376 3848 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1377 3848 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1378 : WORD_SIMILARITY_CHECK_ONLY);
1379 :
1380 3848 : PG_FREE_IF_COPY(in1, 0);
1381 3848 : PG_FREE_IF_COPY(in2, 1);
1382 3848 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1383 : }
1384 :
1385 : Datum
1386 5852 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
1387 : {
1388 5852 : text *in1 = PG_GETARG_TEXT_PP(0);
1389 5852 : text *in2 = PG_GETARG_TEXT_PP(1);
1390 : float4 res;
1391 :
1392 5852 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1393 5852 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1394 : WORD_SIMILARITY_CHECK_ONLY);
1395 :
1396 5852 : PG_FREE_IF_COPY(in1, 0);
1397 5852 : PG_FREE_IF_COPY(in2, 1);
1398 5852 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1399 : }
1400 :
1401 : Datum
1402 0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
1403 : {
1404 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1405 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1406 : float4 res;
1407 :
1408 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1409 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1410 : 0);
1411 :
1412 0 : PG_FREE_IF_COPY(in1, 0);
1413 0 : PG_FREE_IF_COPY(in2, 1);
1414 0 : PG_RETURN_FLOAT4(1.0 - res);
1415 : }
1416 :
1417 : Datum
1418 1428 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1419 : {
1420 1428 : text *in1 = PG_GETARG_TEXT_PP(0);
1421 1428 : text *in2 = PG_GETARG_TEXT_PP(1);
1422 : float4 res;
1423 :
1424 1428 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1425 1428 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1426 : 0);
1427 :
1428 1428 : PG_FREE_IF_COPY(in1, 0);
1429 1428 : PG_FREE_IF_COPY(in2, 1);
1430 1428 : PG_RETURN_FLOAT4(1.0 - res);
1431 : }
1432 :
1433 : Datum
1434 5060 : strict_word_similarity_op(PG_FUNCTION_ARGS)
1435 : {
1436 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1437 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1438 : float4 res;
1439 :
1440 5060 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1441 5060 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1442 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1443 :
1444 5060 : PG_FREE_IF_COPY(in1, 0);
1445 5060 : PG_FREE_IF_COPY(in2, 1);
1446 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1447 : }
1448 :
1449 : Datum
1450 5060 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1451 : {
1452 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1453 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1454 : float4 res;
1455 :
1456 5060 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1457 5060 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1458 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1459 :
1460 5060 : PG_FREE_IF_COPY(in1, 0);
1461 5060 : PG_FREE_IF_COPY(in2, 1);
1462 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1463 : }
1464 :
1465 : Datum
1466 0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1467 : {
1468 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1469 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1470 : float4 res;
1471 :
1472 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1473 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1474 : WORD_SIMILARITY_STRICT);
1475 :
1476 0 : PG_FREE_IF_COPY(in1, 0);
1477 0 : PG_FREE_IF_COPY(in2, 1);
1478 0 : PG_RETURN_FLOAT4(1.0 - res);
1479 : }
1480 :
1481 : Datum
1482 1440 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1483 : {
1484 1440 : text *in1 = PG_GETARG_TEXT_PP(0);
1485 1440 : text *in2 = PG_GETARG_TEXT_PP(1);
1486 : float4 res;
1487 :
1488 1440 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1489 1440 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1490 : WORD_SIMILARITY_STRICT);
1491 :
1492 1440 : PG_FREE_IF_COPY(in1, 0);
1493 1440 : PG_FREE_IF_COPY(in2, 1);
1494 1440 : PG_RETURN_FLOAT4(1.0 - res);
1495 : }
|