Line data Source code
1 : /*
2 : * contrib/pg_trgm/trgm_op.c
3 : */
4 : #include "postgres.h"
5 :
6 : #include <ctype.h>
7 :
8 : #include "catalog/pg_collation_d.h"
9 : #include "catalog/pg_type.h"
10 : #include "common/int.h"
11 : #include "lib/qunique.h"
12 : #include "miscadmin.h"
13 : #include "trgm.h"
14 : #include "tsearch/ts_locale.h"
15 : #include "utils/formatting.h"
16 : #include "utils/guc.h"
17 : #include "utils/lsyscache.h"
18 : #include "utils/memutils.h"
19 : #include "utils/pg_crc.h"
20 :
21 4 : PG_MODULE_MAGIC_EXT(
22 : .name = "pg_trgm",
23 : .version = PG_VERSION
24 : );
25 :
26 : /* GUC variables */
27 : double similarity_threshold = 0.3f;
28 : double word_similarity_threshold = 0.6f;
29 : double strict_word_similarity_threshold = 0.5f;
30 :
31 2 : PG_FUNCTION_INFO_V1(set_limit);
32 2 : PG_FUNCTION_INFO_V1(show_limit);
33 2 : PG_FUNCTION_INFO_V1(show_trgm);
34 2 : PG_FUNCTION_INFO_V1(similarity);
35 2 : PG_FUNCTION_INFO_V1(word_similarity);
36 2 : PG_FUNCTION_INFO_V1(strict_word_similarity);
37 2 : PG_FUNCTION_INFO_V1(similarity_dist);
38 2 : PG_FUNCTION_INFO_V1(similarity_op);
39 2 : PG_FUNCTION_INFO_V1(word_similarity_op);
40 3 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
41 1 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
42 2 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
43 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
44 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
45 1 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
46 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
47 :
48 : static int CMPTRGM_CHOOSE(const void *a, const void *b);
49 : int (*CMPTRGM) (const void *a, const void *b) = CMPTRGM_CHOOSE;
50 :
51 : /* Trigram with position */
52 : typedef struct
53 : {
54 : trgm trg;
55 : int index;
56 : } pos_trgm;
57 :
58 : /* Trigram bound type */
59 : typedef uint8 TrgmBound;
60 : #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
61 : #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
62 :
63 : /* Word similarity flags */
64 : #define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar
65 : * search pattern in text */
66 : #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
67 : * word bounds */
68 :
69 : /*
70 : * A growable array of trigrams
71 : *
72 : * The actual array of trigrams is in 'datum'. Note that the other fields in
73 : * 'datum', i.e. datum->flags and the varlena length, are not kept up to date
74 : * when items are added to the growable array. We merely reserve the space
75 : * for them here. You must fill those other fields before using 'datum' as a
76 : * proper TRGM datum.
77 : */
78 : typedef struct
79 : {
80 : TRGM *datum; /* trigram array */
81 : int length; /* number of trigrams in the array */
82 : int allocated; /* allocated size of 'datum' (# of trigrams) */
83 : } growable_trgm_array;
84 :
85 : /*
86 : * Allocate a new growable array.
87 : *
88 : * 'slen' is the size of the source string that we're extracting the trigrams
89 : * from. It is used to choose the initial size of the array.
90 : */
91 : static void
92 116206 : init_trgm_array(growable_trgm_array *arr, int slen)
93 : {
94 : size_t init_size;
95 :
96 : /*
97 : * In the extreme case, the input string consists entirely of one
98 : * character words, like "a b c", where each word is expanded to two
99 : * trigrams. This is not a strict upper bound though, because when
100 : * IGNORECASE is defined, we convert the input string to lowercase before
101 : * extracting the trigrams, which in rare cases can expand one input
102 : * character into multiple characters.
103 : */
104 116206 : init_size = (size_t) slen + 1;
105 :
106 : /*
107 : * Guard against possible overflow in the palloc request. (We don't worry
108 : * about the additive constants, since palloc can detect requests that are
109 : * a little above MaxAllocSize --- we just need to prevent integer
110 : * overflow in the multiplications.)
111 : */
112 116206 : if (init_size > MaxAllocSize / sizeof(trgm))
113 0 : ereport(ERROR,
114 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
115 : errmsg("out of memory")));
116 :
117 116206 : arr->datum = palloc(CALCGTSIZE(ARRKEY, init_size));
118 116206 : arr->allocated = init_size;
119 116206 : arr->length = 0;
120 116206 : }
121 :
122 : /* Make sure the array can hold at least 'needed' more trigrams */
123 : static void
124 128428 : enlarge_trgm_array(growable_trgm_array *arr, int needed)
125 : {
126 128428 : size_t new_needed = (size_t) arr->length + needed;
127 :
128 128428 : if (new_needed > arr->allocated)
129 : {
130 : /* Guard against possible overflow, like in init_trgm_array */
131 0 : if (new_needed > MaxAllocSize / sizeof(trgm))
132 0 : ereport(ERROR,
133 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
134 : errmsg("out of memory")));
135 :
136 0 : arr->datum = repalloc(arr->datum, CALCGTSIZE(ARRKEY, new_needed));
137 0 : arr->allocated = new_needed;
138 : }
139 128428 : }
140 :
141 : /*
142 : * Module load callback
143 : */
144 : void
145 4 : _PG_init(void)
146 : {
147 : /* Define custom GUC variables. */
148 4 : DefineCustomRealVariable("pg_trgm.similarity_threshold",
149 : "Sets the threshold used by the % operator.",
150 : "Valid range is 0.0 .. 1.0.",
151 : &similarity_threshold,
152 : 0.3f,
153 : 0.0,
154 : 1.0,
155 : PGC_USERSET,
156 : 0,
157 : NULL,
158 : NULL,
159 : NULL);
160 4 : DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
161 : "Sets the threshold used by the <% operator.",
162 : "Valid range is 0.0 .. 1.0.",
163 : &word_similarity_threshold,
164 : 0.6f,
165 : 0.0,
166 : 1.0,
167 : PGC_USERSET,
168 : 0,
169 : NULL,
170 : NULL,
171 : NULL);
172 4 : DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
173 : "Sets the threshold used by the <<% operator.",
174 : "Valid range is 0.0 .. 1.0.",
175 : &strict_word_similarity_threshold,
176 : 0.5f,
177 : 0.0,
178 : 1.0,
179 : PGC_USERSET,
180 : 0,
181 : NULL,
182 : NULL,
183 : NULL);
184 :
185 4 : MarkGUCPrefixReserved("pg_trgm");
186 4 : }
187 :
188 : #define CMPCHAR(a,b) ( ((a)==(b)) ? 0 : ( ((a)<(b)) ? -1 : 1 ) )
189 :
190 : /*
191 : * Functions for comparing two trgms while treating each char as "signed char" or
192 : * "unsigned char".
193 : */
194 : static inline int
195 6786904 : CMPTRGM_SIGNED(const void *a, const void *b)
196 : {
197 : #define CMPPCHAR_S(a,b,i) CMPCHAR( *(((const signed char*)(a))+i), *(((const signed char*)(b))+i) )
198 :
199 4881152 : return CMPPCHAR_S(a, b, 0) ? CMPPCHAR_S(a, b, 0)
200 14739571 : : (CMPPCHAR_S(a, b, 1) ? CMPPCHAR_S(a, b, 1)
201 3071515 : : CMPPCHAR_S(a, b, 2));
202 : }
203 :
204 : static inline int
205 0 : CMPTRGM_UNSIGNED(const void *a, const void *b)
206 : {
207 : #define CMPPCHAR_UNS(a,b,i) CMPCHAR( *(((const unsigned char*)(a))+i), *(((const unsigned char*)(b))+i) )
208 :
209 0 : return CMPPCHAR_UNS(a, b, 0) ? CMPPCHAR_UNS(a, b, 0)
210 0 : : (CMPPCHAR_UNS(a, b, 1) ? CMPPCHAR_UNS(a, b, 1)
211 0 : : CMPPCHAR_UNS(a, b, 2));
212 : }
213 :
214 : /*
215 : * This gets called on the first call. It replaces the function pointer so
216 : * that subsequent calls are routed directly to the chosen implementation.
217 : */
218 : static int
219 3 : CMPTRGM_CHOOSE(const void *a, const void *b)
220 : {
221 3 : if (GetDefaultCharSignedness())
222 3 : CMPTRGM = CMPTRGM_SIGNED;
223 : else
224 0 : CMPTRGM = CMPTRGM_UNSIGNED;
225 :
226 3 : return CMPTRGM(a, b);
227 : }
228 :
229 : #define ST_SORT trigram_qsort_signed
230 : #define ST_ELEMENT_TYPE_VOID
231 : #define ST_COMPARE(a, b) CMPTRGM_SIGNED(a, b)
232 : #define ST_SCOPE static
233 : #define ST_DEFINE
234 : #define ST_DECLARE
235 : #include "lib/sort_template.h"
236 :
237 : #define ST_SORT trigram_qsort_unsigned
238 : #define ST_ELEMENT_TYPE_VOID
239 : #define ST_COMPARE(a, b) CMPTRGM_UNSIGNED(a, b)
240 : #define ST_SCOPE static
241 : #define ST_DEFINE
242 : #define ST_DECLARE
243 : #include "lib/sort_template.h"
244 :
245 : /* Sort an array of trigrams, handling signedness correctly */
246 : static void
247 89900 : trigram_qsort(trgm *array, size_t n)
248 : {
249 89900 : if (GetDefaultCharSignedness())
250 89900 : trigram_qsort_signed(array, n, sizeof(trgm));
251 : else
252 0 : trigram_qsort_unsigned(array, n, sizeof(trgm));
253 89900 : }
254 :
255 :
256 : /*
257 : * Compare two trigrams for equality. This has the same signature as
258 : * comparison functions used for sorting, so that this can be used with
259 : * qunique(). This doesn't need separate versions for "signed char" and "
260 : * unsigned char" because equality is the same for both.
261 : */
262 : static inline int
263 721456 : CMPTRGM_EQ(const void *a, const void *b)
264 : {
265 721456 : const char *aa = a;
266 721456 : const char *bb = b;
267 :
268 721456 : return aa[0] != bb[0] || aa[1] != bb[1] || aa[2] != bb[2] ? 1 : 0;
269 : }
270 :
271 : /* Deduplicate an array of trigrams */
272 : static size_t
273 89900 : trigram_qunique(trgm *array, size_t n)
274 : {
275 89900 : return qunique(array, n, sizeof(trgm), CMPTRGM_EQ);
276 : }
277 :
278 : /*
279 : * Deprecated function.
280 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
281 : */
282 : Datum
283 2 : set_limit(PG_FUNCTION_ARGS)
284 : {
285 2 : float4 nlimit = PG_GETARG_FLOAT4(0);
286 : char *nlimit_str;
287 : Oid func_out_oid;
288 : bool is_varlena;
289 :
290 2 : getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
291 :
292 2 : nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
293 :
294 2 : SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
295 : PGC_USERSET, PGC_S_SESSION);
296 :
297 2 : PG_RETURN_FLOAT4(similarity_threshold);
298 : }
299 :
300 :
301 : /*
302 : * Get similarity threshold for given index scan strategy number.
303 : */
304 : double
305 45152 : index_strategy_get_limit(StrategyNumber strategy)
306 : {
307 45152 : switch (strategy)
308 : {
309 32346 : case SimilarityStrategyNumber:
310 32346 : return similarity_threshold;
311 6828 : case WordSimilarityStrategyNumber:
312 6828 : return word_similarity_threshold;
313 5978 : case StrictWordSimilarityStrategyNumber:
314 5978 : return strict_word_similarity_threshold;
315 0 : default:
316 0 : elog(ERROR, "unrecognized strategy number: %d", strategy);
317 : break;
318 : }
319 :
320 : return 0.0; /* keep compiler quiet */
321 : }
322 :
323 : /*
324 : * Deprecated function.
325 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
326 : */
327 : Datum
328 20000 : show_limit(PG_FUNCTION_ARGS)
329 : {
330 20000 : PG_RETURN_FLOAT4(similarity_threshold);
331 : }
332 :
333 : /*
334 : * Finds first word in string, returns pointer to the word,
335 : * endword points to the character after word
336 : */
337 : static char *
338 243531 : find_word(char *str, int lenstr, char **endword)
339 : {
340 243531 : char *beginword = str;
341 243531 : const char *endstr = str + lenstr;
342 :
343 257214 : while (beginword < endstr)
344 : {
345 142074 : int clen = pg_mblen_range(beginword, endstr);
346 :
347 142074 : if (ISWORDCHR(beginword, clen))
348 128391 : break;
349 13683 : beginword += clen;
350 : }
351 :
352 243531 : if (beginword >= endstr)
353 115140 : return NULL;
354 :
355 128391 : *endword = beginword;
356 1112113 : while (*endword < endstr)
357 : {
358 997067 : int clen = pg_mblen_range(*endword, endstr);
359 :
360 997067 : if (!ISWORDCHR(*endword, clen))
361 13345 : break;
362 983722 : *endword += clen;
363 : }
364 :
365 128391 : return beginword;
366 : }
367 :
368 : /*
369 : * Reduce a trigram (three possibly multi-byte characters) to a trgm,
370 : * which is always exactly three bytes. If we have three single-byte
371 : * characters, we just use them as-is; otherwise we form a hash value.
372 : */
373 : void
374 1658 : compact_trigram(trgm *tptr, char *str, int bytelen)
375 : {
376 1658 : if (bytelen == 3)
377 : {
378 1475 : CPTRGM(tptr, str);
379 : }
380 : else
381 : {
382 : pg_crc32 crc;
383 :
384 183 : INIT_LEGACY_CRC32(crc);
385 1340 : COMP_LEGACY_CRC32(crc, str, bytelen);
386 183 : FIN_LEGACY_CRC32(crc);
387 :
388 : /*
389 : * use only 3 upper bytes from crc, hope, it's good enough hashing
390 : */
391 183 : CPTRGM(tptr, &crc);
392 : }
393 1658 : }
394 :
395 : /*
396 : * Adds trigrams from the word in 'str' (already padded if necessary).
397 : */
398 : static void
399 128455 : make_trigrams(growable_trgm_array *dst, char *str, int bytelen)
400 : {
401 : trgm *tptr;
402 128455 : char *ptr = str;
403 :
404 128455 : if (bytelen < 3)
405 27 : return;
406 :
407 : /* max number of trigrams = strlen - 2 */
408 128428 : enlarge_trgm_array(dst, bytelen - 2);
409 128428 : tptr = GETARR(dst->datum) + dst->length;
410 :
411 128428 : if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
412 : {
413 0 : while (ptr < str + bytelen - 2)
414 : {
415 0 : CPTRGM(tptr, ptr);
416 0 : ptr++;
417 0 : tptr++;
418 : }
419 : }
420 : else
421 : {
422 : int lenfirst,
423 : lenmiddle,
424 : lenlast;
425 : char *endptr;
426 :
427 : /*
428 : * Fast path as long as there are no multibyte characters
429 : */
430 128428 : if (!IS_HIGHBIT_SET(ptr[0]) && !IS_HIGHBIT_SET(ptr[1]))
431 : {
432 1112043 : while (!IS_HIGHBIT_SET(ptr[2]))
433 : {
434 1112005 : CPTRGM(tptr, ptr);
435 1112005 : ptr++;
436 1112005 : tptr++;
437 :
438 1112005 : if (ptr == str + bytelen - 2)
439 128390 : goto done;
440 : }
441 :
442 38 : lenfirst = 1;
443 38 : lenmiddle = 1;
444 38 : lenlast = pg_mblen_unbounded(ptr + 2);
445 : }
446 : else
447 : {
448 0 : lenfirst = pg_mblen_unbounded(ptr);
449 0 : if (ptr + lenfirst >= str + bytelen)
450 0 : goto done;
451 0 : lenmiddle = pg_mblen_unbounded(ptr + lenfirst);
452 0 : if (ptr + lenfirst + lenmiddle >= str + bytelen)
453 0 : goto done;
454 0 : lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle);
455 : }
456 :
457 : /*
458 : * Slow path to handle any remaining multibyte characters
459 : *
460 : * As we go, 'ptr' points to the beginning of the current
461 : * three-character string and 'endptr' points to just past it.
462 : */
463 38 : endptr = ptr + lenfirst + lenmiddle + lenlast;
464 199 : while (endptr <= str + bytelen)
465 : {
466 199 : compact_trigram(tptr, ptr, endptr - ptr);
467 199 : tptr++;
468 :
469 : /* Advance to the next character */
470 199 : if (endptr == str + bytelen)
471 38 : break;
472 161 : ptr += lenfirst;
473 161 : lenfirst = lenmiddle;
474 161 : lenmiddle = lenlast;
475 161 : lenlast = pg_mblen_unbounded(endptr);
476 161 : endptr += lenlast;
477 : }
478 : }
479 :
480 0 : done:
481 128428 : dst->length = tptr - GETARR(dst->datum);
482 : Assert(dst->length <= dst->allocated);
483 : }
484 :
485 : /*
486 : * Make array of trigrams without sorting and removing duplicate items.
487 : *
488 : * dst: where to return the array of trigrams.
489 : * str: source string, of length slen bytes.
490 : * bounds_p: where to return bounds of trigrams (if needed).
491 : */
492 : static void
493 116151 : generate_trgm_only(growable_trgm_array *dst, char *str, int slen, TrgmBound **bounds_p)
494 : {
495 : size_t buflen;
496 : char *buf;
497 : int bytelen;
498 : char *bword,
499 : *eword;
500 116151 : TrgmBound *bounds = NULL;
501 116151 : int bounds_allocated = 0;
502 :
503 116151 : init_trgm_array(dst, slen);
504 :
505 : /*
506 : * If requested, allocate an array for the bounds, with the same size as
507 : * the trigram array.
508 : */
509 116151 : if (bounds_p)
510 : {
511 6662 : bounds_allocated = dst->allocated;
512 6662 : bounds = *bounds_p = palloc0_array(TrgmBound, bounds_allocated);
513 : }
514 :
515 116151 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
516 1011 : return;
517 :
518 : /*
519 : * Allocate a buffer for case-folded, blank-padded words.
520 : *
521 : * As an initial guess, allocate a buffer large enough to hold the
522 : * original string with padding, which is always enough when compiled with
523 : * !IGNORECASE. If the case-folding produces a string longer than the
524 : * original, we'll grow the buffer.
525 : */
526 115140 : buflen = (size_t) slen + 4;
527 115140 : buf = (char *) palloc(buflen);
528 : if (LPADDING > 0)
529 : {
530 115140 : *buf = ' ';
531 : if (LPADDING > 1)
532 115140 : *(buf + 1) = ' ';
533 : }
534 :
535 115140 : eword = str;
536 243531 : while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
537 : {
538 : int oldlen;
539 :
540 : /* Convert word to lower case before extracting trigrams from it */
541 : #ifdef IGNORECASE
542 : {
543 : char *lowered;
544 :
545 128391 : lowered = str_tolower(bword, eword - bword, DEFAULT_COLLATION_OID);
546 128391 : bytelen = strlen(lowered);
547 :
548 : /* grow the buffer if necessary */
549 128391 : if (bytelen > buflen - 4)
550 : {
551 0 : pfree(buf);
552 0 : buflen = (size_t) bytelen + 4;
553 0 : buf = (char *) palloc(buflen);
554 : if (LPADDING > 0)
555 : {
556 0 : *buf = ' ';
557 : if (LPADDING > 1)
558 0 : *(buf + 1) = ' ';
559 : }
560 : }
561 128391 : memcpy(buf + LPADDING, lowered, bytelen);
562 128391 : pfree(lowered);
563 : }
564 : #else
565 : bytelen = eword - bword;
566 : memcpy(buf + LPADDING, bword, bytelen);
567 : #endif
568 :
569 128391 : buf[LPADDING + bytelen] = ' ';
570 128391 : buf[LPADDING + bytelen + 1] = ' ';
571 :
572 : /* Calculate trigrams marking their bounds if needed */
573 128391 : oldlen = dst->length;
574 128391 : make_trigrams(dst, buf, bytelen + LPADDING + RPADDING);
575 128391 : if (bounds)
576 : {
577 12397 : if (bounds_allocated < dst->length)
578 : {
579 0 : bounds = *bounds_p = repalloc0_array(bounds, TrgmBound, bounds_allocated, dst->allocated);
580 0 : bounds_allocated = dst->allocated;
581 : }
582 :
583 12397 : bounds[oldlen] |= TRGM_BOUND_LEFT;
584 12397 : bounds[dst->length - 1] |= TRGM_BOUND_RIGHT;
585 : }
586 : }
587 :
588 115140 : pfree(buf);
589 : }
590 :
591 : /*
592 : * Make array of trigrams with sorting and removing duplicate items.
593 : *
594 : * str: source string, of length slen bytes.
595 : *
596 : * Returns the sorted array of unique trigrams.
597 : */
598 : TRGM *
599 89895 : generate_trgm(char *str, int slen)
600 : {
601 : TRGM *trg;
602 : growable_trgm_array arr;
603 : int len;
604 :
605 89895 : generate_trgm_only(&arr, str, slen, NULL);
606 89895 : len = arr.length;
607 89895 : trg = arr.datum;
608 89895 : trg->flag = ARRKEY;
609 :
610 : /*
611 : * Make trigrams unique.
612 : */
613 89895 : if (len > 1)
614 : {
615 89883 : trigram_qsort(GETARR(trg), len);
616 89883 : len = trigram_qunique(GETARR(trg), len);
617 : }
618 :
619 89895 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
620 :
621 89895 : return trg;
622 : }
623 :
624 : /*
625 : * Make array of positional trigrams from two trigram arrays trg1 and trg2.
626 : *
627 : * trg1: trigram array of search pattern, of length len1. trg1 is required
628 : * word which positions don't matter and replaced with -1.
629 : * trg2: trigram array of text, of length len2. trg2 is haystack where we
630 : * search and have to store its positions.
631 : *
632 : * Returns concatenated trigram array.
633 : */
634 : static pos_trgm *
635 13128 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
636 : {
637 : pos_trgm *result;
638 : int i,
639 13128 : len = len1 + len2;
640 :
641 13128 : result = palloc_array(pos_trgm, len);
642 :
643 121866 : for (i = 0; i < len1; i++)
644 : {
645 108738 : memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
646 108738 : result[i].index = -1;
647 : }
648 :
649 205224 : for (i = 0; i < len2; i++)
650 : {
651 192096 : memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
652 192096 : result[i + len1].index = i;
653 : }
654 :
655 13128 : return result;
656 : }
657 :
658 : /*
659 : * Compare position trigrams: compare trigrams first and position second.
660 : */
661 : static int
662 1351321 : comp_ptrgm(const void *v1, const void *v2)
663 : {
664 1351321 : const pos_trgm *p1 = (const pos_trgm *) v1;
665 1351321 : const pos_trgm *p2 = (const pos_trgm *) v2;
666 : int cmp;
667 :
668 1351321 : cmp = CMPTRGM(p1->trg, p2->trg);
669 1351321 : if (cmp != 0)
670 1311618 : return cmp;
671 :
672 39703 : return pg_cmp_s32(p1->index, p2->index);
673 : }
674 :
675 : /*
676 : * Iterative search function which calculates maximum similarity with word in
677 : * the string. Maximum similarity is only calculated only if the flag
678 : * WORD_SIMILARITY_CHECK_ONLY isn't set.
679 : *
680 : * trg2indexes: array which stores indexes of the array "found".
681 : * found: array which stores true of false values.
682 : * ulen1: count of unique trigrams of array "trg1".
683 : * len2: length of array "trg2" and array "trg2indexes".
684 : * len: length of the array "found".
685 : * flags: set of boolean flags parameterizing similarity calculation.
686 : * bounds: whether each trigram is left/right bound of word.
687 : *
688 : * Returns word similarity.
689 : */
690 : static float4
691 13128 : iterate_word_similarity(int *trg2indexes,
692 : bool *found,
693 : int ulen1,
694 : int len2,
695 : int len,
696 : uint8 flags,
697 : TrgmBound *bounds)
698 : {
699 : int *lastpos,
700 : i,
701 13128 : ulen2 = 0,
702 13128 : count = 0,
703 13128 : upper = -1,
704 : lower;
705 : float4 smlr_cur,
706 13128 : smlr_max = 0.0f;
707 : double threshold;
708 :
709 : Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
710 :
711 : /* Select appropriate threshold */
712 26256 : threshold = (flags & WORD_SIMILARITY_STRICT) ?
713 13128 : strict_word_similarity_threshold :
714 : word_similarity_threshold;
715 :
716 : /*
717 : * Consider first trigram as initial lower bound for strict word
718 : * similarity, or initialize it later with first trigram present for plain
719 : * word similarity.
720 : */
721 13128 : lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
722 :
723 : /* Memorise last position of each trigram */
724 13128 : lastpos = palloc_array(int, len);
725 13128 : memset(lastpos, -1, sizeof(int) * len);
726 :
727 196654 : for (i = 0; i < len2; i++)
728 : {
729 : int trgindex;
730 :
731 185310 : CHECK_FOR_INTERRUPTS();
732 :
733 : /* Get index of next trigram */
734 185310 : trgindex = trg2indexes[i];
735 :
736 : /* Update last position of this trigram */
737 185310 : if (lower >= 0 || found[trgindex])
738 : {
739 135784 : if (lastpos[trgindex] < 0)
740 : {
741 133932 : ulen2++;
742 133932 : if (found[trgindex])
743 30756 : count++;
744 : }
745 135784 : lastpos[trgindex] = i;
746 : }
747 :
748 : /*
749 : * Adjust upper bound if trigram is upper bound of word for strict
750 : * word similarity, or if trigram is present in required substring for
751 : * plain word similarity
752 : */
753 274367 : if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
754 89057 : : found[trgindex])
755 : {
756 : int prev_lower,
757 : tmp_ulen2,
758 : tmp_lower,
759 : tmp_count;
760 :
761 25634 : upper = i;
762 25634 : if (lower == -1)
763 : {
764 4695 : lower = i;
765 4695 : ulen2 = 1;
766 : }
767 :
768 25634 : smlr_cur = CALCSML(count, ulen1, ulen2);
769 :
770 : /* Also try to adjust lower bound for greater similarity */
771 25634 : tmp_count = count;
772 25634 : tmp_ulen2 = ulen2;
773 25634 : prev_lower = lower;
774 208569 : for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
775 : {
776 : float smlr_tmp;
777 : int tmp_trgindex;
778 :
779 : /*
780 : * Adjust lower bound only if trigram is lower bound of word
781 : * for strict word similarity, or consider every trigram as
782 : * lower bound for plain word similarity.
783 : */
784 184719 : if (!(flags & WORD_SIMILARITY_STRICT)
785 145160 : || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
786 : {
787 59689 : smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
788 59689 : if (smlr_tmp > smlr_cur)
789 : {
790 3512 : smlr_cur = smlr_tmp;
791 3512 : ulen2 = tmp_ulen2;
792 3512 : lower = tmp_lower;
793 3512 : count = tmp_count;
794 : }
795 :
796 : /*
797 : * If we only check that word similarity is greater than
798 : * threshold we do not need to calculate a maximum
799 : * similarity.
800 : */
801 59689 : if ((flags & WORD_SIMILARITY_CHECK_ONLY)
802 37114 : && smlr_cur >= threshold)
803 1784 : break;
804 : }
805 :
806 182935 : tmp_trgindex = trg2indexes[tmp_lower];
807 182935 : if (lastpos[tmp_trgindex] == tmp_lower)
808 : {
809 180676 : tmp_ulen2--;
810 180676 : if (found[tmp_trgindex])
811 46575 : tmp_count--;
812 : }
813 : }
814 :
815 25634 : smlr_max = Max(smlr_max, smlr_cur);
816 :
817 : /*
818 : * if we only check that word similarity is greater than threshold
819 : * we do not need to calculate a maximum similarity.
820 : */
821 25634 : if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
822 1784 : break;
823 :
824 40605 : for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
825 : {
826 : int tmp_trgindex;
827 :
828 16755 : tmp_trgindex = trg2indexes[tmp_lower];
829 16755 : if (lastpos[tmp_trgindex] == tmp_lower)
830 16007 : lastpos[tmp_trgindex] = -1;
831 : }
832 : }
833 : }
834 :
835 13128 : pfree(lastpos);
836 :
837 13128 : return smlr_max;
838 : }
839 :
840 : /*
841 : * Calculate word similarity.
842 : * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
843 : * are used to calculate word similarity using iterate_word_similarity().
844 : *
845 : * "trg2indexes" is array which stores indexes of the array "found".
846 : * In other words:
847 : * trg2indexes[j] = i;
848 : * found[i] = true (or false);
849 : * If found[i] == true then there is trigram trg2[j] in array "trg1".
850 : * If found[i] == false then there is not trigram trg2[j] in array "trg1".
851 : *
852 : * str1: search pattern string, of length slen1 bytes.
853 : * str2: text in which we are looking for a word, of length slen2 bytes.
854 : * flags: set of boolean flags parameterizing similarity calculation.
855 : *
856 : * Returns word similarity.
857 : */
858 : static float4
859 13128 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
860 : uint8 flags)
861 : {
862 : bool *found;
863 : pos_trgm *ptrg;
864 : growable_trgm_array trg1;
865 : growable_trgm_array trg2;
866 : int len1,
867 : len2,
868 : len,
869 : i,
870 : j,
871 : ulen1;
872 : int *trg2indexes;
873 : float4 result;
874 13128 : TrgmBound *bounds = NULL;
875 :
876 : /* Make positional trigrams */
877 :
878 13128 : generate_trgm_only(&trg1, str1, slen1, NULL);
879 13128 : len1 = trg1.length;
880 13128 : generate_trgm_only(&trg2, str2, slen2, (flags & WORD_SIMILARITY_STRICT) ? &bounds : NULL);
881 13128 : len2 = trg2.length;
882 :
883 13128 : ptrg = make_positional_trgm(GETARR(trg1.datum), len1, GETARR(trg2.datum), len2);
884 13128 : len = len1 + len2;
885 13128 : qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
886 :
887 13128 : pfree(trg1.datum);
888 13128 : pfree(trg2.datum);
889 :
890 : /*
891 : * Merge positional trigrams array: enumerate each trigram and find its
892 : * presence in required word.
893 : */
894 13128 : trg2indexes = palloc_array(int, len2);
895 13128 : found = palloc0_array(bool, len);
896 :
897 13128 : ulen1 = 0;
898 13128 : j = 0;
899 313962 : for (i = 0; i < len; i++)
900 : {
901 300834 : if (i > 0)
902 : {
903 287706 : int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
904 :
905 287706 : if (cmp != 0)
906 : {
907 253506 : if (found[j])
908 101138 : ulen1++;
909 253506 : j++;
910 : }
911 : }
912 :
913 300834 : if (ptrg[i].index >= 0)
914 : {
915 192096 : trg2indexes[ptrg[i].index] = j;
916 : }
917 : else
918 : {
919 108738 : found[j] = true;
920 : }
921 : }
922 13128 : if (found[j])
923 7600 : ulen1++;
924 :
925 : /* Run iterative procedure to find maximum similarity with word */
926 13128 : result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
927 : flags, bounds);
928 :
929 13128 : pfree(trg2indexes);
930 13128 : pfree(found);
931 13128 : pfree(ptrg);
932 :
933 13128 : return result;
934 : }
935 :
936 :
937 : /*
938 : * Extract the next non-wildcard part of a search string, i.e. a word bounded
939 : * by '_' or '%' meta-characters, non-word characters or string end.
940 : *
941 : * str: source string, of length lenstr bytes (need not be null-terminated)
942 : * buf: where to return the substring (must be long enough)
943 : * *bytelen: receives byte length of the found substring
944 : *
945 : * Returns pointer to end+1 of the found substring in the source string.
946 : * Returns NULL if no word found (in which case buf, bytelen is not set)
947 : *
948 : * If the found word is bounded by non-word characters or string boundaries
949 : * then this function will include corresponding padding spaces into buf.
950 : */
951 : static const char *
952 119 : get_wildcard_part(const char *str, int lenstr,
953 : char *buf, int *bytelen)
954 : {
955 119 : const char *beginword = str;
956 : const char *endword;
957 119 : const char *endstr = str + lenstr;
958 119 : char *s = buf;
959 119 : bool in_leading_wildcard_meta = false;
960 119 : bool in_trailing_wildcard_meta = false;
961 119 : bool in_escape = false;
962 : int clen;
963 :
964 : /*
965 : * Find the first word character, remembering whether preceding character
966 : * was wildcard meta-character. Note that the in_escape state persists
967 : * from this loop to the next one, since we may exit at a word character
968 : * that is in_escape.
969 : */
970 241 : while (beginword < endstr)
971 : {
972 186 : clen = pg_mblen_range(beginword, endstr);
973 :
974 186 : if (in_escape)
975 : {
976 3 : if (ISWORDCHR(beginword, clen))
977 3 : break;
978 0 : in_escape = false;
979 0 : in_leading_wildcard_meta = false;
980 : }
981 : else
982 : {
983 183 : if (ISESCAPECHAR(beginword))
984 3 : in_escape = true;
985 180 : else if (ISWILDCARDCHAR(beginword))
986 104 : in_leading_wildcard_meta = true;
987 76 : else if (ISWORDCHR(beginword, clen))
988 61 : break;
989 : else
990 15 : in_leading_wildcard_meta = false;
991 : }
992 122 : beginword += clen;
993 : }
994 :
995 : /*
996 : * Handle string end.
997 : */
998 119 : if (beginword - str >= lenstr)
999 55 : return NULL;
1000 :
1001 : /*
1002 : * Add left padding spaces if preceding character wasn't wildcard
1003 : * meta-character.
1004 : */
1005 64 : if (!in_leading_wildcard_meta)
1006 : {
1007 : if (LPADDING > 0)
1008 : {
1009 15 : *s++ = ' ';
1010 : if (LPADDING > 1)
1011 15 : *s++ = ' ';
1012 : }
1013 : }
1014 :
1015 : /*
1016 : * Copy data into buf until wildcard meta-character, non-word character or
1017 : * string boundary. Strip escapes during copy.
1018 : */
1019 64 : endword = beginword;
1020 244 : while (endword < endstr)
1021 : {
1022 244 : clen = pg_mblen_range(endword, endstr);
1023 244 : if (in_escape)
1024 : {
1025 3 : if (ISWORDCHR(endword, clen))
1026 : {
1027 3 : memcpy(s, endword, clen);
1028 3 : s += clen;
1029 : }
1030 : else
1031 : {
1032 : /*
1033 : * Back up endword to the escape character when stopping at an
1034 : * escaped char, so that subsequent get_wildcard_part will
1035 : * restart from the escape character. We assume here that
1036 : * escape chars are single-byte.
1037 : */
1038 0 : endword--;
1039 0 : break;
1040 : }
1041 3 : in_escape = false;
1042 : }
1043 : else
1044 : {
1045 241 : if (ISESCAPECHAR(endword))
1046 0 : in_escape = true;
1047 241 : else if (ISWILDCARDCHAR(endword))
1048 : {
1049 55 : in_trailing_wildcard_meta = true;
1050 55 : break;
1051 : }
1052 186 : else if (ISWORDCHR(endword, clen))
1053 : {
1054 177 : memcpy(s, endword, clen);
1055 177 : s += clen;
1056 : }
1057 : else
1058 9 : break;
1059 : }
1060 180 : endword += clen;
1061 : }
1062 :
1063 : /*
1064 : * Add right padding spaces if next character isn't wildcard
1065 : * meta-character.
1066 : */
1067 64 : if (!in_trailing_wildcard_meta)
1068 : {
1069 : if (RPADDING > 0)
1070 : {
1071 9 : *s++ = ' ';
1072 : if (RPADDING > 1)
1073 : *s++ = ' ';
1074 : }
1075 : }
1076 :
1077 64 : *bytelen = s - buf;
1078 64 : return endword;
1079 : }
1080 :
1081 : /*
1082 : * Generates trigrams for wildcard search string.
1083 : *
1084 : * Returns array of trigrams that must occur in any string that matches the
1085 : * wildcard string. For example, given pattern "a%bcd%" the trigrams
1086 : * " a", "bcd" would be extracted.
1087 : */
1088 : TRGM *
1089 55 : generate_wildcard_trgm(const char *str, int slen)
1090 : {
1091 : TRGM *trg;
1092 : growable_trgm_array arr;
1093 : char *buf;
1094 : int len,
1095 : bytelen;
1096 : const char *eword;
1097 :
1098 55 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
1099 : {
1100 0 : trg = (TRGM *) palloc(TRGMHDRSIZE);
1101 0 : trg->flag = ARRKEY;
1102 0 : SET_VARSIZE(trg, TRGMHDRSIZE);
1103 0 : return trg;
1104 : }
1105 :
1106 55 : init_trgm_array(&arr, slen);
1107 :
1108 : /* Allocate a buffer for blank-padded, but not yet case-folded, words */
1109 55 : buf = palloc_array(char, slen + 4);
1110 :
1111 : /*
1112 : * Extract trigrams from each substring extracted by get_wildcard_part.
1113 : */
1114 55 : eword = str;
1115 119 : while ((eword = get_wildcard_part(eword, slen - (eword - str),
1116 119 : buf, &bytelen)) != NULL)
1117 : {
1118 : char *word;
1119 :
1120 : #ifdef IGNORECASE
1121 64 : word = str_tolower(buf, bytelen, DEFAULT_COLLATION_OID);
1122 64 : bytelen = strlen(word);
1123 : #else
1124 : word = buf;
1125 : #endif
1126 :
1127 : /*
1128 : * count trigrams
1129 : */
1130 64 : make_trigrams(&arr, word, bytelen);
1131 :
1132 : #ifdef IGNORECASE
1133 64 : pfree(word);
1134 : #endif
1135 : }
1136 :
1137 55 : pfree(buf);
1138 :
1139 : /*
1140 : * Make trigrams unique.
1141 : */
1142 55 : trg = arr.datum;
1143 55 : len = arr.length;
1144 55 : if (len > 1)
1145 : {
1146 17 : trigram_qsort(GETARR(trg), len);
1147 17 : len = trigram_qunique(GETARR(trg), len);
1148 : }
1149 :
1150 55 : trg->flag = ARRKEY;
1151 55 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
1152 :
1153 55 : return trg;
1154 : }
1155 :
1156 : uint32
1157 34829 : trgm2int(trgm *ptr)
1158 : {
1159 34829 : uint32 val = 0;
1160 :
1161 34829 : val |= *(((unsigned char *) ptr));
1162 34829 : val <<= 8;
1163 34829 : val |= *(((unsigned char *) ptr) + 1);
1164 34829 : val <<= 8;
1165 34829 : val |= *(((unsigned char *) ptr) + 2);
1166 :
1167 34829 : return val;
1168 : }
1169 :
1170 : Datum
1171 7 : show_trgm(PG_FUNCTION_ARGS)
1172 : {
1173 7 : text *in = PG_GETARG_TEXT_PP(0);
1174 : TRGM *trg;
1175 : Datum *d;
1176 : ArrayType *a;
1177 : trgm *ptr;
1178 : int i;
1179 :
1180 7 : trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
1181 7 : d = palloc_array(Datum, 1 + ARRNELEM(trg));
1182 :
1183 44 : for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
1184 : {
1185 37 : text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
1186 :
1187 37 : if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
1188 : {
1189 0 : snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
1190 0 : SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
1191 : }
1192 : else
1193 : {
1194 37 : SET_VARSIZE(item, VARHDRSZ + 3);
1195 37 : CPTRGM(VARDATA(item), ptr);
1196 : }
1197 37 : d[i] = PointerGetDatum(item);
1198 : }
1199 :
1200 7 : a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
1201 :
1202 44 : for (i = 0; i < ARRNELEM(trg); i++)
1203 37 : pfree(DatumGetPointer(d[i]));
1204 :
1205 7 : pfree(d);
1206 7 : pfree(trg);
1207 7 : PG_FREE_IF_COPY(in, 0);
1208 :
1209 7 : PG_RETURN_POINTER(a);
1210 : }
1211 :
1212 : float4
1213 68623 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
1214 : {
1215 : trgm *ptr1,
1216 : *ptr2;
1217 68623 : int count = 0;
1218 : int len1,
1219 : len2;
1220 :
1221 68623 : ptr1 = GETARR(trg1);
1222 68623 : ptr2 = GETARR(trg2);
1223 :
1224 68623 : len1 = ARRNELEM(trg1);
1225 68623 : len2 = ARRNELEM(trg2);
1226 :
1227 : /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1228 68623 : if (len1 <= 0 || len2 <= 0)
1229 1 : return (float4) 0.0;
1230 :
1231 872381 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1232 : {
1233 803759 : int res = CMPTRGM(ptr1, ptr2);
1234 :
1235 803759 : if (res < 0)
1236 180986 : ptr1++;
1237 622773 : else if (res > 0)
1238 212069 : ptr2++;
1239 : else
1240 : {
1241 410704 : ptr1++;
1242 410704 : ptr2++;
1243 410704 : count++;
1244 : }
1245 : }
1246 :
1247 : /*
1248 : * If inexact then len2 is equal to count, because we don't know actual
1249 : * length of second string in inexact search and we can assume that count
1250 : * is a lower bound of len2.
1251 : */
1252 68622 : return CALCSML(count, len1, inexact ? count : len2);
1253 : }
1254 :
1255 :
1256 : /*
1257 : * Returns whether trg2 contains all trigrams in trg1.
1258 : * This relies on the trigram arrays being sorted.
1259 : */
1260 : bool
1261 190 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
1262 : {
1263 : trgm *ptr1,
1264 : *ptr2;
1265 : int len1,
1266 : len2;
1267 :
1268 190 : ptr1 = GETARR(trg1);
1269 190 : ptr2 = GETARR(trg2);
1270 :
1271 190 : len1 = ARRNELEM(trg1);
1272 190 : len2 = ARRNELEM(trg2);
1273 :
1274 622 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1275 : {
1276 599 : int res = CMPTRGM(ptr1, ptr2);
1277 :
1278 599 : if (res < 0)
1279 167 : return false;
1280 432 : else if (res > 0)
1281 320 : ptr2++;
1282 : else
1283 : {
1284 112 : ptr1++;
1285 112 : ptr2++;
1286 : }
1287 : }
1288 23 : if (ptr1 - GETARR(trg1) < len1)
1289 4 : return false;
1290 : else
1291 19 : return true;
1292 : }
1293 :
1294 : /*
1295 : * Return a palloc'd boolean array showing, for each trigram in "query",
1296 : * whether it is present in the trigram array "key".
1297 : * This relies on the "key" array being sorted, but "query" need not be.
1298 : */
1299 : bool *
1300 2150 : trgm_presence_map(TRGM *query, TRGM *key)
1301 : {
1302 : bool *result;
1303 2150 : trgm *ptrq = GETARR(query),
1304 2150 : *ptrk = GETARR(key);
1305 2150 : int lenq = ARRNELEM(query),
1306 2150 : lenk = ARRNELEM(key),
1307 : i;
1308 :
1309 2150 : result = palloc0_array(bool, lenq);
1310 :
1311 : /* for each query trigram, do a binary search in the key array */
1312 507560 : for (i = 0; i < lenq; i++)
1313 : {
1314 505410 : int lo = 0;
1315 505410 : int hi = lenk;
1316 :
1317 2373653 : while (lo < hi)
1318 : {
1319 1876282 : int mid = (lo + hi) / 2;
1320 1876282 : int res = CMPTRGM(ptrq, ptrk + mid);
1321 :
1322 1876282 : if (res < 0)
1323 784082 : hi = mid;
1324 1092200 : else if (res > 0)
1325 1084161 : lo = mid + 1;
1326 : else
1327 : {
1328 8039 : result[i] = true;
1329 8039 : break;
1330 : }
1331 : }
1332 505410 : ptrq++;
1333 : }
1334 :
1335 2150 : return result;
1336 : }
1337 :
1338 : Datum
1339 31452 : similarity(PG_FUNCTION_ARGS)
1340 : {
1341 31452 : text *in1 = PG_GETARG_TEXT_PP(0);
1342 31452 : text *in2 = PG_GETARG_TEXT_PP(1);
1343 : TRGM *trg1,
1344 : *trg2;
1345 : float4 res;
1346 :
1347 31452 : trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1348 31452 : trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1349 :
1350 31452 : res = cnt_sml(trg1, trg2, false);
1351 :
1352 31452 : pfree(trg1);
1353 31452 : pfree(trg2);
1354 31452 : PG_FREE_IF_COPY(in1, 0);
1355 31452 : PG_FREE_IF_COPY(in2, 1);
1356 :
1357 31452 : PG_RETURN_FLOAT4(res);
1358 : }
1359 :
1360 : Datum
1361 902 : word_similarity(PG_FUNCTION_ARGS)
1362 : {
1363 902 : text *in1 = PG_GETARG_TEXT_PP(0);
1364 902 : text *in2 = PG_GETARG_TEXT_PP(1);
1365 : float4 res;
1366 :
1367 902 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1368 902 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1369 : 0);
1370 :
1371 902 : PG_FREE_IF_COPY(in1, 0);
1372 902 : PG_FREE_IF_COPY(in2, 1);
1373 902 : PG_RETURN_FLOAT4(res);
1374 : }
1375 :
1376 : Datum
1377 882 : strict_word_similarity(PG_FUNCTION_ARGS)
1378 : {
1379 882 : text *in1 = PG_GETARG_TEXT_PP(0);
1380 882 : text *in2 = PG_GETARG_TEXT_PP(1);
1381 : float4 res;
1382 :
1383 882 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1384 882 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1385 : WORD_SIMILARITY_STRICT);
1386 :
1387 882 : PG_FREE_IF_COPY(in1, 0);
1388 882 : PG_FREE_IF_COPY(in2, 1);
1389 882 : PG_RETURN_FLOAT4(res);
1390 : }
1391 :
1392 : Datum
1393 1004 : similarity_dist(PG_FUNCTION_ARGS)
1394 : {
1395 1004 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1396 : PG_GETARG_DATUM(0),
1397 : PG_GETARG_DATUM(1)));
1398 :
1399 1004 : PG_RETURN_FLOAT4(1.0 - res);
1400 : }
1401 :
1402 : Datum
1403 6000 : similarity_op(PG_FUNCTION_ARGS)
1404 : {
1405 6000 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1406 : PG_GETARG_DATUM(0),
1407 : PG_GETARG_DATUM(1)));
1408 :
1409 6000 : PG_RETURN_BOOL(res >= similarity_threshold);
1410 : }
1411 :
1412 : Datum
1413 1924 : word_similarity_op(PG_FUNCTION_ARGS)
1414 : {
1415 1924 : text *in1 = PG_GETARG_TEXT_PP(0);
1416 1924 : text *in2 = PG_GETARG_TEXT_PP(1);
1417 : float4 res;
1418 :
1419 1924 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1420 1924 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1421 : WORD_SIMILARITY_CHECK_ONLY);
1422 :
1423 1924 : PG_FREE_IF_COPY(in1, 0);
1424 1924 : PG_FREE_IF_COPY(in2, 1);
1425 1924 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1426 : }
1427 :
1428 : Datum
1429 2926 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
1430 : {
1431 2926 : text *in1 = PG_GETARG_TEXT_PP(0);
1432 2926 : text *in2 = PG_GETARG_TEXT_PP(1);
1433 : float4 res;
1434 :
1435 2926 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1436 2926 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1437 : WORD_SIMILARITY_CHECK_ONLY);
1438 :
1439 2926 : PG_FREE_IF_COPY(in1, 0);
1440 2926 : PG_FREE_IF_COPY(in2, 1);
1441 2926 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1442 : }
1443 :
1444 : Datum
1445 0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
1446 : {
1447 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1448 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1449 : float4 res;
1450 :
1451 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1452 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1453 : 0);
1454 :
1455 0 : PG_FREE_IF_COPY(in1, 0);
1456 0 : PG_FREE_IF_COPY(in2, 1);
1457 0 : PG_RETURN_FLOAT4(1.0 - res);
1458 : }
1459 :
1460 : Datum
1461 714 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1462 : {
1463 714 : text *in1 = PG_GETARG_TEXT_PP(0);
1464 714 : text *in2 = PG_GETARG_TEXT_PP(1);
1465 : float4 res;
1466 :
1467 714 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1468 714 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1469 : 0);
1470 :
1471 714 : PG_FREE_IF_COPY(in1, 0);
1472 714 : PG_FREE_IF_COPY(in2, 1);
1473 714 : PG_RETURN_FLOAT4(1.0 - res);
1474 : }
1475 :
1476 : Datum
1477 2530 : strict_word_similarity_op(PG_FUNCTION_ARGS)
1478 : {
1479 2530 : text *in1 = PG_GETARG_TEXT_PP(0);
1480 2530 : text *in2 = PG_GETARG_TEXT_PP(1);
1481 : float4 res;
1482 :
1483 2530 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1484 2530 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1485 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1486 :
1487 2530 : PG_FREE_IF_COPY(in1, 0);
1488 2530 : PG_FREE_IF_COPY(in2, 1);
1489 2530 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1490 : }
1491 :
1492 : Datum
1493 2530 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1494 : {
1495 2530 : text *in1 = PG_GETARG_TEXT_PP(0);
1496 2530 : text *in2 = PG_GETARG_TEXT_PP(1);
1497 : float4 res;
1498 :
1499 2530 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1500 2530 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1501 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1502 :
1503 2530 : PG_FREE_IF_COPY(in1, 0);
1504 2530 : PG_FREE_IF_COPY(in2, 1);
1505 2530 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1506 : }
1507 :
1508 : Datum
1509 0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1510 : {
1511 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1512 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1513 : float4 res;
1514 :
1515 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1516 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1517 : WORD_SIMILARITY_STRICT);
1518 :
1519 0 : PG_FREE_IF_COPY(in1, 0);
1520 0 : PG_FREE_IF_COPY(in2, 1);
1521 0 : PG_RETURN_FLOAT4(1.0 - res);
1522 : }
1523 :
1524 : Datum
1525 720 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1526 : {
1527 720 : text *in1 = PG_GETARG_TEXT_PP(0);
1528 720 : text *in2 = PG_GETARG_TEXT_PP(1);
1529 : float4 res;
1530 :
1531 720 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1532 720 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1533 : WORD_SIMILARITY_STRICT);
1534 :
1535 720 : PG_FREE_IF_COPY(in1, 0);
1536 720 : PG_FREE_IF_COPY(in2, 1);
1537 720 : PG_RETURN_FLOAT4(1.0 - res);
1538 : }
|