Line data Source code
1 : /*
2 : * contrib/pg_trgm/trgm_op.c
3 : */
4 : #include "postgres.h"
5 :
6 : #include <ctype.h>
7 :
8 : #include "catalog/pg_type.h"
9 : #include "lib/qunique.h"
10 : #include "miscadmin.h"
11 : #include "trgm.h"
12 : #include "tsearch/ts_locale.h"
13 : #include "utils/guc.h"
14 : #include "utils/lsyscache.h"
15 : #include "utils/memutils.h"
16 : #include "utils/pg_crc.h"
17 :
18 6 : PG_MODULE_MAGIC;
19 :
20 : /* GUC variables */
21 : double similarity_threshold = 0.3f;
22 : double word_similarity_threshold = 0.6f;
23 : double strict_word_similarity_threshold = 0.5f;
24 :
25 4 : PG_FUNCTION_INFO_V1(set_limit);
26 4 : PG_FUNCTION_INFO_V1(show_limit);
27 4 : PG_FUNCTION_INFO_V1(show_trgm);
28 4 : PG_FUNCTION_INFO_V1(similarity);
29 4 : PG_FUNCTION_INFO_V1(word_similarity);
30 4 : PG_FUNCTION_INFO_V1(strict_word_similarity);
31 4 : PG_FUNCTION_INFO_V1(similarity_dist);
32 4 : PG_FUNCTION_INFO_V1(similarity_op);
33 4 : PG_FUNCTION_INFO_V1(word_similarity_op);
34 4 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
35 2 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
36 4 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
37 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
38 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
39 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
40 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
41 :
42 : /* Trigram with position */
43 : typedef struct
44 : {
45 : trgm trg;
46 : int index;
47 : } pos_trgm;
48 :
49 : /* Trigram bound type */
50 : typedef uint8 TrgmBound;
51 : #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
52 : #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
53 :
54 : /* Word similarity flags */
55 : #define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar
56 : * search pattern in text */
57 : #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
58 : * word bounds */
59 :
60 : /*
61 : * Module load callback
62 : */
63 : void
64 6 : _PG_init(void)
65 : {
66 : /* Define custom GUC variables. */
67 6 : DefineCustomRealVariable("pg_trgm.similarity_threshold",
68 : "Sets the threshold used by the % operator.",
69 : "Valid range is 0.0 .. 1.0.",
70 : &similarity_threshold,
71 : 0.3f,
72 : 0.0,
73 : 1.0,
74 : PGC_USERSET,
75 : 0,
76 : NULL,
77 : NULL,
78 : NULL);
79 6 : DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
80 : "Sets the threshold used by the <% operator.",
81 : "Valid range is 0.0 .. 1.0.",
82 : &word_similarity_threshold,
83 : 0.6f,
84 : 0.0,
85 : 1.0,
86 : PGC_USERSET,
87 : 0,
88 : NULL,
89 : NULL,
90 : NULL);
91 6 : DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
92 : "Sets the threshold used by the <<% operator.",
93 : "Valid range is 0.0 .. 1.0.",
94 : &strict_word_similarity_threshold,
95 : 0.5f,
96 : 0.0,
97 : 1.0,
98 : PGC_USERSET,
99 : 0,
100 : NULL,
101 : NULL,
102 : NULL);
103 :
104 6 : MarkGUCPrefixReserved("pg_trgm");
105 6 : }
106 :
107 : /*
108 : * Deprecated function.
109 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
110 : */
111 : Datum
112 4 : set_limit(PG_FUNCTION_ARGS)
113 : {
114 4 : float4 nlimit = PG_GETARG_FLOAT4(0);
115 : char *nlimit_str;
116 : Oid func_out_oid;
117 : bool is_varlena;
118 :
119 4 : getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
120 :
121 4 : nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
122 :
123 4 : SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
124 : PGC_USERSET, PGC_S_SESSION);
125 :
126 4 : PG_RETURN_FLOAT4(similarity_threshold);
127 : }
128 :
129 :
130 : /*
131 : * Get similarity threshold for given index scan strategy number.
132 : */
133 : double
134 86400 : index_strategy_get_limit(StrategyNumber strategy)
135 : {
136 86400 : switch (strategy)
137 : {
138 64808 : case SimilarityStrategyNumber:
139 64808 : return similarity_threshold;
140 9636 : case WordSimilarityStrategyNumber:
141 9636 : return word_similarity_threshold;
142 11956 : case StrictWordSimilarityStrategyNumber:
143 11956 : return strict_word_similarity_threshold;
144 0 : default:
145 0 : elog(ERROR, "unrecognized strategy number: %d", strategy);
146 : break;
147 : }
148 :
149 : return 0.0; /* keep compiler quiet */
150 : }
151 :
152 : /*
153 : * Deprecated function.
154 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
155 : */
156 : Datum
157 40000 : show_limit(PG_FUNCTION_ARGS)
158 : {
159 40000 : PG_RETURN_FLOAT4(similarity_threshold);
160 : }
161 :
162 : static int
163 6265240 : comp_trgm(const void *a, const void *b)
164 : {
165 6265240 : return CMPTRGM(a, b);
166 : }
167 :
168 : /*
169 : * Finds first word in string, returns pointer to the word,
170 : * endword points to the character after word
171 : */
172 : static char *
173 478818 : find_word(char *str, int lenstr, char **endword, int *charlen)
174 : {
175 478818 : char *beginword = str;
176 :
177 506142 : while (beginword - str < lenstr && !ISWORDCHR(beginword))
178 27324 : beginword += pg_mblen(beginword);
179 :
180 478818 : if (beginword - str >= lenstr)
181 226160 : return NULL;
182 :
183 252658 : *endword = beginword;
184 252658 : *charlen = 0;
185 2175334 : while (*endword - str < lenstr && ISWORDCHR(*endword))
186 : {
187 1922676 : *endword += pg_mblen(*endword);
188 1922676 : (*charlen)++;
189 : }
190 :
191 252658 : return beginword;
192 : }
193 :
194 : /*
195 : * Reduce a trigram (three possibly multi-byte characters) to a trgm,
196 : * which is always exactly three bytes. If we have three single-byte
197 : * characters, we just use them as-is; otherwise we form a hash value.
198 : */
199 : void
200 2918 : compact_trigram(trgm *tptr, char *str, int bytelen)
201 : {
202 2918 : if (bytelen == 3)
203 : {
204 2918 : CPTRGM(tptr, str);
205 : }
206 : else
207 : {
208 : pg_crc32 crc;
209 :
210 0 : INIT_LEGACY_CRC32(crc);
211 0 : COMP_LEGACY_CRC32(crc, str, bytelen);
212 0 : FIN_LEGACY_CRC32(crc);
213 :
214 : /*
215 : * use only 3 upper bytes from crc, hope, it's good enough hashing
216 : */
217 0 : CPTRGM(tptr, &crc);
218 : }
219 2918 : }
220 :
221 : /*
222 : * Adds trigrams from words (already padded).
223 : */
224 : static trgm *
225 252786 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
226 : {
227 252786 : char *ptr = str;
228 :
229 252786 : if (charlen < 3)
230 54 : return tptr;
231 :
232 252732 : if (bytelen > charlen)
233 : {
234 : /* Find multibyte character boundaries and apply compact_trigram */
235 0 : int lenfirst = pg_mblen(str),
236 0 : lenmiddle = pg_mblen(str + lenfirst),
237 0 : lenlast = pg_mblen(str + lenfirst + lenmiddle);
238 :
239 0 : while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
240 : {
241 0 : compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
242 :
243 0 : ptr += lenfirst;
244 0 : tptr++;
245 :
246 0 : lenfirst = lenmiddle;
247 0 : lenmiddle = lenlast;
248 0 : lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
249 : }
250 : }
251 : else
252 : {
253 : /* Fast path when there are no multibyte characters */
254 : Assert(bytelen == charlen);
255 :
256 2428248 : while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
257 : {
258 2175516 : CPTRGM(tptr, ptr);
259 2175516 : ptr++;
260 2175516 : tptr++;
261 : }
262 : }
263 :
264 252732 : return tptr;
265 : }
266 :
267 : /*
268 : * Make array of trigrams without sorting and removing duplicate items.
269 : *
270 : * trg: where to return the array of trigrams.
271 : * str: source string, of length slen bytes.
272 : * bounds: where to return bounds of trigrams (if needed).
273 : *
274 : * Returns length of the generated array.
275 : */
276 : static int
277 226162 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
278 : {
279 : trgm *tptr;
280 : char *buf;
281 : int charlen,
282 : bytelen;
283 : char *bword,
284 : *eword;
285 :
286 226162 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
287 2 : return 0;
288 :
289 226160 : tptr = trg;
290 :
291 : /* Allocate a buffer for case-folded, blank-padded words */
292 226160 : buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
293 :
294 : if (LPADDING > 0)
295 : {
296 226160 : *buf = ' ';
297 : if (LPADDING > 1)
298 226160 : *(buf + 1) = ' ';
299 : }
300 :
301 226160 : eword = str;
302 478818 : while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
303 : {
304 : #ifdef IGNORECASE
305 252658 : bword = lowerstr_with_len(bword, eword - bword);
306 252658 : bytelen = strlen(bword);
307 : #else
308 : bytelen = eword - bword;
309 : #endif
310 :
311 252658 : memcpy(buf + LPADDING, bword, bytelen);
312 :
313 : #ifdef IGNORECASE
314 252658 : pfree(bword);
315 : #endif
316 :
317 252658 : buf[LPADDING + bytelen] = ' ';
318 252658 : buf[LPADDING + bytelen + 1] = ' ';
319 :
320 : /* Calculate trigrams marking their bounds if needed */
321 252658 : if (bounds)
322 24794 : bounds[tptr - trg] |= TRGM_BOUND_LEFT;
323 252658 : tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
324 : charlen + LPADDING + RPADDING);
325 252658 : if (bounds)
326 24794 : bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
327 : }
328 :
329 226160 : pfree(buf);
330 :
331 226160 : return tptr - trg;
332 : }
333 :
334 : /*
335 : * Guard against possible overflow in the palloc requests below. (We
336 : * don't worry about the additive constants, since palloc can detect
337 : * requests that are a little above MaxAllocSize --- we just need to
338 : * prevent integer overflow in the multiplications.)
339 : */
340 : static void
341 202020 : protect_out_of_mem(int slen)
342 : {
343 202020 : if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
344 202020 : (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
345 0 : ereport(ERROR,
346 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
347 : errmsg("out of memory")));
348 202020 : }
349 :
350 : /*
351 : * Make array of trigrams with sorting and removing duplicate items.
352 : *
353 : * str: source string, of length slen bytes.
354 : *
355 : * Returns the sorted array of unique trigrams.
356 : */
357 : TRGM *
358 177658 : generate_trgm(char *str, int slen)
359 : {
360 : TRGM *trg;
361 : int len;
362 :
363 177658 : protect_out_of_mem(slen);
364 :
365 177658 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
366 177658 : trg->flag = ARRKEY;
367 :
368 177658 : len = generate_trgm_only(GETARR(trg), str, slen, NULL);
369 177658 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
370 :
371 177658 : if (len == 0)
372 8 : return trg;
373 :
374 : /*
375 : * Make trigrams unique.
376 : */
377 177650 : if (len > 1)
378 : {
379 177650 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
380 177650 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
381 : }
382 :
383 177650 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
384 :
385 177650 : return trg;
386 : }
387 :
388 : /*
389 : * Make array of positional trigrams from two trigram arrays trg1 and trg2.
390 : *
391 : * trg1: trigram array of search pattern, of length len1. trg1 is required
392 : * word which positions don't matter and replaced with -1.
393 : * trg2: trigram array of text, of length len2. trg2 is haystack where we
394 : * search and have to store its positions.
395 : *
396 : * Returns concatenated trigram array.
397 : */
398 : static pos_trgm *
399 24252 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
400 : {
401 : pos_trgm *result;
402 : int i,
403 24252 : len = len1 + len2;
404 :
405 24252 : result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
406 :
407 241728 : for (i = 0; i < len1; i++)
408 : {
409 217476 : memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
410 217476 : result[i].index = -1;
411 : }
412 :
413 384410 : for (i = 0; i < len2; i++)
414 : {
415 360158 : memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
416 360158 : result[i + len1].index = i;
417 : }
418 :
419 24252 : return result;
420 : }
421 :
422 : /*
423 : * Compare position trigrams: compare trigrams first and position second.
424 : */
425 : static int
426 2615370 : comp_ptrgm(const void *v1, const void *v2)
427 : {
428 2615370 : const pos_trgm *p1 = (const pos_trgm *) v1;
429 2615370 : const pos_trgm *p2 = (const pos_trgm *) v2;
430 : int cmp;
431 :
432 2615370 : cmp = CMPTRGM(p1->trg, p2->trg);
433 2615370 : if (cmp != 0)
434 2535958 : return cmp;
435 :
436 79412 : if (p1->index < p2->index)
437 42736 : return -1;
438 36676 : else if (p1->index == p2->index)
439 0 : return 0;
440 : else
441 36676 : return 1;
442 : }
443 :
444 : /*
445 : * Iterative search function which calculates maximum similarity with word in
446 : * the string. Maximum similarity is only calculated only if the flag
447 : * WORD_SIMILARITY_CHECK_ONLY isn't set.
448 : *
449 : * trg2indexes: array which stores indexes of the array "found".
450 : * found: array which stores true of false values.
451 : * ulen1: count of unique trigrams of array "trg1".
452 : * len2: length of array "trg2" and array "trg2indexes".
453 : * len: length of the array "found".
454 : * flags: set of boolean flags parameterizing similarity calculation.
455 : * bounds: whether each trigram is left/right bound of word.
456 : *
457 : * Returns word similarity.
458 : */
459 : static float4
460 24252 : iterate_word_similarity(int *trg2indexes,
461 : bool *found,
462 : int ulen1,
463 : int len2,
464 : int len,
465 : uint8 flags,
466 : TrgmBound *bounds)
467 : {
468 : int *lastpos,
469 : i,
470 24252 : ulen2 = 0,
471 24252 : count = 0,
472 24252 : upper = -1,
473 : lower;
474 : float4 smlr_cur,
475 24252 : smlr_max = 0.0f;
476 : double threshold;
477 :
478 : Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
479 :
480 : /* Select appropriate threshold */
481 48504 : threshold = (flags & WORD_SIMILARITY_STRICT) ?
482 24252 : strict_word_similarity_threshold :
483 : word_similarity_threshold;
484 :
485 : /*
486 : * Consider first trigram as initial lower bound for strict word
487 : * similarity, or initialize it later with first trigram present for plain
488 : * word similarity.
489 : */
490 24252 : lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
491 :
492 : /* Memorise last position of each trigram */
493 24252 : lastpos = (int *) palloc(sizeof(int) * len);
494 24252 : memset(lastpos, -1, sizeof(int) * len);
495 :
496 367270 : for (i = 0; i < len2; i++)
497 : {
498 : int trgindex;
499 :
500 346586 : CHECK_FOR_INTERRUPTS();
501 :
502 : /* Get index of next trigram */
503 346586 : trgindex = trg2indexes[i];
504 :
505 : /* Update last position of this trigram */
506 346586 : if (lower >= 0 || found[trgindex])
507 : {
508 271564 : if (lastpos[trgindex] < 0)
509 : {
510 267858 : ulen2++;
511 267858 : if (found[trgindex])
512 61512 : count++;
513 : }
514 271564 : lastpos[trgindex] = i;
515 : }
516 :
517 : /*
518 : * Adjust upper bound if trigram is upper bound of word for strict
519 : * word similarity, or if trigram is present in required substring for
520 : * plain word similarity
521 : */
522 500666 : if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
523 154080 : : found[trgindex])
524 : {
525 : int prev_lower,
526 : tmp_ulen2,
527 : tmp_lower,
528 : tmp_count;
529 :
530 51270 : upper = i;
531 51270 : if (lower == -1)
532 : {
533 9390 : lower = i;
534 9390 : ulen2 = 1;
535 : }
536 :
537 51270 : smlr_cur = CALCSML(count, ulen1, ulen2);
538 :
539 : /* Also try to adjust lower bound for greater similarity */
540 51270 : tmp_count = count;
541 51270 : tmp_ulen2 = ulen2;
542 51270 : prev_lower = lower;
543 417144 : for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
544 : {
545 : float smlr_tmp;
546 : int tmp_trgindex;
547 :
548 : /*
549 : * Adjust lower bound only if trigram is lower bound of word
550 : * for strict word similarity, or consider every trigram as
551 : * lower bound for plain word similarity.
552 : */
553 369442 : if (!(flags & WORD_SIMILARITY_STRICT)
554 290320 : || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
555 : {
556 119382 : smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
557 119382 : if (smlr_tmp > smlr_cur)
558 : {
559 7024 : smlr_cur = smlr_tmp;
560 7024 : ulen2 = tmp_ulen2;
561 7024 : lower = tmp_lower;
562 7024 : count = tmp_count;
563 : }
564 :
565 : /*
566 : * If we only check that word similarity is greater than
567 : * threshold we do not need to calculate a maximum
568 : * similarity.
569 : */
570 119382 : if ((flags & WORD_SIMILARITY_CHECK_ONLY)
571 74228 : && smlr_cur >= threshold)
572 3568 : break;
573 : }
574 :
575 365874 : tmp_trgindex = trg2indexes[tmp_lower];
576 365874 : if (lastpos[tmp_trgindex] == tmp_lower)
577 : {
578 361354 : tmp_ulen2--;
579 361354 : if (found[tmp_trgindex])
580 93158 : tmp_count--;
581 : }
582 : }
583 :
584 51270 : smlr_max = Max(smlr_max, smlr_cur);
585 :
586 : /*
587 : * if we only check that word similarity is greater than threshold
588 : * we do not need to calculate a maximum similarity.
589 : */
590 51270 : if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
591 3568 : break;
592 :
593 81202 : for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
594 : {
595 : int tmp_trgindex;
596 :
597 33500 : tmp_trgindex = trg2indexes[tmp_lower];
598 33500 : if (lastpos[tmp_trgindex] == tmp_lower)
599 32002 : lastpos[tmp_trgindex] = -1;
600 : }
601 : }
602 : }
603 :
604 24252 : pfree(lastpos);
605 :
606 24252 : return smlr_max;
607 : }
608 :
609 : /*
610 : * Calculate word similarity.
611 : * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
612 : * are used to calculate word similarity using iterate_word_similarity().
613 : *
614 : * "trg2indexes" is array which stores indexes of the array "found".
615 : * In other words:
616 : * trg2indexes[j] = i;
617 : * found[i] = true (or false);
618 : * If found[i] == true then there is trigram trg2[j] in array "trg1".
619 : * If found[i] == false then there is not trigram trg2[j] in array "trg1".
620 : *
621 : * str1: search pattern string, of length slen1 bytes.
622 : * str2: text in which we are looking for a word, of length slen2 bytes.
623 : * flags: set of boolean flags parameterizing similarity calculation.
624 : *
625 : * Returns word similarity.
626 : */
627 : static float4
628 24252 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
629 : uint8 flags)
630 : {
631 : bool *found;
632 : pos_trgm *ptrg;
633 : trgm *trg1;
634 : trgm *trg2;
635 : int len1,
636 : len2,
637 : len,
638 : i,
639 : j,
640 : ulen1;
641 : int *trg2indexes;
642 : float4 result;
643 : TrgmBound *bounds;
644 :
645 24252 : protect_out_of_mem(slen1 + slen2);
646 :
647 : /* Make positional trigrams */
648 24252 : trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
649 24252 : trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
650 24252 : if (flags & WORD_SIMILARITY_STRICT)
651 13324 : bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
652 : else
653 10928 : bounds = NULL;
654 :
655 24252 : len1 = generate_trgm_only(trg1, str1, slen1, NULL);
656 24252 : len2 = generate_trgm_only(trg2, str2, slen2, bounds);
657 :
658 24252 : ptrg = make_positional_trgm(trg1, len1, trg2, len2);
659 24252 : len = len1 + len2;
660 24252 : qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
661 :
662 24252 : pfree(trg1);
663 24252 : pfree(trg2);
664 :
665 : /*
666 : * Merge positional trigrams array: enumerate each trigram and find its
667 : * presence in required word.
668 : */
669 24252 : trg2indexes = (int *) palloc(sizeof(int) * len2);
670 24252 : found = (bool *) palloc0(sizeof(bool) * len);
671 :
672 24252 : ulen1 = 0;
673 24252 : j = 0;
674 601886 : for (i = 0; i < len; i++)
675 : {
676 577634 : if (i > 0)
677 : {
678 553382 : int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
679 :
680 553382 : if (cmp != 0)
681 : {
682 484980 : if (found[j])
683 202274 : ulen1++;
684 484980 : j++;
685 : }
686 : }
687 :
688 577634 : if (ptrg[i].index >= 0)
689 : {
690 360158 : trg2indexes[ptrg[i].index] = j;
691 : }
692 : else
693 : {
694 217476 : found[j] = true;
695 : }
696 : }
697 24252 : if (found[j])
698 15202 : ulen1++;
699 :
700 : /* Run iterative procedure to find maximum similarity with word */
701 24252 : result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
702 : flags, bounds);
703 :
704 24252 : pfree(trg2indexes);
705 24252 : pfree(found);
706 24252 : pfree(ptrg);
707 :
708 24252 : return result;
709 : }
710 :
711 :
712 : /*
713 : * Extract the next non-wildcard part of a search string, i.e. a word bounded
714 : * by '_' or '%' meta-characters, non-word characters or string end.
715 : *
716 : * str: source string, of length lenstr bytes (need not be null-terminated)
717 : * buf: where to return the substring (must be long enough)
718 : * *bytelen: receives byte length of the found substring
719 : * *charlen: receives character length of the found substring
720 : *
721 : * Returns pointer to end+1 of the found substring in the source string.
722 : * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
723 : *
724 : * If the found word is bounded by non-word characters or string boundaries
725 : * then this function will include corresponding padding spaces into buf.
726 : */
727 : static const char *
728 238 : get_wildcard_part(const char *str, int lenstr,
729 : char *buf, int *bytelen, int *charlen)
730 : {
731 238 : const char *beginword = str;
732 : const char *endword;
733 238 : char *s = buf;
734 238 : bool in_leading_wildcard_meta = false;
735 238 : bool in_trailing_wildcard_meta = false;
736 238 : bool in_escape = false;
737 : int clen;
738 :
739 : /*
740 : * Find the first word character, remembering whether preceding character
741 : * was wildcard meta-character. Note that the in_escape state persists
742 : * from this loop to the next one, since we may exit at a word character
743 : * that is in_escape.
744 : */
745 482 : while (beginword - str < lenstr)
746 : {
747 372 : if (in_escape)
748 : {
749 6 : if (ISWORDCHR(beginword))
750 6 : break;
751 0 : in_escape = false;
752 0 : in_leading_wildcard_meta = false;
753 : }
754 : else
755 : {
756 366 : if (ISESCAPECHAR(beginword))
757 6 : in_escape = true;
758 360 : else if (ISWILDCARDCHAR(beginword))
759 208 : in_leading_wildcard_meta = true;
760 152 : else if (ISWORDCHR(beginword))
761 122 : break;
762 : else
763 30 : in_leading_wildcard_meta = false;
764 : }
765 244 : beginword += pg_mblen(beginword);
766 : }
767 :
768 : /*
769 : * Handle string end.
770 : */
771 238 : if (beginword - str >= lenstr)
772 110 : return NULL;
773 :
774 : /*
775 : * Add left padding spaces if preceding character wasn't wildcard
776 : * meta-character.
777 : */
778 128 : *charlen = 0;
779 128 : if (!in_leading_wildcard_meta)
780 : {
781 : if (LPADDING > 0)
782 : {
783 30 : *s++ = ' ';
784 30 : (*charlen)++;
785 : if (LPADDING > 1)
786 : {
787 30 : *s++ = ' ';
788 30 : (*charlen)++;
789 : }
790 : }
791 : }
792 :
793 : /*
794 : * Copy data into buf until wildcard meta-character, non-word character or
795 : * string boundary. Strip escapes during copy.
796 : */
797 128 : endword = beginword;
798 488 : while (endword - str < lenstr)
799 : {
800 488 : clen = pg_mblen(endword);
801 488 : if (in_escape)
802 : {
803 6 : if (ISWORDCHR(endword))
804 : {
805 6 : memcpy(s, endword, clen);
806 6 : (*charlen)++;
807 6 : s += clen;
808 : }
809 : else
810 : {
811 : /*
812 : * Back up endword to the escape character when stopping at an
813 : * escaped char, so that subsequent get_wildcard_part will
814 : * restart from the escape character. We assume here that
815 : * escape chars are single-byte.
816 : */
817 0 : endword--;
818 0 : break;
819 : }
820 6 : in_escape = false;
821 : }
822 : else
823 : {
824 482 : if (ISESCAPECHAR(endword))
825 0 : in_escape = true;
826 482 : else if (ISWILDCARDCHAR(endword))
827 : {
828 110 : in_trailing_wildcard_meta = true;
829 110 : break;
830 : }
831 372 : else if (ISWORDCHR(endword))
832 : {
833 354 : memcpy(s, endword, clen);
834 354 : (*charlen)++;
835 354 : s += clen;
836 : }
837 : else
838 18 : break;
839 : }
840 360 : endword += clen;
841 : }
842 :
843 : /*
844 : * Add right padding spaces if next character isn't wildcard
845 : * meta-character.
846 : */
847 128 : if (!in_trailing_wildcard_meta)
848 : {
849 : if (RPADDING > 0)
850 : {
851 18 : *s++ = ' ';
852 18 : (*charlen)++;
853 : if (RPADDING > 1)
854 : {
855 : *s++ = ' ';
856 : (*charlen)++;
857 : }
858 : }
859 : }
860 :
861 128 : *bytelen = s - buf;
862 128 : return endword;
863 : }
864 :
865 : /*
866 : * Generates trigrams for wildcard search string.
867 : *
868 : * Returns array of trigrams that must occur in any string that matches the
869 : * wildcard string. For example, given pattern "a%bcd%" the trigrams
870 : * " a", "bcd" would be extracted.
871 : */
872 : TRGM *
873 110 : generate_wildcard_trgm(const char *str, int slen)
874 : {
875 : TRGM *trg;
876 : char *buf,
877 : *buf2;
878 : trgm *tptr;
879 : int len,
880 : charlen,
881 : bytelen;
882 : const char *eword;
883 :
884 110 : protect_out_of_mem(slen);
885 :
886 110 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
887 110 : trg->flag = ARRKEY;
888 110 : SET_VARSIZE(trg, TRGMHDRSIZE);
889 :
890 110 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
891 0 : return trg;
892 :
893 110 : tptr = GETARR(trg);
894 :
895 : /* Allocate a buffer for blank-padded, but not yet case-folded, words */
896 110 : buf = palloc(sizeof(char) * (slen + 4));
897 :
898 : /*
899 : * Extract trigrams from each substring extracted by get_wildcard_part.
900 : */
901 110 : eword = str;
902 238 : while ((eword = get_wildcard_part(eword, slen - (eword - str),
903 : buf, &bytelen, &charlen)) != NULL)
904 : {
905 : #ifdef IGNORECASE
906 128 : buf2 = lowerstr_with_len(buf, bytelen);
907 128 : bytelen = strlen(buf2);
908 : #else
909 : buf2 = buf;
910 : #endif
911 :
912 : /*
913 : * count trigrams
914 : */
915 128 : tptr = make_trigrams(tptr, buf2, bytelen, charlen);
916 :
917 : #ifdef IGNORECASE
918 128 : pfree(buf2);
919 : #endif
920 : }
921 :
922 110 : pfree(buf);
923 :
924 110 : if ((len = tptr - GETARR(trg)) == 0)
925 48 : return trg;
926 :
927 : /*
928 : * Make trigrams unique.
929 : */
930 62 : if (len > 1)
931 : {
932 34 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
933 34 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
934 : }
935 :
936 62 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
937 :
938 62 : return trg;
939 : }
940 :
941 : uint32
942 69546 : trgm2int(trgm *ptr)
943 : {
944 69546 : uint32 val = 0;
945 :
946 69546 : val |= *(((unsigned char *) ptr));
947 69546 : val <<= 8;
948 69546 : val |= *(((unsigned char *) ptr) + 1);
949 69546 : val <<= 8;
950 69546 : val |= *(((unsigned char *) ptr) + 2);
951 :
952 69546 : return val;
953 : }
954 :
955 : Datum
956 14 : show_trgm(PG_FUNCTION_ARGS)
957 : {
958 14 : text *in = PG_GETARG_TEXT_PP(0);
959 : TRGM *trg;
960 : Datum *d;
961 : ArrayType *a;
962 : trgm *ptr;
963 : int i;
964 :
965 14 : trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
966 14 : d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
967 :
968 88 : for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
969 : {
970 74 : text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
971 :
972 74 : if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
973 : {
974 0 : snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
975 0 : SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
976 : }
977 : else
978 : {
979 74 : SET_VARSIZE(item, VARHDRSZ + 3);
980 74 : CPTRGM(VARDATA(item), ptr);
981 : }
982 74 : d[i] = PointerGetDatum(item);
983 : }
984 :
985 14 : a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
986 :
987 88 : for (i = 0; i < ARRNELEM(trg); i++)
988 74 : pfree(DatumGetPointer(d[i]));
989 :
990 14 : pfree(d);
991 14 : pfree(trg);
992 14 : PG_FREE_IF_COPY(in, 0);
993 :
994 14 : PG_RETURN_POINTER(a);
995 : }
996 :
997 : float4
998 137576 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
999 : {
1000 : trgm *ptr1,
1001 : *ptr2;
1002 137576 : int count = 0;
1003 : int len1,
1004 : len2;
1005 :
1006 137576 : ptr1 = GETARR(trg1);
1007 137576 : ptr2 = GETARR(trg2);
1008 :
1009 137576 : len1 = ARRNELEM(trg1);
1010 137576 : len2 = ARRNELEM(trg2);
1011 :
1012 : /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1013 137576 : if (len1 <= 0 || len2 <= 0)
1014 2 : return (float4) 0.0;
1015 :
1016 1751214 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1017 : {
1018 1613640 : int res = CMPTRGM(ptr1, ptr2);
1019 :
1020 1613640 : if (res < 0)
1021 364714 : ptr1++;
1022 1248926 : else if (res > 0)
1023 426238 : ptr2++;
1024 : else
1025 : {
1026 822688 : ptr1++;
1027 822688 : ptr2++;
1028 822688 : count++;
1029 : }
1030 : }
1031 :
1032 : /*
1033 : * If inexact then len2 is equal to count, because we don't know actual
1034 : * length of second string in inexact search and we can assume that count
1035 : * is a lower bound of len2.
1036 : */
1037 137574 : return CALCSML(count, len1, inexact ? count : len2);
1038 : }
1039 :
1040 :
1041 : /*
1042 : * Returns whether trg2 contains all trigrams in trg1.
1043 : * This relies on the trigram arrays being sorted.
1044 : */
1045 : bool
1046 380 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
1047 : {
1048 : trgm *ptr1,
1049 : *ptr2;
1050 : int len1,
1051 : len2;
1052 :
1053 380 : ptr1 = GETARR(trg1);
1054 380 : ptr2 = GETARR(trg2);
1055 :
1056 380 : len1 = ARRNELEM(trg1);
1057 380 : len2 = ARRNELEM(trg2);
1058 :
1059 1244 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1060 : {
1061 1198 : int res = CMPTRGM(ptr1, ptr2);
1062 :
1063 1198 : if (res < 0)
1064 334 : return false;
1065 864 : else if (res > 0)
1066 640 : ptr2++;
1067 : else
1068 : {
1069 224 : ptr1++;
1070 224 : ptr2++;
1071 : }
1072 : }
1073 46 : if (ptr1 - GETARR(trg1) < len1)
1074 8 : return false;
1075 : else
1076 38 : return true;
1077 : }
1078 :
1079 : /*
1080 : * Return a palloc'd boolean array showing, for each trigram in "query",
1081 : * whether it is present in the trigram array "key".
1082 : * This relies on the "key" array being sorted, but "query" need not be.
1083 : */
1084 : bool *
1085 4300 : trgm_presence_map(TRGM *query, TRGM *key)
1086 : {
1087 : bool *result;
1088 4300 : trgm *ptrq = GETARR(query),
1089 4300 : *ptrk = GETARR(key);
1090 4300 : int lenq = ARRNELEM(query),
1091 4300 : lenk = ARRNELEM(key),
1092 : i;
1093 :
1094 4300 : result = (bool *) palloc0(lenq * sizeof(bool));
1095 :
1096 : /* for each query trigram, do a binary search in the key array */
1097 1015120 : for (i = 0; i < lenq; i++)
1098 : {
1099 1010820 : int lo = 0;
1100 1010820 : int hi = lenk;
1101 :
1102 4747306 : while (lo < hi)
1103 : {
1104 3752564 : int mid = (lo + hi) / 2;
1105 3752564 : int res = CMPTRGM(ptrq, ptrk + mid);
1106 :
1107 3752564 : if (res < 0)
1108 1568164 : hi = mid;
1109 2184400 : else if (res > 0)
1110 2168322 : lo = mid + 1;
1111 : else
1112 : {
1113 16078 : result[i] = true;
1114 16078 : break;
1115 : }
1116 : }
1117 1010820 : ptrq++;
1118 : }
1119 :
1120 4300 : return result;
1121 : }
1122 :
1123 : Datum
1124 62904 : similarity(PG_FUNCTION_ARGS)
1125 : {
1126 62904 : text *in1 = PG_GETARG_TEXT_PP(0);
1127 62904 : text *in2 = PG_GETARG_TEXT_PP(1);
1128 : TRGM *trg1,
1129 : *trg2;
1130 : float4 res;
1131 :
1132 62904 : trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1133 62904 : trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1134 :
1135 62904 : res = cnt_sml(trg1, trg2, false);
1136 :
1137 62904 : pfree(trg1);
1138 62904 : pfree(trg2);
1139 62904 : PG_FREE_IF_COPY(in1, 0);
1140 62904 : PG_FREE_IF_COPY(in2, 1);
1141 :
1142 62904 : PG_RETURN_FLOAT4(res);
1143 : }
1144 :
1145 : Datum
1146 1804 : word_similarity(PG_FUNCTION_ARGS)
1147 : {
1148 1804 : text *in1 = PG_GETARG_TEXT_PP(0);
1149 1804 : text *in2 = PG_GETARG_TEXT_PP(1);
1150 : float4 res;
1151 :
1152 3608 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1153 3608 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1154 : 0);
1155 :
1156 1804 : PG_FREE_IF_COPY(in1, 0);
1157 1804 : PG_FREE_IF_COPY(in2, 1);
1158 1804 : PG_RETURN_FLOAT4(res);
1159 : }
1160 :
1161 : Datum
1162 1764 : strict_word_similarity(PG_FUNCTION_ARGS)
1163 : {
1164 1764 : text *in1 = PG_GETARG_TEXT_PP(0);
1165 1764 : text *in2 = PG_GETARG_TEXT_PP(1);
1166 : float4 res;
1167 :
1168 3528 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1169 3528 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1170 : WORD_SIMILARITY_STRICT);
1171 :
1172 1764 : PG_FREE_IF_COPY(in1, 0);
1173 1764 : PG_FREE_IF_COPY(in2, 1);
1174 1764 : PG_RETURN_FLOAT4(res);
1175 : }
1176 :
1177 : Datum
1178 2008 : similarity_dist(PG_FUNCTION_ARGS)
1179 : {
1180 2008 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1181 : PG_GETARG_DATUM(0),
1182 : PG_GETARG_DATUM(1)));
1183 :
1184 2008 : PG_RETURN_FLOAT4(1.0 - res);
1185 : }
1186 :
1187 : Datum
1188 12000 : similarity_op(PG_FUNCTION_ARGS)
1189 : {
1190 12000 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1191 : PG_GETARG_DATUM(0),
1192 : PG_GETARG_DATUM(1)));
1193 :
1194 12000 : PG_RETURN_BOOL(res >= similarity_threshold);
1195 : }
1196 :
1197 : Datum
1198 3848 : word_similarity_op(PG_FUNCTION_ARGS)
1199 : {
1200 3848 : text *in1 = PG_GETARG_TEXT_PP(0);
1201 3848 : text *in2 = PG_GETARG_TEXT_PP(1);
1202 : float4 res;
1203 :
1204 7696 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1205 7696 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1206 : WORD_SIMILARITY_CHECK_ONLY);
1207 :
1208 3848 : PG_FREE_IF_COPY(in1, 0);
1209 3848 : PG_FREE_IF_COPY(in2, 1);
1210 3848 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1211 : }
1212 :
1213 : Datum
1214 3848 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
1215 : {
1216 3848 : text *in1 = PG_GETARG_TEXT_PP(0);
1217 3848 : text *in2 = PG_GETARG_TEXT_PP(1);
1218 : float4 res;
1219 :
1220 7696 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1221 7696 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1222 : WORD_SIMILARITY_CHECK_ONLY);
1223 :
1224 3848 : PG_FREE_IF_COPY(in1, 0);
1225 3848 : PG_FREE_IF_COPY(in2, 1);
1226 3848 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1227 : }
1228 :
1229 : Datum
1230 0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
1231 : {
1232 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1233 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1234 : float4 res;
1235 :
1236 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1237 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1238 : 0);
1239 :
1240 0 : PG_FREE_IF_COPY(in1, 0);
1241 0 : PG_FREE_IF_COPY(in2, 1);
1242 0 : PG_RETURN_FLOAT4(1.0 - res);
1243 : }
1244 :
1245 : Datum
1246 1428 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1247 : {
1248 1428 : text *in1 = PG_GETARG_TEXT_PP(0);
1249 1428 : text *in2 = PG_GETARG_TEXT_PP(1);
1250 : float4 res;
1251 :
1252 2856 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1253 2856 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1254 : 0);
1255 :
1256 1428 : PG_FREE_IF_COPY(in1, 0);
1257 1428 : PG_FREE_IF_COPY(in2, 1);
1258 1428 : PG_RETURN_FLOAT4(1.0 - res);
1259 : }
1260 :
1261 : Datum
1262 5060 : strict_word_similarity_op(PG_FUNCTION_ARGS)
1263 : {
1264 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1265 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1266 : float4 res;
1267 :
1268 10120 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1269 10120 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1270 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1271 :
1272 5060 : PG_FREE_IF_COPY(in1, 0);
1273 5060 : PG_FREE_IF_COPY(in2, 1);
1274 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1275 : }
1276 :
1277 : Datum
1278 5060 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1279 : {
1280 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1281 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1282 : float4 res;
1283 :
1284 10120 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1285 10120 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1286 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1287 :
1288 5060 : PG_FREE_IF_COPY(in1, 0);
1289 5060 : PG_FREE_IF_COPY(in2, 1);
1290 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1291 : }
1292 :
1293 : Datum
1294 0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1295 : {
1296 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1297 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1298 : float4 res;
1299 :
1300 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1301 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1302 : WORD_SIMILARITY_STRICT);
1303 :
1304 0 : PG_FREE_IF_COPY(in1, 0);
1305 0 : PG_FREE_IF_COPY(in2, 1);
1306 0 : PG_RETURN_FLOAT4(1.0 - res);
1307 : }
1308 :
1309 : Datum
1310 1440 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1311 : {
1312 1440 : text *in1 = PG_GETARG_TEXT_PP(0);
1313 1440 : text *in2 = PG_GETARG_TEXT_PP(1);
1314 : float4 res;
1315 :
1316 2880 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1317 2880 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1318 : WORD_SIMILARITY_STRICT);
1319 :
1320 1440 : PG_FREE_IF_COPY(in1, 0);
1321 1440 : PG_FREE_IF_COPY(in2, 1);
1322 1440 : PG_RETURN_FLOAT4(1.0 - res);
1323 : }
|