Line data Source code
1 : /*
2 : * contrib/pg_trgm/trgm_op.c
3 : */
4 : #include "postgres.h"
5 :
6 : #include <ctype.h>
7 :
8 : #include "catalog/pg_type.h"
9 : #include "common/int.h"
10 : #include "lib/qunique.h"
11 : #include "miscadmin.h"
12 : #include "trgm.h"
13 : #include "tsearch/ts_locale.h"
14 : #include "utils/guc.h"
15 : #include "utils/lsyscache.h"
16 : #include "utils/memutils.h"
17 : #include "utils/pg_crc.h"
18 :
19 6 : PG_MODULE_MAGIC;
20 :
21 : /* GUC variables */
22 : double similarity_threshold = 0.3f;
23 : double word_similarity_threshold = 0.6f;
24 : double strict_word_similarity_threshold = 0.5f;
25 :
26 4 : PG_FUNCTION_INFO_V1(set_limit);
27 4 : PG_FUNCTION_INFO_V1(show_limit);
28 4 : PG_FUNCTION_INFO_V1(show_trgm);
29 4 : PG_FUNCTION_INFO_V1(similarity);
30 4 : PG_FUNCTION_INFO_V1(word_similarity);
31 4 : PG_FUNCTION_INFO_V1(strict_word_similarity);
32 4 : PG_FUNCTION_INFO_V1(similarity_dist);
33 4 : PG_FUNCTION_INFO_V1(similarity_op);
34 4 : PG_FUNCTION_INFO_V1(word_similarity_op);
35 4 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
36 2 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
37 4 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
38 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
39 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
40 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
41 4 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
42 :
43 : /* Trigram with position */
44 : typedef struct
45 : {
46 : trgm trg;
47 : int index;
48 : } pos_trgm;
49 :
50 : /* Trigram bound type */
51 : typedef uint8 TrgmBound;
52 : #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
53 : #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
54 :
55 : /* Word similarity flags */
56 : #define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar
57 : * search pattern in text */
58 : #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
59 : * word bounds */
60 :
61 : /*
62 : * Module load callback
63 : */
64 : void
65 6 : _PG_init(void)
66 : {
67 : /* Define custom GUC variables. */
68 6 : DefineCustomRealVariable("pg_trgm.similarity_threshold",
69 : "Sets the threshold used by the % operator.",
70 : "Valid range is 0.0 .. 1.0.",
71 : &similarity_threshold,
72 : 0.3f,
73 : 0.0,
74 : 1.0,
75 : PGC_USERSET,
76 : 0,
77 : NULL,
78 : NULL,
79 : NULL);
80 6 : DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
81 : "Sets the threshold used by the <% operator.",
82 : "Valid range is 0.0 .. 1.0.",
83 : &word_similarity_threshold,
84 : 0.6f,
85 : 0.0,
86 : 1.0,
87 : PGC_USERSET,
88 : 0,
89 : NULL,
90 : NULL,
91 : NULL);
92 6 : DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
93 : "Sets the threshold used by the <<% operator.",
94 : "Valid range is 0.0 .. 1.0.",
95 : &strict_word_similarity_threshold,
96 : 0.5f,
97 : 0.0,
98 : 1.0,
99 : PGC_USERSET,
100 : 0,
101 : NULL,
102 : NULL,
103 : NULL);
104 :
105 6 : MarkGUCPrefixReserved("pg_trgm");
106 6 : }
107 :
108 : /*
109 : * Deprecated function.
110 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
111 : */
112 : Datum
113 4 : set_limit(PG_FUNCTION_ARGS)
114 : {
115 4 : float4 nlimit = PG_GETARG_FLOAT4(0);
116 : char *nlimit_str;
117 : Oid func_out_oid;
118 : bool is_varlena;
119 :
120 4 : getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
121 :
122 4 : nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
123 :
124 4 : SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
125 : PGC_USERSET, PGC_S_SESSION);
126 :
127 4 : PG_RETURN_FLOAT4(similarity_threshold);
128 : }
129 :
130 :
131 : /*
132 : * Get similarity threshold for given index scan strategy number.
133 : */
134 : double
135 86802 : index_strategy_get_limit(StrategyNumber strategy)
136 : {
137 86802 : switch (strategy)
138 : {
139 65202 : case SimilarityStrategyNumber:
140 65202 : return similarity_threshold;
141 9644 : case WordSimilarityStrategyNumber:
142 9644 : return word_similarity_threshold;
143 11956 : case StrictWordSimilarityStrategyNumber:
144 11956 : return strict_word_similarity_threshold;
145 0 : default:
146 0 : elog(ERROR, "unrecognized strategy number: %d", strategy);
147 : break;
148 : }
149 :
150 : return 0.0; /* keep compiler quiet */
151 : }
152 :
153 : /*
154 : * Deprecated function.
155 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
156 : */
157 : Datum
158 40000 : show_limit(PG_FUNCTION_ARGS)
159 : {
160 40000 : PG_RETURN_FLOAT4(similarity_threshold);
161 : }
162 :
163 : static int
164 6374440 : comp_trgm(const void *a, const void *b)
165 : {
166 6374440 : return CMPTRGM(a, b);
167 : }
168 :
169 : /*
170 : * Finds first word in string, returns pointer to the word,
171 : * endword points to the character after word
172 : */
173 : static char *
174 482820 : find_word(char *str, int lenstr, char **endword, int *charlen)
175 : {
176 482820 : char *beginword = str;
177 :
178 510146 : while (beginword - str < lenstr && !ISWORDCHR(beginword))
179 27326 : beginword += pg_mblen(beginword);
180 :
181 482820 : if (beginword - str >= lenstr)
182 228160 : return NULL;
183 :
184 254660 : *endword = beginword;
185 254660 : *charlen = 0;
186 2199344 : while (*endword - str < lenstr && ISWORDCHR(*endword))
187 : {
188 1944684 : *endword += pg_mblen(*endword);
189 1944684 : (*charlen)++;
190 : }
191 :
192 254660 : return beginword;
193 : }
194 :
195 : /*
196 : * Reduce a trigram (three possibly multi-byte characters) to a trgm,
197 : * which is always exactly three bytes. If we have three single-byte
198 : * characters, we just use them as-is; otherwise we form a hash value.
199 : */
200 : void
201 2918 : compact_trigram(trgm *tptr, char *str, int bytelen)
202 : {
203 2918 : if (bytelen == 3)
204 : {
205 2918 : CPTRGM(tptr, str);
206 : }
207 : else
208 : {
209 : pg_crc32 crc;
210 :
211 0 : INIT_LEGACY_CRC32(crc);
212 0 : COMP_LEGACY_CRC32(crc, str, bytelen);
213 0 : FIN_LEGACY_CRC32(crc);
214 :
215 : /*
216 : * use only 3 upper bytes from crc, hope, it's good enough hashing
217 : */
218 0 : CPTRGM(tptr, &crc);
219 : }
220 2918 : }
221 :
222 : /*
223 : * Adds trigrams from words (already padded).
224 : */
225 : static trgm *
226 254788 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
227 : {
228 254788 : char *ptr = str;
229 :
230 254788 : if (charlen < 3)
231 54 : return tptr;
232 :
233 254734 : if (bytelen > charlen)
234 : {
235 : /* Find multibyte character boundaries and apply compact_trigram */
236 0 : int lenfirst = pg_mblen(str),
237 0 : lenmiddle = pg_mblen(str + lenfirst),
238 0 : lenlast = pg_mblen(str + lenfirst + lenmiddle);
239 :
240 0 : while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
241 : {
242 0 : compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
243 :
244 0 : ptr += lenfirst;
245 0 : tptr++;
246 :
247 0 : lenfirst = lenmiddle;
248 0 : lenmiddle = lenlast;
249 0 : lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
250 : }
251 : }
252 : else
253 : {
254 : /* Fast path when there are no multibyte characters */
255 : Assert(bytelen == charlen);
256 :
257 2454260 : while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
258 : {
259 2199526 : CPTRGM(tptr, ptr);
260 2199526 : ptr++;
261 2199526 : tptr++;
262 : }
263 : }
264 :
265 254734 : return tptr;
266 : }
267 :
268 : /*
269 : * Make array of trigrams without sorting and removing duplicate items.
270 : *
271 : * trg: where to return the array of trigrams.
272 : * str: source string, of length slen bytes.
273 : * bounds: where to return bounds of trigrams (if needed).
274 : *
275 : * Returns length of the generated array.
276 : */
277 : static int
278 228162 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
279 : {
280 : trgm *tptr;
281 : char *buf;
282 : int charlen,
283 : bytelen;
284 : char *bword,
285 : *eword;
286 :
287 228162 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
288 2 : return 0;
289 :
290 228160 : tptr = trg;
291 :
292 : /* Allocate a buffer for case-folded, blank-padded words */
293 228160 : buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
294 :
295 : if (LPADDING > 0)
296 : {
297 228160 : *buf = ' ';
298 : if (LPADDING > 1)
299 228160 : *(buf + 1) = ' ';
300 : }
301 :
302 228160 : eword = str;
303 482820 : while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
304 : {
305 : #ifdef IGNORECASE
306 254660 : bword = lowerstr_with_len(bword, eword - bword);
307 254660 : bytelen = strlen(bword);
308 : #else
309 : bytelen = eword - bword;
310 : #endif
311 :
312 254660 : memcpy(buf + LPADDING, bword, bytelen);
313 :
314 : #ifdef IGNORECASE
315 254660 : pfree(bword);
316 : #endif
317 :
318 254660 : buf[LPADDING + bytelen] = ' ';
319 254660 : buf[LPADDING + bytelen + 1] = ' ';
320 :
321 : /* Calculate trigrams marking their bounds if needed */
322 254660 : if (bounds)
323 24796 : bounds[tptr - trg] |= TRGM_BOUND_LEFT;
324 254660 : tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
325 : charlen + LPADDING + RPADDING);
326 254660 : if (bounds)
327 24796 : bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
328 : }
329 :
330 228160 : pfree(buf);
331 :
332 228160 : return tptr - trg;
333 : }
334 :
335 : /*
336 : * Guard against possible overflow in the palloc requests below. (We
337 : * don't worry about the additive constants, since palloc can detect
338 : * requests that are a little above MaxAllocSize --- we just need to
339 : * prevent integer overflow in the multiplications.)
340 : */
341 : static void
342 204020 : protect_out_of_mem(int slen)
343 : {
344 204020 : if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
345 204020 : (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
346 0 : ereport(ERROR,
347 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
348 : errmsg("out of memory")));
349 204020 : }
350 :
351 : /*
352 : * Make array of trigrams with sorting and removing duplicate items.
353 : *
354 : * str: source string, of length slen bytes.
355 : *
356 : * Returns the sorted array of unique trigrams.
357 : */
358 : TRGM *
359 179658 : generate_trgm(char *str, int slen)
360 : {
361 : TRGM *trg;
362 : int len;
363 :
364 179658 : protect_out_of_mem(slen);
365 :
366 179658 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
367 179658 : trg->flag = ARRKEY;
368 :
369 179658 : len = generate_trgm_only(GETARR(trg), str, slen, NULL);
370 179658 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
371 :
372 179658 : if (len == 0)
373 8 : return trg;
374 :
375 : /*
376 : * Make trigrams unique.
377 : */
378 179650 : if (len > 1)
379 : {
380 179650 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
381 179650 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
382 : }
383 :
384 179650 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
385 :
386 179650 : return trg;
387 : }
388 :
389 : /*
390 : * Make array of positional trigrams from two trigram arrays trg1 and trg2.
391 : *
392 : * trg1: trigram array of search pattern, of length len1. trg1 is required
393 : * word which positions don't matter and replaced with -1.
394 : * trg2: trigram array of text, of length len2. trg2 is haystack where we
395 : * search and have to store its positions.
396 : *
397 : * Returns concatenated trigram array.
398 : */
399 : static pos_trgm *
400 24252 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
401 : {
402 : pos_trgm *result;
403 : int i,
404 24252 : len = len1 + len2;
405 :
406 24252 : result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
407 :
408 241728 : for (i = 0; i < len1; i++)
409 : {
410 217476 : memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
411 217476 : result[i].index = -1;
412 : }
413 :
414 384420 : for (i = 0; i < len2; i++)
415 : {
416 360168 : memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
417 360168 : result[i + len1].index = i;
418 : }
419 :
420 24252 : return result;
421 : }
422 :
423 : /*
424 : * Compare position trigrams: compare trigrams first and position second.
425 : */
426 : static int
427 2615350 : comp_ptrgm(const void *v1, const void *v2)
428 : {
429 2615350 : const pos_trgm *p1 = (const pos_trgm *) v1;
430 2615350 : const pos_trgm *p2 = (const pos_trgm *) v2;
431 : int cmp;
432 :
433 2615350 : cmp = CMPTRGM(p1->trg, p2->trg);
434 2615350 : if (cmp != 0)
435 2535942 : return cmp;
436 :
437 79408 : return pg_cmp_s32(p1->index, p2->index);
438 : }
439 :
440 : /*
441 : * Iterative search function which calculates maximum similarity with word in
442 : * the string. Maximum similarity is only calculated only if the flag
443 : * WORD_SIMILARITY_CHECK_ONLY isn't set.
444 : *
445 : * trg2indexes: array which stores indexes of the array "found".
446 : * found: array which stores true of false values.
447 : * ulen1: count of unique trigrams of array "trg1".
448 : * len2: length of array "trg2" and array "trg2indexes".
449 : * len: length of the array "found".
450 : * flags: set of boolean flags parameterizing similarity calculation.
451 : * bounds: whether each trigram is left/right bound of word.
452 : *
453 : * Returns word similarity.
454 : */
455 : static float4
456 24252 : iterate_word_similarity(int *trg2indexes,
457 : bool *found,
458 : int ulen1,
459 : int len2,
460 : int len,
461 : uint8 flags,
462 : TrgmBound *bounds)
463 : {
464 : int *lastpos,
465 : i,
466 24252 : ulen2 = 0,
467 24252 : count = 0,
468 24252 : upper = -1,
469 : lower;
470 : float4 smlr_cur,
471 24252 : smlr_max = 0.0f;
472 : double threshold;
473 :
474 : Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
475 :
476 : /* Select appropriate threshold */
477 48504 : threshold = (flags & WORD_SIMILARITY_STRICT) ?
478 24252 : strict_word_similarity_threshold :
479 : word_similarity_threshold;
480 :
481 : /*
482 : * Consider first trigram as initial lower bound for strict word
483 : * similarity, or initialize it later with first trigram present for plain
484 : * word similarity.
485 : */
486 24252 : lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
487 :
488 : /* Memorise last position of each trigram */
489 24252 : lastpos = (int *) palloc(sizeof(int) * len);
490 24252 : memset(lastpos, -1, sizeof(int) * len);
491 :
492 367280 : for (i = 0; i < len2; i++)
493 : {
494 : int trgindex;
495 :
496 346596 : CHECK_FOR_INTERRUPTS();
497 :
498 : /* Get index of next trigram */
499 346596 : trgindex = trg2indexes[i];
500 :
501 : /* Update last position of this trigram */
502 346596 : if (lower >= 0 || found[trgindex])
503 : {
504 271574 : if (lastpos[trgindex] < 0)
505 : {
506 267870 : ulen2++;
507 267870 : if (found[trgindex])
508 61512 : count++;
509 : }
510 271574 : lastpos[trgindex] = i;
511 : }
512 :
513 : /*
514 : * Adjust upper bound if trigram is upper bound of word for strict
515 : * word similarity, or if trigram is present in required substring for
516 : * plain word similarity
517 : */
518 500672 : if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
519 154076 : : found[trgindex])
520 : {
521 : int prev_lower,
522 : tmp_ulen2,
523 : tmp_lower,
524 : tmp_count;
525 :
526 51270 : upper = i;
527 51270 : if (lower == -1)
528 : {
529 9390 : lower = i;
530 9390 : ulen2 = 1;
531 : }
532 :
533 51270 : smlr_cur = CALCSML(count, ulen1, ulen2);
534 :
535 : /* Also try to adjust lower bound for greater similarity */
536 51270 : tmp_count = count;
537 51270 : tmp_ulen2 = ulen2;
538 51270 : prev_lower = lower;
539 417160 : for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
540 : {
541 : float smlr_tmp;
542 : int tmp_trgindex;
543 :
544 : /*
545 : * Adjust lower bound only if trigram is lower bound of word
546 : * for strict word similarity, or consider every trigram as
547 : * lower bound for plain word similarity.
548 : */
549 369458 : if (!(flags & WORD_SIMILARITY_STRICT)
550 290350 : || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
551 : {
552 119372 : smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
553 119372 : if (smlr_tmp > smlr_cur)
554 : {
555 7024 : smlr_cur = smlr_tmp;
556 7024 : ulen2 = tmp_ulen2;
557 7024 : lower = tmp_lower;
558 7024 : count = tmp_count;
559 : }
560 :
561 : /*
562 : * If we only check that word similarity is greater than
563 : * threshold we do not need to calculate a maximum
564 : * similarity.
565 : */
566 119372 : if ((flags & WORD_SIMILARITY_CHECK_ONLY)
567 74228 : && smlr_cur >= threshold)
568 3568 : break;
569 : }
570 :
571 365890 : tmp_trgindex = trg2indexes[tmp_lower];
572 365890 : if (lastpos[tmp_trgindex] == tmp_lower)
573 : {
574 361372 : tmp_ulen2--;
575 361372 : if (found[tmp_trgindex])
576 93152 : tmp_count--;
577 : }
578 : }
579 :
580 51270 : smlr_max = Max(smlr_max, smlr_cur);
581 :
582 : /*
583 : * if we only check that word similarity is greater than threshold
584 : * we do not need to calculate a maximum similarity.
585 : */
586 51270 : if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
587 3568 : break;
588 :
589 81214 : for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
590 : {
591 : int tmp_trgindex;
592 :
593 33512 : tmp_trgindex = trg2indexes[tmp_lower];
594 33512 : if (lastpos[tmp_trgindex] == tmp_lower)
595 32016 : lastpos[tmp_trgindex] = -1;
596 : }
597 : }
598 : }
599 :
600 24252 : pfree(lastpos);
601 :
602 24252 : return smlr_max;
603 : }
604 :
605 : /*
606 : * Calculate word similarity.
607 : * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
608 : * are used to calculate word similarity using iterate_word_similarity().
609 : *
610 : * "trg2indexes" is array which stores indexes of the array "found".
611 : * In other words:
612 : * trg2indexes[j] = i;
613 : * found[i] = true (or false);
614 : * If found[i] == true then there is trigram trg2[j] in array "trg1".
615 : * If found[i] == false then there is not trigram trg2[j] in array "trg1".
616 : *
617 : * str1: search pattern string, of length slen1 bytes.
618 : * str2: text in which we are looking for a word, of length slen2 bytes.
619 : * flags: set of boolean flags parameterizing similarity calculation.
620 : *
621 : * Returns word similarity.
622 : */
623 : static float4
624 24252 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
625 : uint8 flags)
626 : {
627 : bool *found;
628 : pos_trgm *ptrg;
629 : trgm *trg1;
630 : trgm *trg2;
631 : int len1,
632 : len2,
633 : len,
634 : i,
635 : j,
636 : ulen1;
637 : int *trg2indexes;
638 : float4 result;
639 : TrgmBound *bounds;
640 :
641 24252 : protect_out_of_mem(slen1 + slen2);
642 :
643 : /* Make positional trigrams */
644 24252 : trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
645 24252 : trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
646 24252 : if (flags & WORD_SIMILARITY_STRICT)
647 13324 : bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
648 : else
649 10928 : bounds = NULL;
650 :
651 24252 : len1 = generate_trgm_only(trg1, str1, slen1, NULL);
652 24252 : len2 = generate_trgm_only(trg2, str2, slen2, bounds);
653 :
654 24252 : ptrg = make_positional_trgm(trg1, len1, trg2, len2);
655 24252 : len = len1 + len2;
656 24252 : qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
657 :
658 24252 : pfree(trg1);
659 24252 : pfree(trg2);
660 :
661 : /*
662 : * Merge positional trigrams array: enumerate each trigram and find its
663 : * presence in required word.
664 : */
665 24252 : trg2indexes = (int *) palloc(sizeof(int) * len2);
666 24252 : found = (bool *) palloc0(sizeof(bool) * len);
667 :
668 24252 : ulen1 = 0;
669 24252 : j = 0;
670 601896 : for (i = 0; i < len; i++)
671 : {
672 577644 : if (i > 0)
673 : {
674 553392 : int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
675 :
676 553392 : if (cmp != 0)
677 : {
678 484992 : if (found[j])
679 202274 : ulen1++;
680 484992 : j++;
681 : }
682 : }
683 :
684 577644 : if (ptrg[i].index >= 0)
685 : {
686 360168 : trg2indexes[ptrg[i].index] = j;
687 : }
688 : else
689 : {
690 217476 : found[j] = true;
691 : }
692 : }
693 24252 : if (found[j])
694 15202 : ulen1++;
695 :
696 : /* Run iterative procedure to find maximum similarity with word */
697 24252 : result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
698 : flags, bounds);
699 :
700 24252 : pfree(trg2indexes);
701 24252 : pfree(found);
702 24252 : pfree(ptrg);
703 :
704 24252 : return result;
705 : }
706 :
707 :
708 : /*
709 : * Extract the next non-wildcard part of a search string, i.e. a word bounded
710 : * by '_' or '%' meta-characters, non-word characters or string end.
711 : *
712 : * str: source string, of length lenstr bytes (need not be null-terminated)
713 : * buf: where to return the substring (must be long enough)
714 : * *bytelen: receives byte length of the found substring
715 : * *charlen: receives character length of the found substring
716 : *
717 : * Returns pointer to end+1 of the found substring in the source string.
718 : * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
719 : *
720 : * If the found word is bounded by non-word characters or string boundaries
721 : * then this function will include corresponding padding spaces into buf.
722 : */
723 : static const char *
724 238 : get_wildcard_part(const char *str, int lenstr,
725 : char *buf, int *bytelen, int *charlen)
726 : {
727 238 : const char *beginword = str;
728 : const char *endword;
729 238 : char *s = buf;
730 238 : bool in_leading_wildcard_meta = false;
731 238 : bool in_trailing_wildcard_meta = false;
732 238 : bool in_escape = false;
733 : int clen;
734 :
735 : /*
736 : * Find the first word character, remembering whether preceding character
737 : * was wildcard meta-character. Note that the in_escape state persists
738 : * from this loop to the next one, since we may exit at a word character
739 : * that is in_escape.
740 : */
741 482 : while (beginword - str < lenstr)
742 : {
743 372 : if (in_escape)
744 : {
745 6 : if (ISWORDCHR(beginword))
746 6 : break;
747 0 : in_escape = false;
748 0 : in_leading_wildcard_meta = false;
749 : }
750 : else
751 : {
752 366 : if (ISESCAPECHAR(beginword))
753 6 : in_escape = true;
754 360 : else if (ISWILDCARDCHAR(beginword))
755 208 : in_leading_wildcard_meta = true;
756 152 : else if (ISWORDCHR(beginword))
757 122 : break;
758 : else
759 30 : in_leading_wildcard_meta = false;
760 : }
761 244 : beginword += pg_mblen(beginword);
762 : }
763 :
764 : /*
765 : * Handle string end.
766 : */
767 238 : if (beginword - str >= lenstr)
768 110 : return NULL;
769 :
770 : /*
771 : * Add left padding spaces if preceding character wasn't wildcard
772 : * meta-character.
773 : */
774 128 : *charlen = 0;
775 128 : if (!in_leading_wildcard_meta)
776 : {
777 : if (LPADDING > 0)
778 : {
779 30 : *s++ = ' ';
780 30 : (*charlen)++;
781 : if (LPADDING > 1)
782 : {
783 30 : *s++ = ' ';
784 30 : (*charlen)++;
785 : }
786 : }
787 : }
788 :
789 : /*
790 : * Copy data into buf until wildcard meta-character, non-word character or
791 : * string boundary. Strip escapes during copy.
792 : */
793 128 : endword = beginword;
794 488 : while (endword - str < lenstr)
795 : {
796 488 : clen = pg_mblen(endword);
797 488 : if (in_escape)
798 : {
799 6 : if (ISWORDCHR(endword))
800 : {
801 6 : memcpy(s, endword, clen);
802 6 : (*charlen)++;
803 6 : s += clen;
804 : }
805 : else
806 : {
807 : /*
808 : * Back up endword to the escape character when stopping at an
809 : * escaped char, so that subsequent get_wildcard_part will
810 : * restart from the escape character. We assume here that
811 : * escape chars are single-byte.
812 : */
813 0 : endword--;
814 0 : break;
815 : }
816 6 : in_escape = false;
817 : }
818 : else
819 : {
820 482 : if (ISESCAPECHAR(endword))
821 0 : in_escape = true;
822 482 : else if (ISWILDCARDCHAR(endword))
823 : {
824 110 : in_trailing_wildcard_meta = true;
825 110 : break;
826 : }
827 372 : else if (ISWORDCHR(endword))
828 : {
829 354 : memcpy(s, endword, clen);
830 354 : (*charlen)++;
831 354 : s += clen;
832 : }
833 : else
834 18 : break;
835 : }
836 360 : endword += clen;
837 : }
838 :
839 : /*
840 : * Add right padding spaces if next character isn't wildcard
841 : * meta-character.
842 : */
843 128 : if (!in_trailing_wildcard_meta)
844 : {
845 : if (RPADDING > 0)
846 : {
847 18 : *s++ = ' ';
848 18 : (*charlen)++;
849 : if (RPADDING > 1)
850 : {
851 : *s++ = ' ';
852 : (*charlen)++;
853 : }
854 : }
855 : }
856 :
857 128 : *bytelen = s - buf;
858 128 : return endword;
859 : }
860 :
861 : /*
862 : * Generates trigrams for wildcard search string.
863 : *
864 : * Returns array of trigrams that must occur in any string that matches the
865 : * wildcard string. For example, given pattern "a%bcd%" the trigrams
866 : * " a", "bcd" would be extracted.
867 : */
868 : TRGM *
869 110 : generate_wildcard_trgm(const char *str, int slen)
870 : {
871 : TRGM *trg;
872 : char *buf,
873 : *buf2;
874 : trgm *tptr;
875 : int len,
876 : charlen,
877 : bytelen;
878 : const char *eword;
879 :
880 110 : protect_out_of_mem(slen);
881 :
882 110 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
883 110 : trg->flag = ARRKEY;
884 110 : SET_VARSIZE(trg, TRGMHDRSIZE);
885 :
886 110 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
887 0 : return trg;
888 :
889 110 : tptr = GETARR(trg);
890 :
891 : /* Allocate a buffer for blank-padded, but not yet case-folded, words */
892 110 : buf = palloc(sizeof(char) * (slen + 4));
893 :
894 : /*
895 : * Extract trigrams from each substring extracted by get_wildcard_part.
896 : */
897 110 : eword = str;
898 238 : while ((eword = get_wildcard_part(eword, slen - (eword - str),
899 : buf, &bytelen, &charlen)) != NULL)
900 : {
901 : #ifdef IGNORECASE
902 128 : buf2 = lowerstr_with_len(buf, bytelen);
903 128 : bytelen = strlen(buf2);
904 : #else
905 : buf2 = buf;
906 : #endif
907 :
908 : /*
909 : * count trigrams
910 : */
911 128 : tptr = make_trigrams(tptr, buf2, bytelen, charlen);
912 :
913 : #ifdef IGNORECASE
914 128 : pfree(buf2);
915 : #endif
916 : }
917 :
918 110 : pfree(buf);
919 :
920 110 : if ((len = tptr - GETARR(trg)) == 0)
921 48 : return trg;
922 :
923 : /*
924 : * Make trigrams unique.
925 : */
926 62 : if (len > 1)
927 : {
928 34 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
929 34 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
930 : }
931 :
932 62 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
933 :
934 62 : return trg;
935 : }
936 :
937 : uint32
938 69546 : trgm2int(trgm *ptr)
939 : {
940 69546 : uint32 val = 0;
941 :
942 69546 : val |= *(((unsigned char *) ptr));
943 69546 : val <<= 8;
944 69546 : val |= *(((unsigned char *) ptr) + 1);
945 69546 : val <<= 8;
946 69546 : val |= *(((unsigned char *) ptr) + 2);
947 :
948 69546 : return val;
949 : }
950 :
951 : Datum
952 14 : show_trgm(PG_FUNCTION_ARGS)
953 : {
954 14 : text *in = PG_GETARG_TEXT_PP(0);
955 : TRGM *trg;
956 : Datum *d;
957 : ArrayType *a;
958 : trgm *ptr;
959 : int i;
960 :
961 14 : trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
962 14 : d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
963 :
964 88 : for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
965 : {
966 74 : text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
967 :
968 74 : if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
969 : {
970 0 : snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
971 0 : SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
972 : }
973 : else
974 : {
975 74 : SET_VARSIZE(item, VARHDRSZ + 3);
976 74 : CPTRGM(VARDATA(item), ptr);
977 : }
978 74 : d[i] = PointerGetDatum(item);
979 : }
980 :
981 14 : a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
982 :
983 88 : for (i = 0; i < ARRNELEM(trg); i++)
984 74 : pfree(DatumGetPointer(d[i]));
985 :
986 14 : pfree(d);
987 14 : pfree(trg);
988 14 : PG_FREE_IF_COPY(in, 0);
989 :
990 14 : PG_RETURN_POINTER(a);
991 : }
992 :
993 : float4
994 138204 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
995 : {
996 : trgm *ptr1,
997 : *ptr2;
998 138204 : int count = 0;
999 : int len1,
1000 : len2;
1001 :
1002 138204 : ptr1 = GETARR(trg1);
1003 138204 : ptr2 = GETARR(trg2);
1004 :
1005 138204 : len1 = ARRNELEM(trg1);
1006 138204 : len2 = ARRNELEM(trg2);
1007 :
1008 : /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1009 138204 : if (len1 <= 0 || len2 <= 0)
1010 2 : return (float4) 0.0;
1011 :
1012 1762514 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1013 : {
1014 1624312 : int res = CMPTRGM(ptr1, ptr2);
1015 :
1016 1624312 : if (res < 0)
1017 369670 : ptr1++;
1018 1254642 : else if (res > 0)
1019 430402 : ptr2++;
1020 : else
1021 : {
1022 824240 : ptr1++;
1023 824240 : ptr2++;
1024 824240 : count++;
1025 : }
1026 : }
1027 :
1028 : /*
1029 : * If inexact then len2 is equal to count, because we don't know actual
1030 : * length of second string in inexact search and we can assume that count
1031 : * is a lower bound of len2.
1032 : */
1033 138202 : return CALCSML(count, len1, inexact ? count : len2);
1034 : }
1035 :
1036 :
1037 : /*
1038 : * Returns whether trg2 contains all trigrams in trg1.
1039 : * This relies on the trigram arrays being sorted.
1040 : */
1041 : bool
1042 380 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
1043 : {
1044 : trgm *ptr1,
1045 : *ptr2;
1046 : int len1,
1047 : len2;
1048 :
1049 380 : ptr1 = GETARR(trg1);
1050 380 : ptr2 = GETARR(trg2);
1051 :
1052 380 : len1 = ARRNELEM(trg1);
1053 380 : len2 = ARRNELEM(trg2);
1054 :
1055 1244 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1056 : {
1057 1198 : int res = CMPTRGM(ptr1, ptr2);
1058 :
1059 1198 : if (res < 0)
1060 334 : return false;
1061 864 : else if (res > 0)
1062 640 : ptr2++;
1063 : else
1064 : {
1065 224 : ptr1++;
1066 224 : ptr2++;
1067 : }
1068 : }
1069 46 : if (ptr1 - GETARR(trg1) < len1)
1070 8 : return false;
1071 : else
1072 38 : return true;
1073 : }
1074 :
1075 : /*
1076 : * Return a palloc'd boolean array showing, for each trigram in "query",
1077 : * whether it is present in the trigram array "key".
1078 : * This relies on the "key" array being sorted, but "query" need not be.
1079 : */
1080 : bool *
1081 4300 : trgm_presence_map(TRGM *query, TRGM *key)
1082 : {
1083 : bool *result;
1084 4300 : trgm *ptrq = GETARR(query),
1085 4300 : *ptrk = GETARR(key);
1086 4300 : int lenq = ARRNELEM(query),
1087 4300 : lenk = ARRNELEM(key),
1088 : i;
1089 :
1090 4300 : result = (bool *) palloc0(lenq * sizeof(bool));
1091 :
1092 : /* for each query trigram, do a binary search in the key array */
1093 1015120 : for (i = 0; i < lenq; i++)
1094 : {
1095 1010820 : int lo = 0;
1096 1010820 : int hi = lenk;
1097 :
1098 4747306 : while (lo < hi)
1099 : {
1100 3752564 : int mid = (lo + hi) / 2;
1101 3752564 : int res = CMPTRGM(ptrq, ptrk + mid);
1102 :
1103 3752564 : if (res < 0)
1104 1568164 : hi = mid;
1105 2184400 : else if (res > 0)
1106 2168322 : lo = mid + 1;
1107 : else
1108 : {
1109 16078 : result[i] = true;
1110 16078 : break;
1111 : }
1112 : }
1113 1010820 : ptrq++;
1114 : }
1115 :
1116 4300 : return result;
1117 : }
1118 :
1119 : Datum
1120 62904 : similarity(PG_FUNCTION_ARGS)
1121 : {
1122 62904 : text *in1 = PG_GETARG_TEXT_PP(0);
1123 62904 : text *in2 = PG_GETARG_TEXT_PP(1);
1124 : TRGM *trg1,
1125 : *trg2;
1126 : float4 res;
1127 :
1128 62904 : trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1129 62904 : trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1130 :
1131 62904 : res = cnt_sml(trg1, trg2, false);
1132 :
1133 62904 : pfree(trg1);
1134 62904 : pfree(trg2);
1135 62904 : PG_FREE_IF_COPY(in1, 0);
1136 62904 : PG_FREE_IF_COPY(in2, 1);
1137 :
1138 62904 : PG_RETURN_FLOAT4(res);
1139 : }
1140 :
1141 : Datum
1142 1804 : word_similarity(PG_FUNCTION_ARGS)
1143 : {
1144 1804 : text *in1 = PG_GETARG_TEXT_PP(0);
1145 1804 : text *in2 = PG_GETARG_TEXT_PP(1);
1146 : float4 res;
1147 :
1148 3608 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1149 3608 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1150 : 0);
1151 :
1152 1804 : PG_FREE_IF_COPY(in1, 0);
1153 1804 : PG_FREE_IF_COPY(in2, 1);
1154 1804 : PG_RETURN_FLOAT4(res);
1155 : }
1156 :
1157 : Datum
1158 1764 : strict_word_similarity(PG_FUNCTION_ARGS)
1159 : {
1160 1764 : text *in1 = PG_GETARG_TEXT_PP(0);
1161 1764 : text *in2 = PG_GETARG_TEXT_PP(1);
1162 : float4 res;
1163 :
1164 3528 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1165 3528 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1166 : WORD_SIMILARITY_STRICT);
1167 :
1168 1764 : PG_FREE_IF_COPY(in1, 0);
1169 1764 : PG_FREE_IF_COPY(in2, 1);
1170 1764 : PG_RETURN_FLOAT4(res);
1171 : }
1172 :
1173 : Datum
1174 2008 : similarity_dist(PG_FUNCTION_ARGS)
1175 : {
1176 2008 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1177 : PG_GETARG_DATUM(0),
1178 : PG_GETARG_DATUM(1)));
1179 :
1180 2008 : PG_RETURN_FLOAT4(1.0 - res);
1181 : }
1182 :
1183 : Datum
1184 12000 : similarity_op(PG_FUNCTION_ARGS)
1185 : {
1186 12000 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1187 : PG_GETARG_DATUM(0),
1188 : PG_GETARG_DATUM(1)));
1189 :
1190 12000 : PG_RETURN_BOOL(res >= similarity_threshold);
1191 : }
1192 :
1193 : Datum
1194 3848 : word_similarity_op(PG_FUNCTION_ARGS)
1195 : {
1196 3848 : text *in1 = PG_GETARG_TEXT_PP(0);
1197 3848 : text *in2 = PG_GETARG_TEXT_PP(1);
1198 : float4 res;
1199 :
1200 7696 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1201 7696 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1202 : WORD_SIMILARITY_CHECK_ONLY);
1203 :
1204 3848 : PG_FREE_IF_COPY(in1, 0);
1205 3848 : PG_FREE_IF_COPY(in2, 1);
1206 3848 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1207 : }
1208 :
1209 : Datum
1210 3848 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
1211 : {
1212 3848 : text *in1 = PG_GETARG_TEXT_PP(0);
1213 3848 : text *in2 = PG_GETARG_TEXT_PP(1);
1214 : float4 res;
1215 :
1216 7696 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1217 7696 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1218 : WORD_SIMILARITY_CHECK_ONLY);
1219 :
1220 3848 : PG_FREE_IF_COPY(in1, 0);
1221 3848 : PG_FREE_IF_COPY(in2, 1);
1222 3848 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1223 : }
1224 :
1225 : Datum
1226 0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
1227 : {
1228 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1229 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1230 : float4 res;
1231 :
1232 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1233 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1234 : 0);
1235 :
1236 0 : PG_FREE_IF_COPY(in1, 0);
1237 0 : PG_FREE_IF_COPY(in2, 1);
1238 0 : PG_RETURN_FLOAT4(1.0 - res);
1239 : }
1240 :
1241 : Datum
1242 1428 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1243 : {
1244 1428 : text *in1 = PG_GETARG_TEXT_PP(0);
1245 1428 : text *in2 = PG_GETARG_TEXT_PP(1);
1246 : float4 res;
1247 :
1248 2856 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1249 2856 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1250 : 0);
1251 :
1252 1428 : PG_FREE_IF_COPY(in1, 0);
1253 1428 : PG_FREE_IF_COPY(in2, 1);
1254 1428 : PG_RETURN_FLOAT4(1.0 - res);
1255 : }
1256 :
1257 : Datum
1258 5060 : strict_word_similarity_op(PG_FUNCTION_ARGS)
1259 : {
1260 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1261 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1262 : float4 res;
1263 :
1264 10120 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1265 10120 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1266 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1267 :
1268 5060 : PG_FREE_IF_COPY(in1, 0);
1269 5060 : PG_FREE_IF_COPY(in2, 1);
1270 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1271 : }
1272 :
1273 : Datum
1274 5060 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1275 : {
1276 5060 : text *in1 = PG_GETARG_TEXT_PP(0);
1277 5060 : text *in2 = PG_GETARG_TEXT_PP(1);
1278 : float4 res;
1279 :
1280 10120 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1281 10120 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1282 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1283 :
1284 5060 : PG_FREE_IF_COPY(in1, 0);
1285 5060 : PG_FREE_IF_COPY(in2, 1);
1286 5060 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1287 : }
1288 :
1289 : Datum
1290 0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1291 : {
1292 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1293 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1294 : float4 res;
1295 :
1296 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1297 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1298 : WORD_SIMILARITY_STRICT);
1299 :
1300 0 : PG_FREE_IF_COPY(in1, 0);
1301 0 : PG_FREE_IF_COPY(in2, 1);
1302 0 : PG_RETURN_FLOAT4(1.0 - res);
1303 : }
1304 :
1305 : Datum
1306 1440 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1307 : {
1308 1440 : text *in1 = PG_GETARG_TEXT_PP(0);
1309 1440 : text *in2 = PG_GETARG_TEXT_PP(1);
1310 : float4 res;
1311 :
1312 2880 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1313 2880 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1314 : WORD_SIMILARITY_STRICT);
1315 :
1316 1440 : PG_FREE_IF_COPY(in1, 0);
1317 1440 : PG_FREE_IF_COPY(in2, 1);
1318 1440 : PG_RETURN_FLOAT4(1.0 - res);
1319 : }
|