Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * trgm_regexp.c
4 : * Regular expression matching using trigrams.
5 : *
6 : * The general idea of trigram index support for a regular expression (regex)
7 : * search is to transform the regex into a logical expression on trigrams.
8 : * For example:
9 : *
10 : * (ab|cd)efg => ((abe & bef) | (cde & def)) & efg
11 : *
12 : * If a string matches the regex, then it must match the logical expression on
13 : * trigrams. The opposite is not necessarily true, however: a string that
14 : * matches the logical expression might not match the original regex. Such
15 : * false positives are removed via recheck, by running the regular regex match
16 : * operator on the retrieved heap tuple.
17 : *
18 : * Since the trigram expression involves both AND and OR operators, we can't
19 : * expect the core index machinery to evaluate it completely. Instead, the
20 : * result of regex analysis is a list of trigrams to be sought in the index,
21 : * plus a simplified graph that is used by trigramsMatchGraph() to determine
22 : * whether a particular indexed value matches the expression.
23 : *
24 : * Converting a regex to a trigram expression is based on analysis of an
25 : * automaton corresponding to the regex. The algorithm consists of four
26 : * stages:
27 : *
28 : * 1) Compile the regexp to NFA form. This is handled by the PostgreSQL
29 : * regexp library, which provides accessors for its opaque regex_t struct
30 : * to expose the NFA state graph and the "colors" (sets of equivalent
31 : * characters) used as state transition labels.
32 : *
33 : * 2) Transform the original NFA into an expanded graph, where arcs
34 : * are labeled with trigrams that must be present in order to move from
35 : * one state to another via the arcs. The trigrams used in this stage
36 : * consist of colors, not characters, as in the original NFA.
37 : *
38 : * 3) Expand the color trigrams into regular trigrams consisting of
39 : * characters. If too many distinct trigrams are produced, trigrams are
40 : * eliminated and the graph is simplified until it's simple enough.
41 : *
42 : * 4) Finally, the resulting graph is packed into a TrgmPackedGraph struct,
43 : * and returned to the caller.
44 : *
45 : * 1) Compile the regexp to NFA form
46 : * ---------------------------------
47 : * The automaton returned by the regexp compiler is a graph where vertices
48 : * are "states" and arcs are labeled with colors. Each color represents
49 : * a set of characters, so that all characters assigned to the same color
50 : * are interchangeable, so far as matching the regexp is concerned. There
51 : * are two special states: "initial" and "final". A state can have multiple
52 : * outgoing arcs labeled with the same color, which makes the automaton
53 : * non-deterministic, because it can be in many states simultaneously.
54 : *
55 : * Note that this NFA is already lossy compared to the original regexp,
56 : * since it ignores some regex features such as lookahead constraints and
57 : * backref matching. This is OK for our purposes since it's still the case
58 : * that only strings matching the NFA can possibly satisfy the regexp.
59 : *
60 : * 2) Transform the original NFA into an expanded graph
61 : * ----------------------------------------------------
62 : * In the 2nd stage, the automaton is transformed into a graph based on the
63 : * original NFA. Each state in the expanded graph represents a state from
64 : * the original NFA, plus a prefix identifying the last two characters
65 : * (colors, to be precise) seen before entering the state. There can be
66 : * multiple states in the expanded graph for each state in the original NFA,
67 : * depending on what characters can precede it. A prefix position can be
68 : * "unknown" if it's uncertain what the preceding character was, or "blank"
69 : * if the character was a non-word character (we don't need to distinguish
70 : * which non-word character it was, so just think of all of them as blanks).
71 : *
72 : * For convenience in description, call an expanded-state identifier
73 : * (two prefix colors plus a state number from the original NFA) an
74 : * "enter key".
75 : *
76 : * Each arc of the expanded graph is labeled with a trigram that must be
77 : * present in the string to match. We can construct this from an out-arc of
78 : * the underlying NFA state by combining the expanded state's prefix with the
79 : * color label of the underlying out-arc, if neither prefix position is
80 : * "unknown". But note that some of the colors in the trigram might be
81 : * "blank". This is OK since we want to generate word-boundary trigrams as
82 : * the regular trigram machinery would, if we know that some word characters
83 : * must be adjacent to a word boundary in all strings matching the NFA.
84 : *
85 : * The expanded graph can also have fewer states than the original NFA,
86 : * because we don't bother to make a separate state entry unless the state
87 : * is reachable by a valid arc. When an enter key is reachable from a state
88 : * of the expanded graph, but we do not know a complete trigram associated
89 : * with that transition, we cannot make a valid arc; instead we insert the
90 : * enter key into the enterKeys list of the source state. This effectively
91 : * means that the two expanded states are not reliably distinguishable based
92 : * on examining trigrams.
93 : *
94 : * So the expanded graph resembles the original NFA, but the arcs are
95 : * labeled with trigrams instead of individual characters, and there may be
96 : * more or fewer states. It is a lossy representation of the original NFA:
97 : * any string that matches the original regexp must match the expanded graph,
98 : * but the reverse is not true.
99 : *
100 : * We build the expanded graph through a breadth-first traversal of states
101 : * reachable from the initial state. At each reachable state, we identify the
102 : * states reachable from it without traversing a predictable trigram, and add
103 : * those states' enter keys to the current state. Then we generate all
104 : * out-arcs leading out of this collection of states that have predictable
105 : * trigrams, adding their target states to the queue of states to examine.
106 : *
107 : * When building the graph, if the number of states or arcs exceed pre-defined
108 : * limits, we give up and simply mark any states not yet processed as final
109 : * states. Roughly speaking, that means that we make use of some portion from
110 : * the beginning of the regexp. Also, any colors that have too many member
111 : * characters are treated as "unknown", so that we can't derive trigrams
112 : * from them.
113 : *
114 : * 3) Expand the color trigrams into regular trigrams
115 : * --------------------------------------------------
116 : * The trigrams in the expanded graph are "color trigrams", consisting
117 : * of three consecutive colors that must be present in the string. But for
118 : * search, we need regular trigrams consisting of characters. In the 3rd
119 : * stage, the color trigrams are expanded into regular trigrams. Since each
120 : * color can represent many characters, the total number of regular trigrams
121 : * after expansion could be very large. Because searching the index for
122 : * thousands of trigrams would be slow, and would likely produce so many
123 : * false positives that we would have to traverse a large fraction of the
124 : * index, the graph is simplified further in a lossy fashion by removing
125 : * color trigrams. When a color trigram is removed, the states connected by
126 : * any arcs labeled with that trigram are merged.
127 : *
128 : * Trigrams do not all have equivalent value for searching: some of them are
129 : * more frequent and some of them are less frequent. Ideally, we would like
130 : * to know the distribution of trigrams, but we don't. But because of padding
131 : * we know for sure that the empty character is more frequent than others,
132 : * so we can penalize trigrams according to presence of whitespace. The
133 : * penalty assigned to each color trigram is the number of simple trigrams
134 : * it would produce, times the penalties[] multiplier associated with its
135 : * whitespace content. (The penalties[] constants were calculated by analysis
136 : * of some real-life text.) We eliminate color trigrams starting with the
137 : * highest-penalty one, until we get to a total penalty of no more than
138 : * WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
139 : * lead to merging the initial and final states, so we may not be able to
140 : * reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
141 : * MAX_TRGM_COUNT simple trigrams in total, otherwise we fail.
142 : *
143 : * 4) Pack the graph into a compact representation
144 : * -----------------------------------------------
145 : * The 2nd and 3rd stages might have eliminated or merged many of the states
146 : * and trigrams created earlier, so in this final stage, the graph is
147 : * compacted and packed into a simpler struct that contains only the
148 : * information needed to evaluate it.
149 : *
150 : * ALGORITHM EXAMPLE:
151 : *
152 : * Consider the example regex "ab[cd]". This regex is transformed into the
153 : * following NFA (for simplicity we show colors as their single members):
154 : *
155 : * 4#
156 : * c/
157 : * a b /
158 : * 1* --- 2 ---- 3
159 : * \
160 : * d\
161 : * 5#
162 : *
163 : * We use * to mark initial state and # to mark final state. It's not depicted,
164 : * but states 1, 4, 5 have self-referencing arcs for all possible characters,
165 : * because this pattern can match to any part of a string.
166 : *
167 : * As the result of stage 2 we will have the following graph:
168 : *
169 : * abc abd
170 : * 2# <---- 1* ----> 3#
171 : *
172 : * The process for generating this graph is:
173 : * 1) Create state 1 with enter key (UNKNOWN, UNKNOWN, 1).
174 : * 2) Add key (UNKNOWN, "a", 2) to state 1.
175 : * 3) Add key ("a", "b", 3) to state 1.
176 : * 4) Create new state 2 with enter key ("b", "c", 4). Add an arc
177 : * from state 1 to state 2 with label trigram "abc".
178 : * 5) Mark state 2 final because state 4 of source NFA is marked as final.
179 : * 6) Create new state 3 with enter key ("b", "d", 5). Add an arc
180 : * from state 1 to state 3 with label trigram "abd".
181 : * 7) Mark state 3 final because state 5 of source NFA is marked as final.
182 : *
183 : *
184 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
185 : * Portions Copyright (c) 1994, Regents of the University of California
186 : *
187 : * IDENTIFICATION
188 : * contrib/pg_trgm/trgm_regexp.c
189 : *
190 : *-------------------------------------------------------------------------
191 : */
192 : #include "postgres.h"
193 :
194 : #include "catalog/pg_collation_d.h"
195 : #include "regex/regexport.h"
196 : #include "trgm.h"
197 : #include "tsearch/ts_locale.h"
198 : #include "utils/formatting.h"
199 : #include "utils/hsearch.h"
200 : #include "utils/memutils.h"
201 : #include "varatt.h"
202 :
203 : /*
204 : * Uncomment (or use -DTRGM_REGEXP_DEBUG) to print debug info,
205 : * for exploring and debugging the algorithm implementation.
206 : * This produces three graph files in /tmp, in Graphviz .gv format.
207 : * Some progress information is also printed to postmaster stderr.
208 : */
209 : /* #define TRGM_REGEXP_DEBUG */
210 :
211 : /*
212 : * These parameters are used to limit the amount of work done.
213 : * Otherwise regex processing could be too slow and memory-consuming.
214 : *
215 : * MAX_EXPANDED_STATES - How many states we allow in expanded graph
216 : * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
217 : * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
218 : * WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
219 : * COLOR_COUNT_LIMIT - Maximum number of characters per color
220 : */
221 : #define MAX_EXPANDED_STATES 128
222 : #define MAX_EXPANDED_ARCS 1024
223 : #define MAX_TRGM_COUNT 256
224 : #define WISH_TRGM_PENALTY 16
225 : #define COLOR_COUNT_LIMIT 256
226 :
227 : /*
228 : * Penalty multipliers for trigram counts depending on whitespace contents.
229 : * Numbers based on analysis of real-life texts.
230 : */
231 : static const float4 penalties[8] = {
232 : 1.0f, /* "aaa" */
233 : 3.5f, /* "aa " */
234 : 0.0f, /* "a a" (impossible) */
235 : 0.0f, /* "a " (impossible) */
236 : 4.2f, /* " aa" */
237 : 2.1f, /* " a " */
238 : 25.0f, /* " a" */
239 : 0.0f /* " " (impossible) */
240 : };
241 :
242 : /* Struct representing a single pg_wchar, converted back to multibyte form */
243 : typedef struct
244 : {
245 : char bytes[MAX_MULTIBYTE_CHAR_LEN];
246 : } trgm_mb_char;
247 :
248 : /*
249 : * Attributes of NFA colors:
250 : *
251 : * expandable - we know the character expansion of this color
252 : * containsNonWord - color contains non-word characters
253 : * (which will not be extracted into trigrams)
254 : * wordCharsCount - count of word characters in color
255 : * wordChars - array of this color's word characters
256 : * (which can be extracted into trigrams)
257 : *
258 : * When expandable is false, the other attributes don't matter; we just
259 : * assume this color represents unknown character(s).
260 : */
261 : typedef struct
262 : {
263 : bool expandable;
264 : bool containsNonWord;
265 : int wordCharsCount;
266 : trgm_mb_char *wordChars;
267 : } TrgmColorInfo;
268 :
269 : /*
270 : * A "prefix" is information about the colors of the last two characters read
271 : * before reaching a specific NFA state. These colors can have special values
272 : * COLOR_UNKNOWN and COLOR_BLANK. COLOR_UNKNOWN means that we have no
273 : * information, for example because we read some character of an unexpandable
274 : * color. COLOR_BLANK means that we read a non-word character.
275 : *
276 : * We call a prefix ambiguous if at least one of its colors is unknown. It's
277 : * fully ambiguous if both are unknown, partially ambiguous if only the first
278 : * is unknown. (The case of first color known, second unknown is not valid.)
279 : *
280 : * Wholly- or partly-blank prefixes are mostly handled the same as regular
281 : * color prefixes. This allows us to generate appropriate partly-blank
282 : * trigrams when the NFA requires word character(s) to appear adjacent to
283 : * non-word character(s).
284 : */
285 : typedef int TrgmColor;
286 :
287 : /* We assume that colors returned by the regexp engine cannot be these: */
288 : #define COLOR_UNKNOWN (-3)
289 : #define COLOR_BLANK (-4)
290 :
291 : typedef struct
292 : {
293 : TrgmColor colors[2];
294 : } TrgmPrefix;
295 :
296 : /*
297 : * Color-trigram data type. Note that some elements of the trigram can be
298 : * COLOR_BLANK, but we don't allow COLOR_UNKNOWN.
299 : */
300 : typedef struct
301 : {
302 : TrgmColor colors[3];
303 : } ColorTrgm;
304 :
305 : /*
306 : * Key identifying a state of our expanded graph: color prefix, and number
307 : * of the corresponding state in the underlying regex NFA. The color prefix
308 : * shows how we reached the regex state (to the extent that we know it).
309 : */
310 : typedef struct
311 : {
312 : TrgmPrefix prefix;
313 : int nstate;
314 : } TrgmStateKey;
315 :
316 : /*
317 : * One state of the expanded graph.
318 : *
319 : * stateKey - ID of this state
320 : * arcs - outgoing arcs of this state (List of TrgmArc)
321 : * enterKeys - enter keys reachable from this state without reading any
322 : * predictable trigram (List of TrgmStateKey)
323 : * flags - flag bits
324 : * snumber - number of this state (initially assigned as -1, -2, etc,
325 : * for debugging purposes only; then at the packaging stage,
326 : * surviving states are renumbered with positive numbers)
327 : * parent - parent state, if this state has been merged into another
328 : * tentFlags - flags this state would acquire via planned merges
329 : * tentParent - planned parent state, if considering a merge
330 : */
331 : #define TSTATE_INIT 0x01 /* flag indicating this state is initial */
332 : #define TSTATE_FIN 0x02 /* flag indicating this state is final */
333 :
334 : typedef struct TrgmState
335 : {
336 : TrgmStateKey stateKey; /* hashtable key: must be first field */
337 : List *arcs;
338 : List *enterKeys;
339 : int flags;
340 : int snumber;
341 : struct TrgmState *parent;
342 : int tentFlags;
343 : struct TrgmState *tentParent;
344 : } TrgmState;
345 :
346 : /*
347 : * One arc in the expanded graph.
348 : */
349 : typedef struct
350 : {
351 : ColorTrgm ctrgm; /* trigram needed to traverse arc */
352 : TrgmState *target; /* next state */
353 : } TrgmArc;
354 :
355 : /*
356 : * Information about arc of specific color trigram (used in stage 3)
357 : *
358 : * Contains pointers to the source and target states.
359 : */
360 : typedef struct
361 : {
362 : TrgmState *source;
363 : TrgmState *target;
364 : } TrgmArcInfo;
365 :
366 : /*
367 : * Information about color trigram (used in stage 3)
368 : *
369 : * ctrgm - trigram itself
370 : * cnumber - number of this trigram (used in the packaging stage)
371 : * count - number of simple trigrams created from this color trigram
372 : * expanded - indicates this color trigram is expanded into simple trigrams
373 : * arcs - list of all arcs labeled with this color trigram.
374 : */
375 : typedef struct
376 : {
377 : ColorTrgm ctrgm;
378 : int cnumber;
379 : int count;
380 : float4 penalty;
381 : bool expanded;
382 : List *arcs;
383 : } ColorTrgmInfo;
384 :
385 : /*
386 : * Data structure representing all the data we need during regex processing.
387 : *
388 : * regex - compiled regex
389 : * colorInfo - extracted information about regex's colors
390 : * ncolors - number of colors in colorInfo[]
391 : * states - hashtable of TrgmStates (states of expanded graph)
392 : * initState - pointer to initial state of expanded graph
393 : * queue - queue of to-be-processed TrgmStates
394 : * keysQueue - queue of to-be-processed TrgmStateKeys
395 : * arcsCount - total number of arcs of expanded graph (for resource
396 : * limiting)
397 : * overflowed - we have exceeded resource limit for transformation
398 : * colorTrgms - array of all color trigrams present in graph
399 : * colorTrgmsCount - count of those color trigrams
400 : * totalTrgmCount - total count of extracted simple trigrams
401 : */
402 : typedef struct
403 : {
404 : /* Source regexp, and color information extracted from it (stage 1) */
405 : regex_t *regex;
406 : TrgmColorInfo *colorInfo;
407 : int ncolors;
408 :
409 : /* Expanded graph (stage 2) */
410 : HTAB *states;
411 : TrgmState *initState;
412 : int nstates;
413 :
414 : /* Workspace for stage 2 */
415 : List *queue;
416 : List *keysQueue;
417 : int arcsCount;
418 : bool overflowed;
419 :
420 : /* Information about distinct color trigrams in the graph (stage 3) */
421 : ColorTrgmInfo *colorTrgms;
422 : int colorTrgmsCount;
423 : int totalTrgmCount;
424 : } TrgmNFA;
425 :
426 : /*
427 : * Final, compact representation of expanded graph.
428 : */
429 : typedef struct
430 : {
431 : int targetState; /* index of target state (zero-based) */
432 : int colorTrgm; /* index of color trigram for transition */
433 : } TrgmPackedArc;
434 :
435 : typedef struct
436 : {
437 : int arcsCount; /* number of out-arcs for this state */
438 : TrgmPackedArc *arcs; /* array of arcsCount packed arcs */
439 : } TrgmPackedState;
440 :
441 : /* "typedef struct TrgmPackedGraph TrgmPackedGraph" appears in trgm.h */
442 : struct TrgmPackedGraph
443 : {
444 : /*
445 : * colorTrigramsCount and colorTrigramGroups contain information about how
446 : * trigrams are grouped into color trigrams. "colorTrigramsCount" is the
447 : * count of color trigrams and "colorTrigramGroups" contains number of
448 : * simple trigrams for each color trigram. The array of simple trigrams
449 : * (stored separately from this struct) is ordered so that the simple
450 : * trigrams for each color trigram are consecutive, and they're in order
451 : * by color trigram number.
452 : */
453 : int colorTrigramsCount;
454 : int *colorTrigramGroups; /* array of size colorTrigramsCount */
455 :
456 : /*
457 : * The states of the simplified NFA. State number 0 is always initial
458 : * state and state number 1 is always final state.
459 : */
460 : int statesCount;
461 : TrgmPackedState *states; /* array of size statesCount */
462 :
463 : /* Temporary work space for trigramsMatchGraph() */
464 : bool *colorTrigramsActive; /* array of size colorTrigramsCount */
465 : bool *statesActive; /* array of size statesCount */
466 : int *statesQueue; /* array of size statesCount */
467 : };
468 :
469 : /*
470 : * Temporary structure for representing an arc during packaging.
471 : */
472 : typedef struct
473 : {
474 : int sourceState;
475 : int targetState;
476 : int colorTrgm;
477 : } TrgmPackArcInfo;
478 :
479 :
480 : /* prototypes for private functions */
481 : static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
482 : MemoryContext rcontext);
483 : static void RE_compile(regex_t *regex, text *text_re,
484 : int cflags, Oid collation);
485 : static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA);
486 : static bool convertPgWchar(pg_wchar c, trgm_mb_char *result);
487 : static void transformGraph(TrgmNFA *trgmNFA);
488 : static void processState(TrgmNFA *trgmNFA, TrgmState *state);
489 : static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key);
490 : static void addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key);
491 : static void addArcs(TrgmNFA *trgmNFA, TrgmState *state);
492 : static void addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
493 : TrgmColor co, TrgmStateKey *destKey);
494 : static bool validArcLabel(TrgmStateKey *key, TrgmColor co);
495 : static TrgmState *getState(TrgmNFA *trgmNFA, TrgmStateKey *key);
496 : static bool prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2);
497 : static bool selectColorTrigrams(TrgmNFA *trgmNFA);
498 : static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
499 : static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
500 : static void mergeStates(TrgmState *state1, TrgmState *state2);
501 : static int colorTrgmInfoCmp(const void *p1, const void *p2);
502 : static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
503 : static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
504 : static int packArcInfoCmp(const void *a1, const void *a2);
505 :
506 : #ifdef TRGM_REGEXP_DEBUG
507 : static void printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors);
508 : static void printTrgmNFA(TrgmNFA *trgmNFA);
509 : static void printTrgmColor(StringInfo buf, TrgmColor co);
510 : static void printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams);
511 : #endif
512 :
513 :
514 : /*
515 : * Main entry point to process a regular expression.
516 : *
517 : * Returns an array of trigrams required by the regular expression, or NULL if
518 : * the regular expression was too complex to analyze. In addition, a packed
519 : * graph representation of the regex is returned into *graph. The results
520 : * must be allocated in rcontext (which might or might not be the current
521 : * context).
522 : */
523 : TRGM *
524 130 : createTrgmNFA(text *text_re, Oid collation,
525 : TrgmPackedGraph **graph, MemoryContext rcontext)
526 : {
527 : TRGM *trg;
528 : regex_t regex;
529 : MemoryContext tmpcontext;
530 : MemoryContext oldcontext;
531 :
532 : /*
533 : * This processing generates a great deal of cruft, which we'd like to
534 : * clean up before returning (since this function may be called in a
535 : * query-lifespan memory context). Make a temp context we can work in so
536 : * that cleanup is easy.
537 : */
538 130 : tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
539 : "createTrgmNFA temporary context",
540 : ALLOCSET_DEFAULT_SIZES);
541 130 : oldcontext = MemoryContextSwitchTo(tmpcontext);
542 :
543 : /*
544 : * Stage 1: Compile the regexp into a NFA, using the regexp library.
545 : */
546 : #ifdef IGNORECASE
547 130 : RE_compile(®ex, text_re,
548 : REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
549 : #else
550 : RE_compile(®ex, text_re,
551 : REG_ADVANCED | REG_NOSUB, collation);
552 : #endif
553 :
554 130 : trg = createTrgmNFAInternal(®ex, graph, rcontext);
555 :
556 : /* Clean up all the cruft we created (including regex) */
557 130 : MemoryContextSwitchTo(oldcontext);
558 130 : MemoryContextDelete(tmpcontext);
559 :
560 130 : return trg;
561 : }
562 :
563 : /*
564 : * Body of createTrgmNFA, exclusive of regex compilation/freeing.
565 : */
566 : static TRGM *
567 130 : createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
568 : MemoryContext rcontext)
569 : {
570 : TRGM *trg;
571 : TrgmNFA trgmNFA;
572 :
573 130 : trgmNFA.regex = regex;
574 :
575 : /* Collect color information from the regex */
576 130 : getColorInfo(regex, &trgmNFA);
577 :
578 : #ifdef TRGM_REGEXP_DEBUG
579 : printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors);
580 : #endif
581 :
582 : /*
583 : * Stage 2: Create an expanded graph from the source NFA.
584 : */
585 130 : transformGraph(&trgmNFA);
586 :
587 : #ifdef TRGM_REGEXP_DEBUG
588 : printTrgmNFA(&trgmNFA);
589 : #endif
590 :
591 : /*
592 : * Fail if we were unable to make a nontrivial graph, ie it is possible to
593 : * get from the initial state to the final state without reading any
594 : * predictable trigram.
595 : */
596 130 : if (trgmNFA.initState->flags & TSTATE_FIN)
597 18 : return NULL;
598 :
599 : /*
600 : * Stage 3: Select color trigrams to expand. Fail if too many trigrams.
601 : */
602 112 : if (!selectColorTrigrams(&trgmNFA))
603 6 : return NULL;
604 :
605 : /*
606 : * Stage 4: Expand color trigrams and pack graph into final
607 : * representation.
608 : */
609 106 : trg = expandColorTrigrams(&trgmNFA, rcontext);
610 :
611 106 : *graph = packGraph(&trgmNFA, rcontext);
612 :
613 : #ifdef TRGM_REGEXP_DEBUG
614 : printTrgmPackedGraph(*graph, trg);
615 : #endif
616 :
617 106 : return trg;
618 : }
619 :
620 : /*
621 : * Main entry point for evaluating a graph during index scanning.
622 : *
623 : * The check[] array is indexed by trigram number (in the array of simple
624 : * trigrams returned by createTrgmNFA), and holds true for those trigrams
625 : * that are present in the index entry being checked.
626 : */
627 : bool
628 7122 : trigramsMatchGraph(TrgmPackedGraph *graph, bool *check)
629 : {
630 : int i,
631 : j,
632 : k,
633 : queueIn,
634 : queueOut;
635 :
636 : /*
637 : * Reset temporary working areas.
638 : */
639 7122 : memset(graph->colorTrigramsActive, 0,
640 7122 : sizeof(bool) * graph->colorTrigramsCount);
641 7122 : memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount);
642 :
643 : /*
644 : * Check which color trigrams were matched. A match for any simple
645 : * trigram associated with a color trigram counts as a match of the color
646 : * trigram.
647 : */
648 7122 : j = 0;
649 22102 : for (i = 0; i < graph->colorTrigramsCount; i++)
650 : {
651 14980 : int cnt = graph->colorTrigramGroups[i];
652 :
653 333660 : for (k = j; k < j + cnt; k++)
654 : {
655 326400 : if (check[k])
656 : {
657 : /*
658 : * Found one matched trigram in the group. Can skip the rest
659 : * of them and go to the next group.
660 : */
661 7720 : graph->colorTrigramsActive[i] = true;
662 7720 : break;
663 : }
664 : }
665 14980 : j = j + cnt;
666 : }
667 :
668 : /*
669 : * Initialize the statesQueue to hold just the initial state. Note:
670 : * statesQueue has room for statesCount entries, which is certainly enough
671 : * since no state will be put in the queue more than once. The
672 : * statesActive array marks which states have been queued.
673 : */
674 7122 : graph->statesActive[0] = true;
675 7122 : graph->statesQueue[0] = 0;
676 7122 : queueIn = 0;
677 7122 : queueOut = 1;
678 :
679 : /* Process queued states as long as there are any. */
680 15322 : while (queueIn < queueOut)
681 : {
682 15050 : int stateno = graph->statesQueue[queueIn++];
683 15050 : TrgmPackedState *state = &graph->states[stateno];
684 15050 : int cnt = state->arcsCount;
685 :
686 : /* Loop over state's out-arcs */
687 30328 : for (i = 0; i < cnt; i++)
688 : {
689 22128 : TrgmPackedArc *arc = &state->arcs[i];
690 :
691 : /*
692 : * If corresponding color trigram is present then activate the
693 : * corresponding state. We're done if that's the final state,
694 : * otherwise queue the state if it's not been queued already.
695 : */
696 22128 : if (graph->colorTrigramsActive[arc->colorTrgm])
697 : {
698 15464 : int nextstate = arc->targetState;
699 :
700 15464 : if (nextstate == 1)
701 6850 : return true; /* success: final state is reachable */
702 :
703 8614 : if (!graph->statesActive[nextstate])
704 : {
705 8500 : graph->statesActive[nextstate] = true;
706 8500 : graph->statesQueue[queueOut++] = nextstate;
707 : }
708 : }
709 : }
710 : }
711 :
712 : /* Queue is empty, so match fails. */
713 272 : return false;
714 : }
715 :
716 : /*
717 : * Compile regex string into struct at *regex.
718 : * NB: pg_regfree must be applied to regex if this completes successfully.
719 : */
720 : static void
721 130 : RE_compile(regex_t *regex, text *text_re, int cflags, Oid collation)
722 : {
723 130 : int text_re_len = VARSIZE_ANY_EXHDR(text_re);
724 130 : char *text_re_val = VARDATA_ANY(text_re);
725 : pg_wchar *pattern;
726 : int pattern_len;
727 : int regcomp_result;
728 : char errMsg[100];
729 :
730 : /* Convert pattern string to wide characters */
731 130 : pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
732 130 : pattern_len = pg_mb2wchar_with_len(text_re_val,
733 : pattern,
734 : text_re_len);
735 :
736 : /* Compile regex */
737 130 : regcomp_result = pg_regcomp(regex,
738 : pattern,
739 : pattern_len,
740 : cflags,
741 : collation);
742 :
743 130 : pfree(pattern);
744 :
745 130 : if (regcomp_result != REG_OKAY)
746 : {
747 : /* re didn't compile (no need for pg_regfree, if so) */
748 0 : pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg));
749 0 : ereport(ERROR,
750 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
751 : errmsg("invalid regular expression: %s", errMsg)));
752 : }
753 130 : }
754 :
755 :
756 : /*---------------------
757 : * Subroutines for pre-processing the color map (stage 1).
758 : *---------------------
759 : */
760 :
761 : /*
762 : * Fill TrgmColorInfo structure for each color using regex export functions.
763 : */
764 : static void
765 130 : getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
766 : {
767 130 : int colorsCount = pg_reg_getnumcolors(regex);
768 : int i;
769 :
770 130 : trgmNFA->ncolors = colorsCount;
771 130 : trgmNFA->colorInfo = (TrgmColorInfo *)
772 130 : palloc0(colorsCount * sizeof(TrgmColorInfo));
773 :
774 : /*
775 : * Loop over colors, filling TrgmColorInfo about each. Note we include
776 : * WHITE (0) even though we know it'll be reported as non-expandable.
777 : */
778 1192 : for (i = 0; i < colorsCount; i++)
779 : {
780 1062 : TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i];
781 1062 : int charsCount = pg_reg_getnumcharacters(regex, i);
782 : pg_wchar *chars;
783 : int j;
784 :
785 1062 : if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT)
786 : {
787 : /* Non expandable, or too large to work with */
788 650 : colorInfo->expandable = false;
789 650 : continue;
790 : }
791 :
792 412 : colorInfo->expandable = true;
793 412 : colorInfo->containsNonWord = false;
794 412 : colorInfo->wordChars = palloc_array(trgm_mb_char, charsCount);
795 412 : colorInfo->wordCharsCount = 0;
796 :
797 : /* Extract all the chars in this color */
798 412 : chars = palloc_array(pg_wchar, charsCount);
799 412 : pg_reg_getcharacters(regex, i, chars, charsCount);
800 :
801 : /*
802 : * Convert characters back to multibyte form, and save only those that
803 : * are word characters. Set "containsNonWord" if any non-word
804 : * character. (Note: it'd probably be nicer to keep the chars in
805 : * pg_wchar format for now, but ISWORDCHR wants to see multibyte.)
806 : */
807 1982 : for (j = 0; j < charsCount; j++)
808 : {
809 : trgm_mb_char c;
810 :
811 1570 : if (!convertPgWchar(chars[j], &c))
812 730 : continue; /* ok to ignore it altogether */
813 840 : if (ISWORDCHR(c.bytes))
814 790 : colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
815 : else
816 50 : colorInfo->containsNonWord = true;
817 : }
818 :
819 412 : pfree(chars);
820 : }
821 130 : }
822 :
823 : /*
824 : * Convert pg_wchar to multibyte format.
825 : * Returns false if the character should be ignored completely.
826 : */
827 : static bool
828 1570 : convertPgWchar(pg_wchar c, trgm_mb_char *result)
829 : {
830 : /* "s" has enough space for a multibyte character and a trailing NUL */
831 : char s[MAX_MULTIBYTE_CHAR_LEN + 1];
832 :
833 : /*
834 : * We can ignore the NUL character, since it can never appear in a PG text
835 : * string. This avoids the need for various special cases when
836 : * reconstructing trigrams.
837 : */
838 1570 : if (c == 0)
839 0 : return false;
840 :
841 : /* Do the conversion, making sure the result is NUL-terminated */
842 1570 : memset(s, 0, sizeof(s));
843 1570 : pg_wchar2mb_with_len(&c, s, 1);
844 :
845 : /*
846 : * In IGNORECASE mode, we can ignore uppercase characters. We assume that
847 : * the regex engine generated both uppercase and lowercase equivalents
848 : * within each color, since we used the REG_ICASE option; so there's no
849 : * need to process the uppercase version.
850 : *
851 : * XXX this code is dependent on the assumption that str_tolower() works
852 : * the same as the regex engine's internal case folding machinery. Might
853 : * be wiser to expose pg_wc_tolower and test whether c ==
854 : * pg_wc_tolower(c). On the other hand, the trigrams in the index were
855 : * created using str_tolower(), so we're probably screwed if there's any
856 : * incompatibility anyway.
857 : */
858 : #ifdef IGNORECASE
859 : {
860 1570 : char *lowerCased = str_tolower(s, strlen(s), DEFAULT_COLLATION_OID);
861 :
862 1570 : if (strcmp(lowerCased, s) != 0)
863 : {
864 730 : pfree(lowerCased);
865 730 : return false;
866 : }
867 840 : pfree(lowerCased);
868 : }
869 : #endif
870 :
871 : /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */
872 840 : memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN);
873 840 : return true;
874 : }
875 :
876 :
877 : /*---------------------
878 : * Subroutines for expanding original NFA graph into a trigram graph (stage 2).
879 : *---------------------
880 : */
881 :
882 : /*
883 : * Transform the graph, given a regex and extracted color information.
884 : *
885 : * We create and process a queue of expanded-graph states until all the states
886 : * are processed.
887 : *
888 : * This algorithm may be stopped due to resource limitation. In this case we
889 : * force every unprocessed branch to immediately finish with matching (this
890 : * can give us false positives but no false negatives) by marking all
891 : * unprocessed states as final.
892 : */
893 : static void
894 130 : transformGraph(TrgmNFA *trgmNFA)
895 : {
896 : HASHCTL hashCtl;
897 : TrgmStateKey initkey;
898 : TrgmState *initstate;
899 : ListCell *lc;
900 :
901 : /* Initialize this stage's workspace in trgmNFA struct */
902 130 : trgmNFA->queue = NIL;
903 130 : trgmNFA->keysQueue = NIL;
904 130 : trgmNFA->arcsCount = 0;
905 130 : trgmNFA->overflowed = false;
906 :
907 : /* Create hashtable for states */
908 130 : hashCtl.keysize = sizeof(TrgmStateKey);
909 130 : hashCtl.entrysize = sizeof(TrgmState);
910 130 : hashCtl.hcxt = CurrentMemoryContext;
911 130 : trgmNFA->states = hash_create("Trigram NFA",
912 : 1024,
913 : &hashCtl,
914 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
915 130 : trgmNFA->nstates = 0;
916 :
917 : /* Create initial state: ambiguous prefix, NFA's initial state */
918 130 : MemSet(&initkey, 0, sizeof(initkey));
919 130 : initkey.prefix.colors[0] = COLOR_UNKNOWN;
920 130 : initkey.prefix.colors[1] = COLOR_UNKNOWN;
921 130 : initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
922 :
923 130 : initstate = getState(trgmNFA, &initkey);
924 130 : initstate->flags |= TSTATE_INIT;
925 130 : trgmNFA->initState = initstate;
926 :
927 : /*
928 : * Recursively build the expanded graph by processing queue of states
929 : * (breadth-first search). getState already put initstate in the queue.
930 : * Note that getState will append new states to the queue within the loop,
931 : * too; this works as long as we don't do repeat fetches using the "lc"
932 : * pointer.
933 : */
934 1458 : foreach(lc, trgmNFA->queue)
935 : {
936 1328 : TrgmState *state = (TrgmState *) lfirst(lc);
937 :
938 : /*
939 : * If we overflowed then just mark state as final. Otherwise do
940 : * actual processing.
941 : */
942 1328 : if (trgmNFA->overflowed)
943 18 : state->flags |= TSTATE_FIN;
944 : else
945 1310 : processState(trgmNFA, state);
946 :
947 : /* Did we overflow? */
948 2656 : if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS ||
949 1328 : hash_get_num_entries(trgmNFA->states) > MAX_EXPANDED_STATES)
950 24 : trgmNFA->overflowed = true;
951 : }
952 130 : }
953 :
954 : /*
955 : * Process one state: add enter keys and then add outgoing arcs.
956 : */
957 : static void
958 1310 : processState(TrgmNFA *trgmNFA, TrgmState *state)
959 : {
960 : ListCell *lc;
961 :
962 : /* keysQueue should be NIL already, but make sure */
963 1310 : trgmNFA->keysQueue = NIL;
964 :
965 : /*
966 : * Add state's own key, and then process all keys added to keysQueue until
967 : * queue is finished. But we can quit if the state gets marked final.
968 : */
969 1310 : addKey(trgmNFA, state, &state->stateKey);
970 2532 : foreach(lc, trgmNFA->keysQueue)
971 : {
972 1384 : TrgmStateKey *key = (TrgmStateKey *) lfirst(lc);
973 :
974 1384 : if (state->flags & TSTATE_FIN)
975 162 : break;
976 1222 : addKey(trgmNFA, state, key);
977 : }
978 :
979 : /* Release keysQueue to clean up for next cycle */
980 1310 : list_free(trgmNFA->keysQueue);
981 1310 : trgmNFA->keysQueue = NIL;
982 :
983 : /*
984 : * Add outgoing arcs only if state isn't final (we have no interest in
985 : * outgoing arcs if we already match)
986 : */
987 1310 : if (!(state->flags & TSTATE_FIN))
988 1142 : addArcs(trgmNFA, state);
989 1310 : }
990 :
991 : /*
992 : * Add the given enter key into the state's enterKeys list, and determine
993 : * whether this should result in any further enter keys being added.
994 : * If so, add those keys to keysQueue so that processState will handle them.
995 : *
996 : * If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
997 : * This situation means that we can reach the final state from this expanded
998 : * state without reading any predictable trigram, so we must consider this
999 : * state as an accepting one.
1000 : *
1001 : * The given key could be a duplicate of one already in enterKeys, or be
1002 : * redundant with some enterKeys. So we check that before doing anything.
1003 : *
1004 : * Note that we don't generate any actual arcs here. addArcs will do that
1005 : * later, after we have identified all the enter keys for this state.
1006 : */
1007 : static void
1008 2532 : addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
1009 : {
1010 : regex_arc_t *arcs;
1011 : TrgmStateKey destKey;
1012 : ListCell *cell;
1013 : int i,
1014 : arcsCount;
1015 :
1016 : /*
1017 : * Ensure any pad bytes in destKey are zero, since it may get used as a
1018 : * hashtable key by getState.
1019 : */
1020 2532 : MemSet(&destKey, 0, sizeof(destKey));
1021 :
1022 : /*
1023 : * Compare key to each existing enter key of the state to check for
1024 : * redundancy. We can drop either old key(s) or the new key if we find
1025 : * redundancy.
1026 : */
1027 3958 : foreach(cell, state->enterKeys)
1028 : {
1029 2032 : TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1030 :
1031 2032 : if (existingKey->nstate == key->nstate)
1032 : {
1033 624 : if (prefixContains(&existingKey->prefix, &key->prefix))
1034 : {
1035 : /* This old key already covers the new key. Nothing to do */
1036 606 : return;
1037 : }
1038 18 : if (prefixContains(&key->prefix, &existingKey->prefix))
1039 : {
1040 : /*
1041 : * The new key covers this old key. Remove the old key, it's
1042 : * no longer needed once we add this key to the list.
1043 : */
1044 12 : state->enterKeys = foreach_delete_current(state->enterKeys,
1045 : cell);
1046 : }
1047 : }
1048 : }
1049 :
1050 : /* No redundancy, so add this key to the state's list */
1051 1926 : state->enterKeys = lappend(state->enterKeys, key);
1052 :
1053 : /* If state is now known final, mark it and we're done */
1054 1926 : if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
1055 : {
1056 168 : state->flags |= TSTATE_FIN;
1057 168 : return;
1058 : }
1059 :
1060 : /*
1061 : * Loop through all outgoing arcs of the corresponding state in the
1062 : * original NFA.
1063 : */
1064 1758 : arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1065 1758 : arcs = palloc_array(regex_arc_t, arcsCount);
1066 1758 : pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1067 :
1068 4768 : for (i = 0; i < arcsCount; i++)
1069 : {
1070 3010 : regex_arc_t *arc = &arcs[i];
1071 :
1072 3010 : if (pg_reg_colorisbegin(trgmNFA->regex, arc->co))
1073 : {
1074 : /*
1075 : * Start of line/string (^). Trigram extraction treats start of
1076 : * line same as start of word: double space prefix is added.
1077 : * Hence, make an enter key showing we can reach the arc
1078 : * destination with all-blank prefix.
1079 : */
1080 492 : destKey.prefix.colors[0] = COLOR_BLANK;
1081 492 : destKey.prefix.colors[1] = COLOR_BLANK;
1082 492 : destKey.nstate = arc->to;
1083 :
1084 : /* Add enter key to this state */
1085 492 : addKeyToQueue(trgmNFA, &destKey);
1086 : }
1087 2518 : else if (pg_reg_colorisend(trgmNFA->regex, arc->co))
1088 : {
1089 : /*
1090 : * End of line/string ($). We must consider this arc as a
1091 : * transition that doesn't read anything. The reason for adding
1092 : * this enter key to the state is that if the arc leads to the
1093 : * NFA's final state, we must mark this expanded state as final.
1094 : */
1095 324 : destKey.prefix.colors[0] = COLOR_UNKNOWN;
1096 324 : destKey.prefix.colors[1] = COLOR_UNKNOWN;
1097 324 : destKey.nstate = arc->to;
1098 :
1099 : /* Add enter key to this state */
1100 324 : addKeyToQueue(trgmNFA, &destKey);
1101 : }
1102 2194 : else if (arc->co >= 0)
1103 : {
1104 : /* Regular color (including WHITE) */
1105 1786 : TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
1106 :
1107 1786 : if (colorInfo->expandable)
1108 : {
1109 1786 : if (colorInfo->containsNonWord &&
1110 110 : !validArcLabel(key, COLOR_BLANK))
1111 : {
1112 : /*
1113 : * We can reach the arc destination after reading a
1114 : * non-word character, but the prefix is not something
1115 : * that addArc will accept with COLOR_BLANK, so no trigram
1116 : * arc can get made for this transition. We must make an
1117 : * enter key to show that the arc destination is
1118 : * reachable. Set it up with an all-blank prefix, since
1119 : * that corresponds to what the trigram extraction code
1120 : * will do at a word starting boundary.
1121 : */
1122 54 : destKey.prefix.colors[0] = COLOR_BLANK;
1123 54 : destKey.prefix.colors[1] = COLOR_BLANK;
1124 54 : destKey.nstate = arc->to;
1125 54 : addKeyToQueue(trgmNFA, &destKey);
1126 : }
1127 :
1128 1786 : if (colorInfo->wordCharsCount > 0 &&
1129 1676 : !validArcLabel(key, arc->co))
1130 : {
1131 : /*
1132 : * We can reach the arc destination after reading a word
1133 : * character, but the prefix is not something that addArc
1134 : * will accept, so no trigram arc can get made for this
1135 : * transition. We must make an enter key to show that the
1136 : * arc destination is reachable. The prefix for the enter
1137 : * key should reflect the info we have for this arc.
1138 : */
1139 262 : destKey.prefix.colors[0] = key->prefix.colors[1];
1140 262 : destKey.prefix.colors[1] = arc->co;
1141 262 : destKey.nstate = arc->to;
1142 262 : addKeyToQueue(trgmNFA, &destKey);
1143 : }
1144 : }
1145 : else
1146 : {
1147 : /*
1148 : * Unexpandable color. Add enter key with ambiguous prefix,
1149 : * showing we can reach the destination from this state, but
1150 : * the preceding colors will be uncertain. (We do not set the
1151 : * first prefix color to key->prefix.colors[1], because a
1152 : * prefix of known followed by unknown is invalid.)
1153 : */
1154 0 : destKey.prefix.colors[0] = COLOR_UNKNOWN;
1155 0 : destKey.prefix.colors[1] = COLOR_UNKNOWN;
1156 0 : destKey.nstate = arc->to;
1157 0 : addKeyToQueue(trgmNFA, &destKey);
1158 : }
1159 : }
1160 : else
1161 : {
1162 : /* RAINBOW: treat as unexpandable color */
1163 408 : destKey.prefix.colors[0] = COLOR_UNKNOWN;
1164 408 : destKey.prefix.colors[1] = COLOR_UNKNOWN;
1165 408 : destKey.nstate = arc->to;
1166 408 : addKeyToQueue(trgmNFA, &destKey);
1167 : }
1168 : }
1169 :
1170 1758 : pfree(arcs);
1171 : }
1172 :
1173 : /*
1174 : * Add copy of given key to keysQueue for later processing.
1175 : */
1176 : static void
1177 1540 : addKeyToQueue(TrgmNFA *trgmNFA, TrgmStateKey *key)
1178 : {
1179 1540 : TrgmStateKey *keyCopy = palloc_object(TrgmStateKey);
1180 :
1181 1540 : memcpy(keyCopy, key, sizeof(TrgmStateKey));
1182 1540 : trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy);
1183 1540 : }
1184 :
1185 : /*
1186 : * Add outgoing arcs from given state, whose enter keys are all now known.
1187 : */
1188 : static void
1189 1142 : addArcs(TrgmNFA *trgmNFA, TrgmState *state)
1190 : {
1191 : TrgmStateKey destKey;
1192 : ListCell *cell;
1193 : regex_arc_t *arcs;
1194 : int arcsCount,
1195 : i;
1196 :
1197 : /*
1198 : * Ensure any pad bytes in destKey are zero, since it may get used as a
1199 : * hashtable key by getState.
1200 : */
1201 1142 : MemSet(&destKey, 0, sizeof(destKey));
1202 :
1203 : /*
1204 : * Iterate over enter keys associated with this expanded-graph state. This
1205 : * includes both the state's own stateKey, and any enter keys we added to
1206 : * it during addKey (which represent expanded-graph states that are not
1207 : * distinguishable from this one by means of trigrams). For each such
1208 : * enter key, examine all the out-arcs of the key's underlying NFA state,
1209 : * and try to make a trigram arc leading to where the out-arc leads.
1210 : * (addArc will deal with whether the arc is valid or not.)
1211 : */
1212 2672 : foreach(cell, state->enterKeys)
1213 : {
1214 1530 : TrgmStateKey *key = (TrgmStateKey *) lfirst(cell);
1215 :
1216 1530 : arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate);
1217 1530 : arcs = palloc_array(regex_arc_t, arcsCount);
1218 1530 : pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount);
1219 :
1220 3892 : for (i = 0; i < arcsCount; i++)
1221 : {
1222 2362 : regex_arc_t *arc = &arcs[i];
1223 : TrgmColorInfo *colorInfo;
1224 :
1225 : /*
1226 : * Ignore non-expandable colors; addKey already handled the case.
1227 : *
1228 : * We need no special check for WHITE or begin/end pseudocolors
1229 : * here. We don't need to do any processing for them, and they
1230 : * will be marked non-expandable since the regex engine will have
1231 : * reported them that way. We do have to watch out for RAINBOW,
1232 : * which has a negative color number.
1233 : */
1234 2362 : if (arc->co < 0)
1235 204 : continue;
1236 : Assert(arc->co < trgmNFA->ncolors);
1237 :
1238 2158 : colorInfo = &trgmNFA->colorInfo[arc->co];
1239 2158 : if (!colorInfo->expandable)
1240 420 : continue;
1241 :
1242 1738 : if (colorInfo->containsNonWord)
1243 : {
1244 : /*
1245 : * Color includes non-word character(s).
1246 : *
1247 : * Generate an arc, treating this transition as occurring on
1248 : * BLANK. This allows word-ending trigrams to be manufactured
1249 : * if possible.
1250 : */
1251 110 : destKey.prefix.colors[0] = key->prefix.colors[1];
1252 110 : destKey.prefix.colors[1] = COLOR_BLANK;
1253 110 : destKey.nstate = arc->to;
1254 :
1255 110 : addArc(trgmNFA, state, key, COLOR_BLANK, &destKey);
1256 : }
1257 :
1258 1738 : if (colorInfo->wordCharsCount > 0)
1259 : {
1260 : /*
1261 : * Color includes word character(s).
1262 : *
1263 : * Generate an arc. Color is pushed into prefix of target
1264 : * state.
1265 : */
1266 1628 : destKey.prefix.colors[0] = key->prefix.colors[1];
1267 1628 : destKey.prefix.colors[1] = arc->co;
1268 1628 : destKey.nstate = arc->to;
1269 :
1270 1628 : addArc(trgmNFA, state, key, arc->co, &destKey);
1271 : }
1272 : }
1273 :
1274 1530 : pfree(arcs);
1275 : }
1276 1142 : }
1277 :
1278 : /*
1279 : * Generate an out-arc of the expanded graph, if it's valid and not redundant.
1280 : *
1281 : * state: expanded-graph state we want to add an out-arc to
1282 : * key: provides prefix colors (key->nstate is not used)
1283 : * co: transition color
1284 : * destKey: identifier for destination state of expanded graph
1285 : */
1286 : static void
1287 1738 : addArc(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key,
1288 : TrgmColor co, TrgmStateKey *destKey)
1289 : {
1290 : TrgmArc *arc;
1291 : ListCell *cell;
1292 :
1293 : /* Do nothing if this wouldn't be a valid arc label trigram */
1294 1738 : if (!validArcLabel(key, co))
1295 274 : return;
1296 :
1297 : /*
1298 : * Check if we are going to reach key which is covered by a key which is
1299 : * already listed in this state. If so arc is useless: the NFA can bypass
1300 : * it through a path that doesn't require any predictable trigram, so
1301 : * whether the arc's trigram is present or not doesn't really matter.
1302 : */
1303 3534 : foreach(cell, state->enterKeys)
1304 : {
1305 2082 : TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell);
1306 :
1307 2132 : if (existingKey->nstate == destKey->nstate &&
1308 50 : prefixContains(&existingKey->prefix, &destKey->prefix))
1309 12 : return;
1310 : }
1311 :
1312 : /* Checks were successful, add new arc */
1313 1452 : arc = palloc_object(TrgmArc);
1314 1452 : arc->target = getState(trgmNFA, destKey);
1315 1452 : arc->ctrgm.colors[0] = key->prefix.colors[0];
1316 1452 : arc->ctrgm.colors[1] = key->prefix.colors[1];
1317 1452 : arc->ctrgm.colors[2] = co;
1318 :
1319 1452 : state->arcs = lappend(state->arcs, arc);
1320 1452 : trgmNFA->arcsCount++;
1321 : }
1322 :
1323 : /*
1324 : * Can we make a valid trigram arc label from the given prefix and arc color?
1325 : *
1326 : * This is split out so that tests in addKey and addArc will stay in sync.
1327 : */
1328 : static bool
1329 3524 : validArcLabel(TrgmStateKey *key, TrgmColor co)
1330 : {
1331 : /*
1332 : * We have to know full trigram in order to add outgoing arc. So we can't
1333 : * do it if prefix is ambiguous.
1334 : */
1335 3524 : if (key->prefix.colors[0] == COLOR_UNKNOWN)
1336 466 : return false;
1337 :
1338 : /* If key->prefix.colors[0] isn't unknown, its second color isn't either */
1339 : Assert(key->prefix.colors[1] != COLOR_UNKNOWN);
1340 : /* And we should not be called with an unknown arc color anytime */
1341 : Assert(co != COLOR_UNKNOWN);
1342 :
1343 : /*
1344 : * We don't bother with making arcs representing three non-word
1345 : * characters, since that's useless for trigram extraction.
1346 : */
1347 3058 : if (key->prefix.colors[0] == COLOR_BLANK &&
1348 328 : key->prefix.colors[1] == COLOR_BLANK &&
1349 : co == COLOR_BLANK)
1350 24 : return false;
1351 :
1352 : /*
1353 : * We also reject nonblank-blank-anything. The nonblank-blank-nonblank
1354 : * case doesn't correspond to any trigram the trigram extraction code
1355 : * would make. The nonblank-blank-blank case is also not possible with
1356 : * RPADDING = 1. (Note that in many cases we'd fail to generate such a
1357 : * trigram even if it were valid, for example processing "foo bar" will
1358 : * not result in considering the trigram "o ". So if you want to support
1359 : * RPADDING = 2, there's more to do than just twiddle this test.)
1360 : */
1361 3034 : if (key->prefix.colors[0] != COLOR_BLANK &&
1362 2730 : key->prefix.colors[1] == COLOR_BLANK)
1363 100 : return false;
1364 :
1365 : /*
1366 : * Other combinations involving blank are valid, in particular we assume
1367 : * blank-blank-nonblank is valid, which presumes that LPADDING is 2.
1368 : *
1369 : * Note: Using again the example "foo bar", we will not consider the
1370 : * trigram " b", though this trigram would be found by the trigram
1371 : * extraction code. Since we will find " ba", it doesn't seem worth
1372 : * trying to hack the algorithm to generate the additional trigram.
1373 : */
1374 :
1375 : /* arc label is valid */
1376 2934 : return true;
1377 : }
1378 :
1379 : /*
1380 : * Get state of expanded graph for given state key,
1381 : * and queue the state for processing if it didn't already exist.
1382 : */
1383 : static TrgmState *
1384 1582 : getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
1385 : {
1386 : TrgmState *state;
1387 : bool found;
1388 :
1389 1582 : state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER,
1390 : &found);
1391 1582 : if (!found)
1392 : {
1393 : /* New state: initialize and queue it */
1394 1328 : state->arcs = NIL;
1395 1328 : state->enterKeys = NIL;
1396 1328 : state->flags = 0;
1397 : /* states are initially given negative numbers */
1398 1328 : state->snumber = -(++trgmNFA->nstates);
1399 1328 : state->parent = NULL;
1400 1328 : state->tentFlags = 0;
1401 1328 : state->tentParent = NULL;
1402 :
1403 1328 : trgmNFA->queue = lappend(trgmNFA->queue, state);
1404 : }
1405 1582 : return state;
1406 : }
1407 :
1408 : /*
1409 : * Check if prefix1 "contains" prefix2.
1410 : *
1411 : * "contains" means that any exact prefix (with no ambiguity) that satisfies
1412 : * prefix2 also satisfies prefix1.
1413 : */
1414 : static bool
1415 692 : prefixContains(TrgmPrefix *prefix1, TrgmPrefix *prefix2)
1416 : {
1417 692 : if (prefix1->colors[1] == COLOR_UNKNOWN)
1418 : {
1419 : /* Fully ambiguous prefix contains everything */
1420 612 : return true;
1421 : }
1422 80 : else if (prefix1->colors[0] == COLOR_UNKNOWN)
1423 : {
1424 : /*
1425 : * Prefix with only first unknown color contains every prefix with
1426 : * same second color.
1427 : */
1428 24 : if (prefix1->colors[1] == prefix2->colors[1])
1429 6 : return true;
1430 : else
1431 18 : return false;
1432 : }
1433 : else
1434 : {
1435 : /* Exact prefix contains only the exact same prefix */
1436 56 : if (prefix1->colors[0] == prefix2->colors[0] &&
1437 26 : prefix1->colors[1] == prefix2->colors[1])
1438 12 : return true;
1439 : else
1440 44 : return false;
1441 : }
1442 : }
1443 :
1444 :
1445 : /*---------------------
1446 : * Subroutines for expanding color trigrams into regular trigrams (stage 3).
1447 : *---------------------
1448 : */
1449 :
1450 : /*
1451 : * Get vector of all color trigrams in graph and select which of them
1452 : * to expand into simple trigrams.
1453 : *
1454 : * Returns true if OK, false if exhausted resource limits.
1455 : */
1456 : static bool
1457 112 : selectColorTrigrams(TrgmNFA *trgmNFA)
1458 : {
1459 : HASH_SEQ_STATUS scan_status;
1460 112 : int arcsCount = trgmNFA->arcsCount,
1461 : i;
1462 : TrgmState *state;
1463 : ColorTrgmInfo *colorTrgms;
1464 : int64 totalTrgmCount;
1465 : float4 totalTrgmPenalty;
1466 : int cnumber;
1467 :
1468 : /* Collect color trigrams from all arcs */
1469 112 : colorTrgms = palloc0_array(ColorTrgmInfo, arcsCount);
1470 112 : trgmNFA->colorTrgms = colorTrgms;
1471 :
1472 112 : i = 0;
1473 112 : hash_seq_init(&scan_status, trgmNFA->states);
1474 1422 : while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1475 : {
1476 : ListCell *cell;
1477 :
1478 2762 : foreach(cell, state->arcs)
1479 : {
1480 1452 : TrgmArc *arc = (TrgmArc *) lfirst(cell);
1481 1452 : TrgmArcInfo *arcInfo = palloc_object(TrgmArcInfo);
1482 1452 : ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1483 :
1484 1452 : arcInfo->source = state;
1485 1452 : arcInfo->target = arc->target;
1486 1452 : trgmInfo->ctrgm = arc->ctrgm;
1487 1452 : trgmInfo->cnumber = -1;
1488 : /* count and penalty will be set below */
1489 1452 : trgmInfo->expanded = true;
1490 1452 : trgmInfo->arcs = list_make1(arcInfo);
1491 1452 : i++;
1492 : }
1493 : }
1494 : Assert(i == arcsCount);
1495 :
1496 : /* Remove duplicates, merging their arcs lists */
1497 112 : if (arcsCount >= 2)
1498 : {
1499 : ColorTrgmInfo *p1,
1500 : *p2;
1501 :
1502 : /* Sort trigrams to ease duplicate detection */
1503 68 : qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp);
1504 :
1505 : /* p1 is probe point, p2 is last known non-duplicate. */
1506 68 : p2 = colorTrgms;
1507 1412 : for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++)
1508 : {
1509 1344 : if (colorTrgmInfoCmp(p1, p2) > 0)
1510 : {
1511 448 : p2++;
1512 448 : *p2 = *p1;
1513 : }
1514 : else
1515 : {
1516 896 : p2->arcs = list_concat(p2->arcs, p1->arcs);
1517 : }
1518 : }
1519 68 : trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1;
1520 : }
1521 : else
1522 : {
1523 44 : trgmNFA->colorTrgmsCount = arcsCount;
1524 : }
1525 :
1526 : /*
1527 : * Count number of simple trigrams generated by each color trigram, and
1528 : * also compute a penalty value, which is the number of simple trigrams
1529 : * times a multiplier that depends on its whitespace content.
1530 : *
1531 : * Note: per-color-trigram counts cannot overflow an int so long as
1532 : * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
1533 : * 1290. However, the grand total totalTrgmCount might conceivably
1534 : * overflow an int, so we use int64 for that within this routine. Also,
1535 : * penalties are calculated in float4 arithmetic to avoid any overflow
1536 : * worries.
1537 : */
1538 112 : totalTrgmCount = 0;
1539 112 : totalTrgmPenalty = 0.0f;
1540 668 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1541 : {
1542 556 : ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1543 : int j,
1544 556 : count = 1,
1545 556 : typeIndex = 0;
1546 :
1547 2224 : for (j = 0; j < 3; j++)
1548 : {
1549 1668 : TrgmColor c = trgmInfo->ctrgm.colors[j];
1550 :
1551 1668 : typeIndex *= 2;
1552 1668 : if (c == COLOR_BLANK)
1553 226 : typeIndex++;
1554 : else
1555 1442 : count *= trgmNFA->colorInfo[c].wordCharsCount;
1556 : }
1557 556 : trgmInfo->count = count;
1558 556 : totalTrgmCount += count;
1559 556 : trgmInfo->penalty = penalties[typeIndex] * (float4) count;
1560 556 : totalTrgmPenalty += trgmInfo->penalty;
1561 : }
1562 :
1563 : /* Sort color trigrams in descending order of their penalties */
1564 112 : qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1565 : colorTrgmInfoPenaltyCmp);
1566 :
1567 : /*
1568 : * Remove color trigrams from the graph so long as total penalty of color
1569 : * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
1570 : * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
1571 : * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
1572 : * penalty, since those are the most promising for reducing the total
1573 : * penalty. When removing a color trigram we have to merge states
1574 : * connected by arcs labeled with that trigram. It's necessary to not
1575 : * merge initial and final states, because our graph becomes useless if
1576 : * that happens; so we cannot always remove the trigram we'd prefer to.
1577 : */
1578 408 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1579 : {
1580 358 : ColorTrgmInfo *trgmInfo = &colorTrgms[i];
1581 358 : bool canRemove = true;
1582 : ListCell *cell;
1583 :
1584 : /* Done if we've reached the target */
1585 358 : if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
1586 62 : break;
1587 :
1588 : #ifdef TRGM_REGEXP_DEBUG
1589 : fprintf(stderr, "considering ctrgm %d %d %d, penalty %f, %d arcs\n",
1590 : trgmInfo->ctrgm.colors[0],
1591 : trgmInfo->ctrgm.colors[1],
1592 : trgmInfo->ctrgm.colors[2],
1593 : trgmInfo->penalty,
1594 : list_length(trgmInfo->arcs));
1595 : #endif
1596 :
1597 : /*
1598 : * Does any arc of this color trigram connect initial and final
1599 : * states? If so we can't remove it.
1600 : */
1601 612 : foreach(cell, trgmInfo->arcs)
1602 : {
1603 382 : TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1604 382 : TrgmState *source = arcInfo->source,
1605 382 : *target = arcInfo->target;
1606 : int source_flags,
1607 : target_flags;
1608 :
1609 : #ifdef TRGM_REGEXP_DEBUG
1610 : fprintf(stderr, "examining arc to s%d (%x) from s%d (%x)\n",
1611 : -target->snumber, target->flags,
1612 : -source->snumber, source->flags);
1613 : #endif
1614 :
1615 : /* examine parent states, if any merging has already happened */
1616 682 : while (source->parent)
1617 300 : source = source->parent;
1618 814 : while (target->parent)
1619 432 : target = target->parent;
1620 :
1621 : #ifdef TRGM_REGEXP_DEBUG
1622 : fprintf(stderr, " ... after completed merges: to s%d (%x) from s%d (%x)\n",
1623 : -target->snumber, target->flags,
1624 : -source->snumber, source->flags);
1625 : #endif
1626 :
1627 : /* we must also consider merges we are planning right now */
1628 382 : source_flags = source->flags | source->tentFlags;
1629 390 : while (source->tentParent)
1630 : {
1631 8 : source = source->tentParent;
1632 8 : source_flags |= source->flags | source->tentFlags;
1633 : }
1634 382 : target_flags = target->flags | target->tentFlags;
1635 412 : while (target->tentParent)
1636 : {
1637 30 : target = target->tentParent;
1638 30 : target_flags |= target->flags | target->tentFlags;
1639 : }
1640 :
1641 : #ifdef TRGM_REGEXP_DEBUG
1642 : fprintf(stderr, " ... after tentative merges: to s%d (%x) from s%d (%x)\n",
1643 : -target->snumber, target_flags,
1644 : -source->snumber, source_flags);
1645 : #endif
1646 :
1647 : /* would fully-merged state have both INIT and FIN set? */
1648 382 : if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
1649 : (TSTATE_INIT | TSTATE_FIN))
1650 : {
1651 66 : canRemove = false;
1652 66 : break;
1653 : }
1654 :
1655 : /* ok so far, so remember planned merge */
1656 316 : if (source != target)
1657 : {
1658 : #ifdef TRGM_REGEXP_DEBUG
1659 : fprintf(stderr, " ... tentatively merging s%d into s%d\n",
1660 : -target->snumber, -source->snumber);
1661 : #endif
1662 226 : target->tentParent = source;
1663 226 : source->tentFlags |= target_flags;
1664 : }
1665 : }
1666 :
1667 : /*
1668 : * We must reset all the tentFlags/tentParent fields before
1669 : * continuing. tentFlags could only have become set in states that
1670 : * are the source or parent or tentative parent of one of the current
1671 : * arcs; likewise tentParent could only have become set in states that
1672 : * are the target or parent or tentative parent of one of the current
1673 : * arcs. There might be some overlap between those sets, but if we
1674 : * clear tentFlags in target states as well as source states, we
1675 : * should be okay even if we visit a state as target before visiting
1676 : * it as a source.
1677 : */
1678 696 : foreach(cell, trgmInfo->arcs)
1679 : {
1680 400 : TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1681 400 : TrgmState *source = arcInfo->source,
1682 400 : *target = arcInfo->target;
1683 : TrgmState *ttarget;
1684 :
1685 : /* no need to touch previously-merged states */
1686 700 : while (source->parent)
1687 300 : source = source->parent;
1688 880 : while (target->parent)
1689 480 : target = target->parent;
1690 :
1691 812 : while (source)
1692 : {
1693 412 : source->tentFlags = 0;
1694 412 : source = source->tentParent;
1695 : }
1696 :
1697 626 : while ((ttarget = target->tentParent) != NULL)
1698 : {
1699 226 : target->tentParent = NULL;
1700 226 : target->tentFlags = 0; /* in case it was also a source */
1701 226 : target = ttarget;
1702 : }
1703 : }
1704 :
1705 : /* Now, move on if we can't drop this trigram */
1706 296 : if (!canRemove)
1707 : {
1708 : #ifdef TRGM_REGEXP_DEBUG
1709 : fprintf(stderr, " ... not ok to merge\n");
1710 : #endif
1711 66 : continue;
1712 : }
1713 :
1714 : /* OK, merge states linked by each arc labeled by the trigram */
1715 526 : foreach(cell, trgmInfo->arcs)
1716 : {
1717 296 : TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1718 296 : TrgmState *source = arcInfo->source,
1719 296 : *target = arcInfo->target;
1720 :
1721 584 : while (source->parent)
1722 288 : source = source->parent;
1723 692 : while (target->parent)
1724 396 : target = target->parent;
1725 296 : if (source != target)
1726 : {
1727 : #ifdef TRGM_REGEXP_DEBUG
1728 : fprintf(stderr, "merging s%d into s%d\n",
1729 : -target->snumber, -source->snumber);
1730 : #endif
1731 206 : mergeStates(source, target);
1732 : /* Assert we didn't merge initial and final states */
1733 : Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1734 : (TSTATE_INIT | TSTATE_FIN));
1735 : }
1736 : }
1737 :
1738 : /* Mark trigram unexpanded, and update totals */
1739 230 : trgmInfo->expanded = false;
1740 230 : totalTrgmCount -= trgmInfo->count;
1741 230 : totalTrgmPenalty -= trgmInfo->penalty;
1742 : }
1743 :
1744 : /* Did we succeed in fitting into MAX_TRGM_COUNT? */
1745 112 : if (totalTrgmCount > MAX_TRGM_COUNT)
1746 6 : return false;
1747 :
1748 106 : trgmNFA->totalTrgmCount = (int) totalTrgmCount;
1749 :
1750 : /*
1751 : * Sort color trigrams by colors (will be useful for bsearch in packGraph)
1752 : * and enumerate the color trigrams that are expanded.
1753 : */
1754 106 : cnumber = 0;
1755 106 : qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
1756 : colorTrgmInfoCmp);
1757 656 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1758 : {
1759 550 : if (colorTrgms[i].expanded)
1760 : {
1761 320 : colorTrgms[i].cnumber = cnumber;
1762 320 : cnumber++;
1763 : }
1764 : }
1765 :
1766 106 : return true;
1767 : }
1768 :
1769 : /*
1770 : * Expand selected color trigrams into regular trigrams.
1771 : *
1772 : * Returns the TRGM array to be passed to the index machinery.
1773 : * The array must be allocated in rcontext.
1774 : */
1775 : static TRGM *
1776 106 : expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext)
1777 : {
1778 : TRGM *trg;
1779 : trgm *p;
1780 : int i;
1781 : TrgmColorInfo blankColor;
1782 : trgm_mb_char blankChar;
1783 :
1784 : /* Set up "blank" color structure containing a single zero character */
1785 106 : memset(blankChar.bytes, 0, sizeof(blankChar.bytes));
1786 106 : blankColor.wordCharsCount = 1;
1787 106 : blankColor.wordChars = &blankChar;
1788 :
1789 : /* Construct the trgm array */
1790 : trg = (TRGM *)
1791 106 : MemoryContextAllocZero(rcontext,
1792 : TRGMHDRSIZE +
1793 106 : trgmNFA->totalTrgmCount * sizeof(trgm));
1794 106 : trg->flag = ARRKEY;
1795 106 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount));
1796 106 : p = GETARR(trg);
1797 656 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
1798 : {
1799 550 : ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i];
1800 : TrgmColorInfo *c[3];
1801 : trgm_mb_char s[3];
1802 : int j,
1803 : i1,
1804 : i2,
1805 : i3;
1806 :
1807 : /* Ignore any unexpanded trigrams ... */
1808 550 : if (!colorTrgm->expanded)
1809 230 : continue;
1810 :
1811 : /* Get colors, substituting the dummy struct for COLOR_BLANK */
1812 1280 : for (j = 0; j < 3; j++)
1813 : {
1814 960 : if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK)
1815 826 : c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]];
1816 : else
1817 134 : c[j] = &blankColor;
1818 : }
1819 :
1820 : /* Iterate over all possible combinations of colors' characters */
1821 754 : for (i1 = 0; i1 < c[0]->wordCharsCount; i1++)
1822 : {
1823 434 : s[0] = c[0]->wordChars[i1];
1824 1456 : for (i2 = 0; i2 < c[1]->wordCharsCount; i2++)
1825 : {
1826 1022 : s[1] = c[1]->wordChars[i2];
1827 3940 : for (i3 = 0; i3 < c[2]->wordCharsCount; i3++)
1828 : {
1829 2918 : s[2] = c[2]->wordChars[i3];
1830 2918 : fillTrgm(p, s);
1831 2918 : p++;
1832 : }
1833 : }
1834 : }
1835 : }
1836 :
1837 106 : return trg;
1838 : }
1839 :
1840 : /*
1841 : * Convert trigram into trgm datatype.
1842 : */
1843 : static void
1844 2918 : fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
1845 : {
1846 : char str[3 * MAX_MULTIBYTE_CHAR_LEN],
1847 : *p;
1848 : int i,
1849 : j;
1850 :
1851 : /* Write multibyte string into "str" (we don't need null termination) */
1852 2918 : p = str;
1853 :
1854 11672 : for (i = 0; i < 3; i++)
1855 : {
1856 8754 : if (s[i].bytes[0] != 0)
1857 : {
1858 16464 : for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++)
1859 8232 : *p++ = s[i].bytes[j];
1860 : }
1861 : else
1862 : {
1863 : /* Emit a space in place of COLOR_BLANK */
1864 522 : *p++ = ' ';
1865 : }
1866 : }
1867 :
1868 : /* Convert "str" to a standard trigram (possibly hashing it) */
1869 2918 : compact_trigram(ptrgm, str, p - str);
1870 2918 : }
1871 :
1872 : /*
1873 : * Merge two states of graph.
1874 : */
1875 : static void
1876 206 : mergeStates(TrgmState *state1, TrgmState *state2)
1877 : {
1878 : Assert(state1 != state2);
1879 : Assert(!state1->parent);
1880 : Assert(!state2->parent);
1881 :
1882 : /* state1 absorbs state2's flags */
1883 206 : state1->flags |= state2->flags;
1884 :
1885 : /* state2, and indirectly all its children, become children of state1 */
1886 206 : state2->parent = state1;
1887 206 : }
1888 :
1889 : /*
1890 : * Compare function for sorting of color trigrams by their colors.
1891 : */
1892 : static int
1893 10178 : colorTrgmInfoCmp(const void *p1, const void *p2)
1894 : {
1895 10178 : const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1;
1896 10178 : const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2;
1897 :
1898 10178 : return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm));
1899 : }
1900 :
1901 : /*
1902 : * Compare function for sorting color trigrams in descending order of
1903 : * their penalty fields.
1904 : */
1905 : static int
1906 846 : colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
1907 : {
1908 846 : float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
1909 846 : float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
1910 :
1911 846 : if (penalty1 < penalty2)
1912 254 : return 1;
1913 592 : else if (penalty1 == penalty2)
1914 324 : return 0;
1915 : else
1916 268 : return -1;
1917 : }
1918 :
1919 :
1920 : /*---------------------
1921 : * Subroutines for packing the graph into final representation (stage 4).
1922 : *---------------------
1923 : */
1924 :
1925 : /*
1926 : * Pack expanded graph into final representation.
1927 : *
1928 : * The result data must be allocated in rcontext.
1929 : */
1930 : static TrgmPackedGraph *
1931 106 : packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
1932 : {
1933 106 : int snumber = 2,
1934 : arcIndex,
1935 : arcsCount;
1936 : HASH_SEQ_STATUS scan_status;
1937 : TrgmState *state;
1938 : TrgmPackArcInfo *arcs;
1939 : TrgmPackedArc *packedArcs;
1940 : TrgmPackedGraph *result;
1941 : int i,
1942 : j;
1943 :
1944 : /* Enumerate surviving states, giving init and fin reserved numbers */
1945 106 : hash_seq_init(&scan_status, trgmNFA->states);
1946 1510 : while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1947 : {
1948 1942 : while (state->parent)
1949 644 : state = state->parent;
1950 :
1951 1298 : if (state->snumber < 0)
1952 : {
1953 1092 : if (state->flags & TSTATE_INIT)
1954 106 : state->snumber = 0;
1955 986 : else if (state->flags & TSTATE_FIN)
1956 114 : state->snumber = 1;
1957 : else
1958 : {
1959 872 : state->snumber = snumber;
1960 872 : snumber++;
1961 : }
1962 : }
1963 : }
1964 :
1965 : /* Collect array of all arcs */
1966 106 : arcs = palloc_array(TrgmPackArcInfo, trgmNFA->arcsCount);
1967 106 : arcIndex = 0;
1968 106 : hash_seq_init(&scan_status, trgmNFA->states);
1969 1404 : while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
1970 : {
1971 1298 : TrgmState *source = state;
1972 : ListCell *cell;
1973 :
1974 1942 : while (source->parent)
1975 644 : source = source->parent;
1976 :
1977 2744 : foreach(cell, state->arcs)
1978 : {
1979 1446 : TrgmArc *arc = (TrgmArc *) lfirst(cell);
1980 1446 : TrgmState *target = arc->target;
1981 :
1982 2704 : while (target->parent)
1983 1258 : target = target->parent;
1984 :
1985 1446 : if (source->snumber != target->snumber)
1986 : {
1987 : ColorTrgmInfo *ctrgm;
1988 :
1989 1132 : ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm,
1990 1132 : trgmNFA->colorTrgms,
1991 1132 : trgmNFA->colorTrgmsCount,
1992 : sizeof(ColorTrgmInfo),
1993 : colorTrgmInfoCmp);
1994 : Assert(ctrgm != NULL);
1995 : Assert(ctrgm->expanded);
1996 :
1997 1132 : arcs[arcIndex].sourceState = source->snumber;
1998 1132 : arcs[arcIndex].targetState = target->snumber;
1999 1132 : arcs[arcIndex].colorTrgm = ctrgm->cnumber;
2000 1132 : arcIndex++;
2001 : }
2002 : }
2003 : }
2004 :
2005 : /* Sort arcs to ease duplicate detection */
2006 106 : qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp);
2007 :
2008 : /* We could have duplicates because states were merged. Remove them. */
2009 106 : if (arcIndex > 1)
2010 : {
2011 : /* p1 is probe point, p2 is last known non-duplicate. */
2012 : TrgmPackArcInfo *p1,
2013 : *p2;
2014 :
2015 62 : p2 = arcs;
2016 1092 : for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++)
2017 : {
2018 1030 : if (packArcInfoCmp(p1, p2) > 0)
2019 : {
2020 1018 : p2++;
2021 1018 : *p2 = *p1;
2022 : }
2023 : }
2024 62 : arcsCount = (p2 - arcs) + 1;
2025 : }
2026 : else
2027 44 : arcsCount = arcIndex;
2028 :
2029 : /* Create packed representation */
2030 : result = (TrgmPackedGraph *)
2031 106 : MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph));
2032 :
2033 : /* Pack color trigrams information */
2034 106 : result->colorTrigramsCount = 0;
2035 656 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2036 : {
2037 550 : if (trgmNFA->colorTrgms[i].expanded)
2038 320 : result->colorTrigramsCount++;
2039 : }
2040 106 : result->colorTrigramGroups = (int *)
2041 106 : MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount);
2042 106 : j = 0;
2043 656 : for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
2044 : {
2045 550 : if (trgmNFA->colorTrgms[i].expanded)
2046 : {
2047 320 : result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count;
2048 320 : j++;
2049 : }
2050 : }
2051 :
2052 : /* Pack states and arcs information */
2053 106 : result->statesCount = snumber;
2054 106 : result->states = (TrgmPackedState *)
2055 106 : MemoryContextAlloc(rcontext, snumber * sizeof(TrgmPackedState));
2056 : packedArcs = (TrgmPackedArc *)
2057 106 : MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc));
2058 106 : j = 0;
2059 1190 : for (i = 0; i < snumber; i++)
2060 : {
2061 1084 : int cnt = 0;
2062 :
2063 1084 : result->states[i].arcs = &packedArcs[j];
2064 2204 : while (j < arcsCount && arcs[j].sourceState == i)
2065 : {
2066 1120 : packedArcs[j].targetState = arcs[j].targetState;
2067 1120 : packedArcs[j].colorTrgm = arcs[j].colorTrgm;
2068 1120 : cnt++;
2069 1120 : j++;
2070 : }
2071 1084 : result->states[i].arcsCount = cnt;
2072 : }
2073 :
2074 : /* Allocate working memory for trigramsMatchGraph() */
2075 106 : result->colorTrigramsActive = (bool *)
2076 106 : MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount);
2077 106 : result->statesActive = (bool *)
2078 106 : MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount);
2079 106 : result->statesQueue = (int *)
2080 106 : MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount);
2081 :
2082 106 : return result;
2083 : }
2084 :
2085 : /*
2086 : * Comparison function for sorting TrgmPackArcInfos.
2087 : *
2088 : * Compares arcs in following order: sourceState, colorTrgm, targetState.
2089 : */
2090 : static int
2091 9092 : packArcInfoCmp(const void *a1, const void *a2)
2092 : {
2093 9092 : const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1;
2094 9092 : const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2;
2095 :
2096 9092 : if (p1->sourceState < p2->sourceState)
2097 4358 : return -1;
2098 4734 : if (p1->sourceState > p2->sourceState)
2099 4100 : return 1;
2100 634 : if (p1->colorTrgm < p2->colorTrgm)
2101 396 : return -1;
2102 238 : if (p1->colorTrgm > p2->colorTrgm)
2103 214 : return 1;
2104 24 : if (p1->targetState < p2->targetState)
2105 0 : return -1;
2106 24 : if (p1->targetState > p2->targetState)
2107 0 : return 1;
2108 24 : return 0;
2109 : }
2110 :
2111 :
2112 : /*---------------------
2113 : * Debugging functions
2114 : *
2115 : * These are designed to emit GraphViz files.
2116 : *---------------------
2117 : */
2118 :
2119 : #ifdef TRGM_REGEXP_DEBUG
2120 :
2121 : /*
2122 : * Print initial NFA, in regexp library's representation
2123 : */
2124 : static void
2125 : printSourceNFA(regex_t *regex, TrgmColorInfo *colors, int ncolors)
2126 : {
2127 : StringInfoData buf;
2128 : int nstates = pg_reg_getnumstates(regex);
2129 : int state;
2130 : int i;
2131 :
2132 : initStringInfo(&buf);
2133 :
2134 : appendStringInfoString(&buf, "\ndigraph sourceNFA {\n");
2135 :
2136 : for (state = 0; state < nstates; state++)
2137 : {
2138 : regex_arc_t *arcs;
2139 : int i,
2140 : arcsCount;
2141 :
2142 : appendStringInfo(&buf, "s%d", state);
2143 : if (pg_reg_getfinalstate(regex) == state)
2144 : appendStringInfoString(&buf, " [shape = doublecircle]");
2145 : appendStringInfoString(&buf, ";\n");
2146 :
2147 : arcsCount = pg_reg_getnumoutarcs(regex, state);
2148 : arcs = palloc_array(regex_arc_t, arcsCount);
2149 : pg_reg_getoutarcs(regex, state, arcs, arcsCount);
2150 :
2151 : for (i = 0; i < arcsCount; i++)
2152 : {
2153 : appendStringInfo(&buf, " s%d -> s%d [label = \"%d\"];\n",
2154 : state, arcs[i].to, arcs[i].co);
2155 : }
2156 :
2157 : pfree(arcs);
2158 : }
2159 :
2160 : appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2161 : appendStringInfo(&buf, " initial -> s%d;\n",
2162 : pg_reg_getinitialstate(regex));
2163 :
2164 : /* Print colors */
2165 : appendStringInfoString(&buf, " { rank = sink;\n");
2166 : appendStringInfoString(&buf, " Colors [shape = none, margin=0, label=<\n");
2167 :
2168 : for (i = 0; i < ncolors; i++)
2169 : {
2170 : TrgmColorInfo *color = &colors[i];
2171 : int j;
2172 :
2173 : appendStringInfo(&buf, "<br/>Color %d: ", i);
2174 : if (color->expandable)
2175 : {
2176 : for (j = 0; j < color->wordCharsCount; j++)
2177 : {
2178 : char s[MAX_MULTIBYTE_CHAR_LEN + 1];
2179 :
2180 : memcpy(s, color->wordChars[j].bytes, MAX_MULTIBYTE_CHAR_LEN);
2181 : s[MAX_MULTIBYTE_CHAR_LEN] = '\0';
2182 : appendStringInfoString(&buf, s);
2183 : }
2184 : }
2185 : else
2186 : appendStringInfoString(&buf, "not expandable");
2187 : appendStringInfoChar(&buf, '\n');
2188 : }
2189 :
2190 : appendStringInfoString(&buf, " >];\n");
2191 : appendStringInfoString(&buf, " }\n");
2192 : appendStringInfoString(&buf, "}\n");
2193 :
2194 : {
2195 : /* dot -Tpng -o /tmp/source.png < /tmp/source.gv */
2196 : FILE *fp = fopen("/tmp/source.gv", "w");
2197 :
2198 : fprintf(fp, "%s", buf.data);
2199 : fclose(fp);
2200 : }
2201 :
2202 : pfree(buf.data);
2203 : }
2204 :
2205 : /*
2206 : * Print expanded graph.
2207 : */
2208 : static void
2209 : printTrgmNFA(TrgmNFA *trgmNFA)
2210 : {
2211 : StringInfoData buf;
2212 : HASH_SEQ_STATUS scan_status;
2213 : TrgmState *state;
2214 : TrgmState *initstate = NULL;
2215 :
2216 : initStringInfo(&buf);
2217 :
2218 : appendStringInfoString(&buf, "\ndigraph transformedNFA {\n");
2219 :
2220 : hash_seq_init(&scan_status, trgmNFA->states);
2221 : while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL)
2222 : {
2223 : ListCell *cell;
2224 :
2225 : appendStringInfo(&buf, "s%d", -state->snumber);
2226 : if (state->flags & TSTATE_FIN)
2227 : appendStringInfoString(&buf, " [shape = doublecircle]");
2228 : if (state->flags & TSTATE_INIT)
2229 : initstate = state;
2230 : appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
2231 : appendStringInfoString(&buf, ";\n");
2232 :
2233 : foreach(cell, state->arcs)
2234 : {
2235 : TrgmArc *arc = (TrgmArc *) lfirst(cell);
2236 :
2237 : appendStringInfo(&buf, " s%d -> s%d [label = \"",
2238 : -state->snumber, -arc->target->snumber);
2239 : printTrgmColor(&buf, arc->ctrgm.colors[0]);
2240 : appendStringInfoChar(&buf, ' ');
2241 : printTrgmColor(&buf, arc->ctrgm.colors[1]);
2242 : appendStringInfoChar(&buf, ' ');
2243 : printTrgmColor(&buf, arc->ctrgm.colors[2]);
2244 : appendStringInfoString(&buf, "\"];\n");
2245 : }
2246 : }
2247 :
2248 : if (initstate)
2249 : {
2250 : appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2251 : appendStringInfo(&buf, " initial -> s%d;\n", -initstate->snumber);
2252 : }
2253 :
2254 : appendStringInfoString(&buf, "}\n");
2255 :
2256 : {
2257 : /* dot -Tpng -o /tmp/transformed.png < /tmp/transformed.gv */
2258 : FILE *fp = fopen("/tmp/transformed.gv", "w");
2259 :
2260 : fprintf(fp, "%s", buf.data);
2261 : fclose(fp);
2262 : }
2263 :
2264 : pfree(buf.data);
2265 : }
2266 :
2267 : /*
2268 : * Print a TrgmColor readably.
2269 : */
2270 : static void
2271 : printTrgmColor(StringInfo buf, TrgmColor co)
2272 : {
2273 : if (co == COLOR_UNKNOWN)
2274 : appendStringInfoChar(buf, 'u');
2275 : else if (co == COLOR_BLANK)
2276 : appendStringInfoChar(buf, 'b');
2277 : else
2278 : appendStringInfo(buf, "%d", (int) co);
2279 : }
2280 :
2281 : /*
2282 : * Print final packed representation of trigram-based expanded graph.
2283 : */
2284 : static void
2285 : printTrgmPackedGraph(TrgmPackedGraph *packedGraph, TRGM *trigrams)
2286 : {
2287 : StringInfoData buf;
2288 : trgm *p;
2289 : int i;
2290 :
2291 : initStringInfo(&buf);
2292 :
2293 : appendStringInfoString(&buf, "\ndigraph packedGraph {\n");
2294 :
2295 : for (i = 0; i < packedGraph->statesCount; i++)
2296 : {
2297 : TrgmPackedState *state = &packedGraph->states[i];
2298 : int j;
2299 :
2300 : appendStringInfo(&buf, " s%d", i);
2301 : if (i == 1)
2302 : appendStringInfoString(&buf, " [shape = doublecircle]");
2303 :
2304 : appendStringInfo(&buf, " [label = <s%d>];\n", i);
2305 :
2306 : for (j = 0; j < state->arcsCount; j++)
2307 : {
2308 : TrgmPackedArc *arc = &state->arcs[j];
2309 :
2310 : appendStringInfo(&buf, " s%d -> s%d [label = \"trigram %d\"];\n",
2311 : i, arc->targetState, arc->colorTrgm);
2312 : }
2313 : }
2314 :
2315 : appendStringInfoString(&buf, " node [shape = point ]; initial;\n");
2316 : appendStringInfo(&buf, " initial -> s%d;\n", 0);
2317 :
2318 : /* Print trigrams */
2319 : appendStringInfoString(&buf, " { rank = sink;\n");
2320 : appendStringInfoString(&buf, " Trigrams [shape = none, margin=0, label=<\n");
2321 :
2322 : p = GETARR(trigrams);
2323 : for (i = 0; i < packedGraph->colorTrigramsCount; i++)
2324 : {
2325 : int count = packedGraph->colorTrigramGroups[i];
2326 : int j;
2327 :
2328 : appendStringInfo(&buf, "<br/>Trigram %d: ", i);
2329 :
2330 : for (j = 0; j < count; j++)
2331 : {
2332 : if (j > 0)
2333 : appendStringInfoString(&buf, ", ");
2334 :
2335 : /*
2336 : * XXX This representation is nice only for all-ASCII trigrams.
2337 : */
2338 : appendStringInfo(&buf, "\"%c%c%c\"", (*p)[0], (*p)[1], (*p)[2]);
2339 : p++;
2340 : }
2341 : }
2342 :
2343 : appendStringInfoString(&buf, " >];\n");
2344 : appendStringInfoString(&buf, " }\n");
2345 : appendStringInfoString(&buf, "}\n");
2346 :
2347 : {
2348 : /* dot -Tpng -o /tmp/packed.png < /tmp/packed.gv */
2349 : FILE *fp = fopen("/tmp/packed.gv", "w");
2350 :
2351 : fprintf(fp, "%s", buf.data);
2352 : fclose(fp);
2353 : }
2354 :
2355 : pfree(buf.data);
2356 : }
2357 :
2358 : #endif /* TRGM_REGEXP_DEBUG */
|