Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regprefix.c
4 : * Extract a common prefix, if any, from a compiled regex.
5 : *
6 : *
7 : * Portions Copyright (c) 2012-2026, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1998, 1999 Henry Spencer
9 : *
10 : * IDENTIFICATION
11 : * src/backend/regex/regprefix.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "regex/regguts.h"
17 :
18 :
19 : /*
20 : * forward declarations
21 : */
22 : static int findprefix(struct cnfa *cnfa, struct colormap *cm,
23 : chr *string, size_t *slength);
24 :
25 :
26 : /*
27 : * pg_regprefix - get common prefix for regular expression
28 : *
29 : * Returns one of:
30 : * REG_NOMATCH: there is no common prefix of strings matching the regex
31 : * REG_PREFIX: there is a common prefix of strings matching the regex
32 : * REG_EXACT: all strings satisfying the regex must match the same string
33 : * or a REG_XXX error code
34 : *
35 : * In the non-failure cases, *string is set to a palloc'd string containing
36 : * the common prefix or exact value, of length *slength (measured in chrs
37 : * not bytes!).
38 : *
39 : * This function does not analyze all complex cases (such as lookaround
40 : * constraints) exactly. Therefore it is possible that some strings matching
41 : * the reported prefix or exact-match string do not satisfy the regex. But
42 : * it should never be the case that a string satisfying the regex does not
43 : * match the reported prefix or exact-match string.
44 : */
45 : int
46 8421 : pg_regprefix(regex_t *re,
47 : chr **string,
48 : size_t *slength)
49 : {
50 : struct guts *g;
51 : struct cnfa *cnfa;
52 : int st;
53 :
54 : /* sanity checks */
55 8421 : if (string == NULL || slength == NULL)
56 0 : return REG_INVARG;
57 8421 : *string = NULL; /* initialize for failure cases */
58 8421 : *slength = 0;
59 8421 : if (re == NULL || re->re_magic != REMAGIC)
60 0 : return REG_INVARG;
61 8421 : if (re->re_csize != sizeof(chr))
62 0 : return REG_MIXED;
63 :
64 : /* Initialize locale-dependent support */
65 8421 : pg_set_regex_collation(re->re_collation);
66 :
67 : /* setup */
68 8421 : g = (struct guts *) re->re_guts;
69 8421 : if (g->info & REG_UIMPOSSIBLE)
70 1 : return REG_NOMATCH;
71 :
72 : /*
73 : * This implementation considers only the search NFA for the topmost regex
74 : * tree node. Therefore, constraints such as backrefs are not fully
75 : * applied, which is allowed per the function's API spec.
76 : */
77 : assert(g->tree != NULL);
78 8420 : cnfa = &g->tree->cnfa;
79 :
80 : /* matchall NFAs never have a fixed prefix */
81 8420 : if (cnfa->flags & MATCHALL)
82 6 : return REG_NOMATCH;
83 :
84 : /*
85 : * Since a correct NFA should never contain any exit-free loops, it should
86 : * not be possible for our traversal to return to a previously visited NFA
87 : * state. Hence we need at most nstates chrs in the output string.
88 : */
89 8414 : *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
90 8414 : if (*string == NULL)
91 0 : return REG_ESPACE;
92 :
93 : /* do it */
94 8414 : st = findprefix(cnfa, &g->cmap, *string, slength);
95 :
96 : assert(*slength <= cnfa->nstates);
97 :
98 : /* clean up */
99 8414 : if (st != REG_PREFIX && st != REG_EXACT)
100 : {
101 377 : FREE(*string);
102 377 : *string = NULL;
103 377 : *slength = 0;
104 : }
105 :
106 8414 : return st;
107 : }
108 :
109 : /*
110 : * findprefix - extract common prefix from cNFA
111 : *
112 : * Results are returned into the preallocated chr array string[], with
113 : * *slength (which must be preset to zero) incremented for each chr.
114 : */
115 : static int /* regprefix return code */
116 8414 : findprefix(struct cnfa *cnfa,
117 : struct colormap *cm,
118 : chr *string,
119 : size_t *slength)
120 : {
121 : int st;
122 : int nextst;
123 : color thiscolor;
124 : chr c;
125 : struct carc *ca;
126 :
127 : /*
128 : * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
129 : * anchored left. If we have both BOS and BOL, they must go to the same
130 : * next state.
131 : */
132 8414 : st = cnfa->pre;
133 8414 : nextst = -1;
134 16565 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
135 : {
136 8420 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
137 : {
138 8157 : if (nextst == -1)
139 8151 : nextst = ca->to;
140 6 : else if (nextst != ca->to)
141 6 : return REG_NOMATCH;
142 : }
143 : else
144 263 : return REG_NOMATCH;
145 : }
146 8145 : if (nextst == -1)
147 0 : return REG_NOMATCH;
148 :
149 : /*
150 : * Scan through successive states, stopping as soon as we find one with
151 : * more than one acceptable transition character (either multiple colors
152 : * on out-arcs, or a color with more than one member chr).
153 : *
154 : * We could find a state with multiple out-arcs that are all labeled with
155 : * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
156 : * In that case we add the chr "c" to the output string but then exit the
157 : * loop with nextst == -1. This leaves a little bit on the table: if the
158 : * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
159 : * to the prefix. But chasing multiple parallel state chains doesn't seem
160 : * worth the trouble.
161 : */
162 : do
163 : {
164 101809 : st = nextst;
165 101809 : nextst = -1;
166 101809 : thiscolor = COLORLESS;
167 195567 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
168 : {
169 : /* We can ignore BOS/BOL arcs */
170 101848 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
171 0 : continue;
172 :
173 : /*
174 : * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
175 : * and LACONs
176 : */
177 101848 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
178 95341 : ca->co == RAINBOW || ca->co >= cnfa->ncolors)
179 : {
180 8063 : thiscolor = COLORLESS;
181 8063 : break;
182 : }
183 93785 : if (thiscolor == COLORLESS)
184 : {
185 : /* First plain outarc */
186 93752 : thiscolor = ca->co;
187 93752 : nextst = ca->to;
188 : }
189 33 : else if (thiscolor == ca->co)
190 : {
191 : /* Another plain outarc for same color */
192 6 : nextst = -1;
193 : }
194 : else
195 : {
196 : /* More than one plain outarc color terminates the search */
197 27 : thiscolor = COLORLESS;
198 27 : break;
199 : }
200 : }
201 : /* Done if we didn't find exactly one color on plain outarcs */
202 101809 : if (thiscolor == COLORLESS)
203 8090 : break;
204 : /* The color must be a singleton */
205 93719 : if (cm->cd[thiscolor].nschrs != 1)
206 49 : break;
207 : /* Must not have any high-color-map entries */
208 93670 : if (cm->cd[thiscolor].nuchrs != 0)
209 0 : break;
210 :
211 : /*
212 : * Identify the color's sole member chr and add it to the prefix
213 : * string. In general the colormap data structure doesn't provide a
214 : * way to find color member chrs, except by trying GETCOLOR() on each
215 : * possible chr value, which won't do at all. However, for the cases
216 : * we care about it should be sufficient to test the "firstchr" value,
217 : * that is the first chr ever added to the color. There are cases
218 : * where this might no longer be a member of the color (so we do need
219 : * to test), but none of them are likely to arise for a character that
220 : * is a member of a common prefix. If we do hit such a corner case,
221 : * we just fall out without adding anything to the prefix string.
222 : */
223 93670 : c = cm->cd[thiscolor].firstchr;
224 93670 : if (GETCOLOR(cm, c) != thiscolor)
225 0 : break;
226 :
227 93670 : string[(*slength)++] = c;
228 :
229 : /* Advance to next state, but only if we have a unique next state */
230 93670 : } while (nextst != -1);
231 :
232 : /*
233 : * If we ended at a state that only has EOS/EOL outarcs leading to the
234 : * "post" state, then we have an exact-match string. Note this is true
235 : * even if the string is of zero length.
236 : */
237 8145 : nextst = -1;
238 14652 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
239 : {
240 8145 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
241 : {
242 6507 : if (nextst == -1)
243 6507 : nextst = ca->to;
244 0 : else if (nextst != ca->to)
245 : {
246 0 : nextst = -1;
247 0 : break;
248 : }
249 : }
250 : else
251 : {
252 1638 : nextst = -1;
253 1638 : break;
254 : }
255 : }
256 8145 : if (nextst == cnfa->post)
257 6507 : return REG_EXACT;
258 :
259 : /*
260 : * Otherwise, if we were unable to identify any prefix characters, say
261 : * NOMATCH --- the pattern is anchored left, but doesn't specify any
262 : * particular first character.
263 : */
264 1638 : if (*slength > 0)
265 1530 : return REG_PREFIX;
266 :
267 108 : return REG_NOMATCH;
268 : }
|