Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regprefix.c
4 : * Extract a common prefix, if any, from a compiled regex.
5 : *
6 : *
7 : * Portions Copyright (c) 2012-2026, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1998, 1999 Henry Spencer
9 : *
10 : * IDENTIFICATION
11 : * src/backend/regex/regprefix.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "regex/regguts.h"
17 :
18 :
19 : /*
20 : * forward declarations
21 : */
22 : static int findprefix(struct cnfa *cnfa, struct colormap *cm,
23 : chr *string, size_t *slength);
24 :
25 :
26 : /*
27 : * pg_regprefix - get common prefix for regular expression
28 : *
29 : * Returns one of:
30 : * REG_NOMATCH: there is no common prefix of strings matching the regex
31 : * REG_PREFIX: there is a common prefix of strings matching the regex
32 : * REG_EXACT: all strings satisfying the regex must match the same string
33 : * or a REG_XXX error code
34 : *
35 : * In the non-failure cases, *string is set to a palloc'd string containing
36 : * the common prefix or exact value, of length *slength (measured in chrs
37 : * not bytes!).
38 : *
39 : * This function does not analyze all complex cases (such as lookaround
40 : * constraints) exactly. Therefore it is possible that some strings matching
41 : * the reported prefix or exact-match string do not satisfy the regex. But
42 : * it should never be the case that a string satisfying the regex does not
43 : * match the reported prefix or exact-match string.
44 : */
45 : int
46 8648 : pg_regprefix(regex_t *re,
47 : chr **string,
48 : size_t *slength)
49 : {
50 : struct guts *g;
51 : struct cnfa *cnfa;
52 : int st;
53 :
54 : /* sanity checks */
55 8648 : if (string == NULL || slength == NULL)
56 0 : return REG_INVARG;
57 8648 : *string = NULL; /* initialize for failure cases */
58 8648 : *slength = 0;
59 8648 : if (re == NULL || re->re_magic != REMAGIC)
60 0 : return REG_INVARG;
61 8648 : if (re->re_csize != sizeof(chr))
62 0 : return REG_MIXED;
63 :
64 : /* Initialize locale-dependent support */
65 8648 : pg_set_regex_collation(re->re_collation);
66 :
67 : /* setup */
68 8648 : g = (struct guts *) re->re_guts;
69 8648 : if (g->info & REG_UIMPOSSIBLE)
70 1 : return REG_NOMATCH;
71 :
72 : /*
73 : * This implementation considers only the search NFA for the topmost regex
74 : * tree node. Therefore, constraints such as backrefs are not fully
75 : * applied, which is allowed per the function's API spec.
76 : */
77 : assert(g->tree != NULL);
78 8647 : cnfa = &g->tree->cnfa;
79 :
80 : /* matchall NFAs never have a fixed prefix */
81 8647 : if (cnfa->flags & MATCHALL)
82 6 : return REG_NOMATCH;
83 :
84 : /*
85 : * Since a correct NFA should never contain any exit-free loops, it should
86 : * not be possible for our traversal to return to a previously visited NFA
87 : * state. Hence we need at most nstates chrs in the output string.
88 : */
89 8641 : *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
90 8641 : if (*string == NULL)
91 0 : return REG_ESPACE;
92 :
93 : /* do it */
94 8641 : st = findprefix(cnfa, &g->cmap, *string, slength);
95 :
96 : assert(*slength <= cnfa->nstates);
97 :
98 : /* clean up */
99 8641 : if (st != REG_PREFIX && st != REG_EXACT)
100 : {
101 382 : FREE(*string);
102 382 : *string = NULL;
103 382 : *slength = 0;
104 : }
105 :
106 8641 : return st;
107 : }
108 :
109 : /*
110 : * findprefix - extract common prefix from cNFA
111 : *
112 : * Results are returned into the preallocated chr array string[], with
113 : * *slength (which must be preset to zero) incremented for each chr.
114 : */
115 : static int /* regprefix return code */
116 8641 : findprefix(struct cnfa *cnfa,
117 : struct colormap *cm,
118 : chr *string,
119 : size_t *slength)
120 : {
121 : int st;
122 : int nextst;
123 : color thiscolor;
124 : chr c;
125 : struct carc *ca;
126 :
127 : /*
128 : * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
129 : * anchored left. If we have both BOS and BOL, they must go to the same
130 : * next state.
131 : */
132 8641 : st = cnfa->pre;
133 8641 : nextst = -1;
134 17020 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
135 : {
136 8647 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
137 : {
138 8385 : if (nextst == -1)
139 8379 : nextst = ca->to;
140 6 : else if (nextst != ca->to)
141 6 : return REG_NOMATCH;
142 : }
143 : else
144 262 : return REG_NOMATCH;
145 : }
146 8373 : if (nextst == -1)
147 0 : return REG_NOMATCH;
148 :
149 : /*
150 : * Scan through successive states, stopping as soon as we find one with
151 : * more than one acceptable transition character (either multiple colors
152 : * on out-arcs, or a color with more than one member chr).
153 : *
154 : * We could find a state with multiple out-arcs that are all labeled with
155 : * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
156 : * In that case we add the chr "c" to the output string but then exit the
157 : * loop with nextst == -1. This leaves a little bit on the table: if the
158 : * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
159 : * to the prefix. But chasing multiple parallel state chains doesn't seem
160 : * worth the trouble.
161 : */
162 : do
163 : {
164 103745 : st = nextst;
165 103745 : nextst = -1;
166 103745 : thiscolor = COLORLESS;
167 199217 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
168 : {
169 : /* We can ignore BOS/BOL arcs */
170 103784 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
171 0 : continue;
172 :
173 : /*
174 : * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
175 : * and LACONs
176 : */
177 103784 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
178 97151 : ca->co == RAINBOW || ca->co >= cnfa->ncolors)
179 : {
180 8285 : thiscolor = COLORLESS;
181 8285 : break;
182 : }
183 95499 : if (thiscolor == COLORLESS)
184 : {
185 : /* First plain outarc */
186 95466 : thiscolor = ca->co;
187 95466 : nextst = ca->to;
188 : }
189 33 : else if (thiscolor == ca->co)
190 : {
191 : /* Another plain outarc for same color */
192 6 : nextst = -1;
193 : }
194 : else
195 : {
196 : /* More than one plain outarc color terminates the search */
197 27 : thiscolor = COLORLESS;
198 27 : break;
199 : }
200 : }
201 : /* Done if we didn't find exactly one color on plain outarcs */
202 103745 : if (thiscolor == COLORLESS)
203 8312 : break;
204 : /* The color must be a singleton */
205 95433 : if (cm->cd[thiscolor].nschrs != 1)
206 55 : break;
207 : /* Must not have any high-color-map entries */
208 95378 : if (cm->cd[thiscolor].nuchrs != 0)
209 0 : break;
210 :
211 : /*
212 : * Identify the color's sole member chr and add it to the prefix
213 : * string. In general the colormap data structure doesn't provide a
214 : * way to find color member chrs, except by trying GETCOLOR() on each
215 : * possible chr value, which won't do at all. However, for the cases
216 : * we care about it should be sufficient to test the "firstchr" value,
217 : * that is the first chr ever added to the color. There are cases
218 : * where this might no longer be a member of the color (so we do need
219 : * to test), but none of them are likely to arise for a character that
220 : * is a member of a common prefix. If we do hit such a corner case,
221 : * we just fall out without adding anything to the prefix string.
222 : */
223 95378 : c = cm->cd[thiscolor].firstchr;
224 95378 : if (GETCOLOR(cm, c) != thiscolor)
225 0 : break;
226 :
227 95378 : string[(*slength)++] = c;
228 :
229 : /* Advance to next state, but only if we have a unique next state */
230 95378 : } while (nextst != -1);
231 :
232 : /*
233 : * If we ended at a state that only has EOS/EOL outarcs leading to the
234 : * "post" state, then we have an exact-match string. Note this is true
235 : * even if the string is of zero length.
236 : */
237 8373 : nextst = -1;
238 15006 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
239 : {
240 8373 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
241 : {
242 6633 : if (nextst == -1)
243 6633 : nextst = ca->to;
244 0 : else if (nextst != ca->to)
245 : {
246 0 : nextst = -1;
247 0 : break;
248 : }
249 : }
250 : else
251 : {
252 1740 : nextst = -1;
253 1740 : break;
254 : }
255 : }
256 8373 : if (nextst == cnfa->post)
257 6633 : return REG_EXACT;
258 :
259 : /*
260 : * Otherwise, if we were unable to identify any prefix characters, say
261 : * NOMATCH --- the pattern is anchored left, but doesn't specify any
262 : * particular first character.
263 : */
264 1740 : if (*slength > 0)
265 1626 : return REG_PREFIX;
266 :
267 114 : return REG_NOMATCH;
268 : }
|