Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * regprefix.c 4 : * Extract a common prefix, if any, from a compiled regex. 5 : * 6 : * 7 : * Portions Copyright (c) 2012-2025, PostgreSQL Global Development Group 8 : * Portions Copyright (c) 1998, 1999 Henry Spencer 9 : * 10 : * IDENTIFICATION 11 : * src/backend/regex/regprefix.c 12 : * 13 : *------------------------------------------------------------------------- 14 : */ 15 : 16 : #include "regex/regguts.h" 17 : 18 : 19 : /* 20 : * forward declarations 21 : */ 22 : static int findprefix(struct cnfa *cnfa, struct colormap *cm, 23 : chr *string, size_t *slength); 24 : 25 : 26 : /* 27 : * pg_regprefix - get common prefix for regular expression 28 : * 29 : * Returns one of: 30 : * REG_NOMATCH: there is no common prefix of strings matching the regex 31 : * REG_PREFIX: there is a common prefix of strings matching the regex 32 : * REG_EXACT: all strings satisfying the regex must match the same string 33 : * or a REG_XXX error code 34 : * 35 : * In the non-failure cases, *string is set to a palloc'd string containing 36 : * the common prefix or exact value, of length *slength (measured in chrs 37 : * not bytes!). 38 : * 39 : * This function does not analyze all complex cases (such as lookaround 40 : * constraints) exactly. Therefore it is possible that some strings matching 41 : * the reported prefix or exact-match string do not satisfy the regex. But 42 : * it should never be the case that a string satisfying the regex does not 43 : * match the reported prefix or exact-match string. 44 : */ 45 : int 46 14830 : pg_regprefix(regex_t *re, 47 : chr **string, 48 : size_t *slength) 49 : { 50 : struct guts *g; 51 : struct cnfa *cnfa; 52 : int st; 53 : 54 : /* sanity checks */ 55 14830 : if (string == NULL || slength == NULL) 56 0 : return REG_INVARG; 57 14830 : *string = NULL; /* initialize for failure cases */ 58 14830 : *slength = 0; 59 14830 : if (re == NULL || re->re_magic != REMAGIC) 60 0 : return REG_INVARG; 61 14830 : if (re->re_csize != sizeof(chr)) 62 0 : return REG_MIXED; 63 : 64 : /* Initialize locale-dependent support */ 65 14830 : pg_set_regex_collation(re->re_collation); 66 : 67 : /* setup */ 68 14830 : g = (struct guts *) re->re_guts; 69 14830 : if (g->info & REG_UIMPOSSIBLE) 70 2 : return REG_NOMATCH; 71 : 72 : /* 73 : * This implementation considers only the search NFA for the topmost regex 74 : * tree node. Therefore, constraints such as backrefs are not fully 75 : * applied, which is allowed per the function's API spec. 76 : */ 77 : assert(g->tree != NULL); 78 14828 : cnfa = &g->tree->cnfa; 79 : 80 : /* matchall NFAs never have a fixed prefix */ 81 14828 : if (cnfa->flags & MATCHALL) 82 12 : return REG_NOMATCH; 83 : 84 : /* 85 : * Since a correct NFA should never contain any exit-free loops, it should 86 : * not be possible for our traversal to return to a previously visited NFA 87 : * state. Hence we need at most nstates chrs in the output string. 88 : */ 89 14816 : *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); 90 14816 : if (*string == NULL) 91 0 : return REG_ESPACE; 92 : 93 : /* do it */ 94 14816 : st = findprefix(cnfa, &g->cmap, *string, slength); 95 : 96 : assert(*slength <= cnfa->nstates); 97 : 98 : /* clean up */ 99 14816 : if (st != REG_PREFIX && st != REG_EXACT) 100 : { 101 712 : FREE(*string); 102 712 : *string = NULL; 103 712 : *slength = 0; 104 : } 105 : 106 14816 : return st; 107 : } 108 : 109 : /* 110 : * findprefix - extract common prefix from cNFA 111 : * 112 : * Results are returned into the preallocated chr array string[], with 113 : * *slength (which must be preset to zero) incremented for each chr. 114 : */ 115 : static int /* regprefix return code */ 116 14816 : findprefix(struct cnfa *cnfa, 117 : struct colormap *cm, 118 : chr *string, 119 : size_t *slength) 120 : { 121 : int st; 122 : int nextst; 123 : color thiscolor; 124 : chr c; 125 : struct carc *ca; 126 : 127 : /* 128 : * The "pre" state must have only BOS/BOL outarcs, else pattern isn't 129 : * anchored left. If we have both BOS and BOL, they must go to the same 130 : * next state. 131 : */ 132 14816 : st = cnfa->pre; 133 14816 : nextst = -1; 134 29112 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) 135 : { 136 14828 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) 137 : { 138 14308 : if (nextst == -1) 139 14296 : nextst = ca->to; 140 12 : else if (nextst != ca->to) 141 12 : return REG_NOMATCH; 142 : } 143 : else 144 520 : return REG_NOMATCH; 145 : } 146 14284 : if (nextst == -1) 147 0 : return REG_NOMATCH; 148 : 149 : /* 150 : * Scan through successive states, stopping as soon as we find one with 151 : * more than one acceptable transition character (either multiple colors 152 : * on out-arcs, or a color with more than one member chr). 153 : * 154 : * We could find a state with multiple out-arcs that are all labeled with 155 : * the same singleton color; this comes from patterns like "^ab(cde|cxy)". 156 : * In that case we add the chr "c" to the output string but then exit the 157 : * loop with nextst == -1. This leaves a little bit on the table: if the 158 : * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added 159 : * to the prefix. But chasing multiple parallel state chains doesn't seem 160 : * worth the trouble. 161 : */ 162 : do 163 : { 164 181892 : st = nextst; 165 181892 : nextst = -1; 166 181892 : thiscolor = COLORLESS; 167 349652 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) 168 : { 169 : /* We can ignore BOS/BOL arcs */ 170 181946 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) 171 0 : continue; 172 : 173 : /* 174 : * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs 175 : * and LACONs 176 : */ 177 181946 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] || 178 169740 : ca->co == RAINBOW || ca->co >= cnfa->ncolors) 179 : { 180 14156 : thiscolor = COLORLESS; 181 14156 : break; 182 : } 183 167790 : if (thiscolor == COLORLESS) 184 : { 185 : /* First plain outarc */ 186 167748 : thiscolor = ca->co; 187 167748 : nextst = ca->to; 188 : } 189 42 : else if (thiscolor == ca->co) 190 : { 191 : /* Another plain outarc for same color */ 192 12 : nextst = -1; 193 : } 194 : else 195 : { 196 : /* More than one plain outarc color terminates the search */ 197 30 : thiscolor = COLORLESS; 198 30 : break; 199 : } 200 : } 201 : /* Done if we didn't find exactly one color on plain outarcs */ 202 181892 : if (thiscolor == COLORLESS) 203 14186 : break; 204 : /* The color must be a singleton */ 205 167706 : if (cm->cd[thiscolor].nschrs != 1) 206 86 : break; 207 : /* Must not have any high-color-map entries */ 208 167620 : if (cm->cd[thiscolor].nuchrs != 0) 209 0 : break; 210 : 211 : /* 212 : * Identify the color's sole member chr and add it to the prefix 213 : * string. In general the colormap data structure doesn't provide a 214 : * way to find color member chrs, except by trying GETCOLOR() on each 215 : * possible chr value, which won't do at all. However, for the cases 216 : * we care about it should be sufficient to test the "firstchr" value, 217 : * that is the first chr ever added to the color. There are cases 218 : * where this might no longer be a member of the color (so we do need 219 : * to test), but none of them are likely to arise for a character that 220 : * is a member of a common prefix. If we do hit such a corner case, 221 : * we just fall out without adding anything to the prefix string. 222 : */ 223 167620 : c = cm->cd[thiscolor].firstchr; 224 167620 : if (GETCOLOR(cm, c) != thiscolor) 225 0 : break; 226 : 227 167620 : string[(*slength)++] = c; 228 : 229 : /* Advance to next state, but only if we have a unique next state */ 230 167620 : } while (nextst != -1); 231 : 232 : /* 233 : * If we ended at a state that only has EOS/EOL outarcs leading to the 234 : * "post" state, then we have an exact-match string. Note this is true 235 : * even if the string is of zero length. 236 : */ 237 14284 : nextst = -1; 238 26490 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) 239 : { 240 14284 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) 241 : { 242 12206 : if (nextst == -1) 243 12206 : nextst = ca->to; 244 0 : else if (nextst != ca->to) 245 : { 246 0 : nextst = -1; 247 0 : break; 248 : } 249 : } 250 : else 251 : { 252 2078 : nextst = -1; 253 2078 : break; 254 : } 255 : } 256 14284 : if (nextst == cnfa->post) 257 12206 : return REG_EXACT; 258 : 259 : /* 260 : * Otherwise, if we were unable to identify any prefix characters, say 261 : * NOMATCH --- the pattern is anchored left, but doesn't specify any 262 : * particular first character. 263 : */ 264 2078 : if (*slength > 0) 265 1898 : return REG_PREFIX; 266 : 267 180 : return REG_NOMATCH; 268 : }