LCOV - code coverage report
Current view: top level - src/backend/regex - regprefix.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 86.2 % 80 69
Test Date: 2026-02-17 17:20:33 Functions: 100.0 % 2 2
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * regprefix.c
       4              :  *    Extract a common prefix, if any, from a compiled regex.
       5              :  *
       6              :  *
       7              :  * Portions Copyright (c) 2012-2026, PostgreSQL Global Development Group
       8              :  * Portions Copyright (c) 1998, 1999 Henry Spencer
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/regex/regprefix.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : 
      16              : #include "regex/regguts.h"
      17              : 
      18              : 
      19              : /*
      20              :  * forward declarations
      21              :  */
      22              : static int  findprefix(struct cnfa *cnfa, struct colormap *cm,
      23              :                        chr *string, size_t *slength);
      24              : 
      25              : 
      26              : /*
      27              :  * pg_regprefix - get common prefix for regular expression
      28              :  *
      29              :  * Returns one of:
      30              :  *  REG_NOMATCH: there is no common prefix of strings matching the regex
      31              :  *  REG_PREFIX: there is a common prefix of strings matching the regex
      32              :  *  REG_EXACT: all strings satisfying the regex must match the same string
      33              :  *  or a REG_XXX error code
      34              :  *
      35              :  * In the non-failure cases, *string is set to a palloc'd string containing
      36              :  * the common prefix or exact value, of length *slength (measured in chrs
      37              :  * not bytes!).
      38              :  *
      39              :  * This function does not analyze all complex cases (such as lookaround
      40              :  * constraints) exactly.  Therefore it is possible that some strings matching
      41              :  * the reported prefix or exact-match string do not satisfy the regex.  But
      42              :  * it should never be the case that a string satisfying the regex does not
      43              :  * match the reported prefix or exact-match string.
      44              :  */
      45              : int
      46         8421 : pg_regprefix(regex_t *re,
      47              :              chr **string,
      48              :              size_t *slength)
      49              : {
      50              :     struct guts *g;
      51              :     struct cnfa *cnfa;
      52              :     int         st;
      53              : 
      54              :     /* sanity checks */
      55         8421 :     if (string == NULL || slength == NULL)
      56            0 :         return REG_INVARG;
      57         8421 :     *string = NULL;             /* initialize for failure cases */
      58         8421 :     *slength = 0;
      59         8421 :     if (re == NULL || re->re_magic != REMAGIC)
      60            0 :         return REG_INVARG;
      61         8421 :     if (re->re_csize != sizeof(chr))
      62            0 :         return REG_MIXED;
      63              : 
      64              :     /* Initialize locale-dependent support */
      65         8421 :     pg_set_regex_collation(re->re_collation);
      66              : 
      67              :     /* setup */
      68         8421 :     g = (struct guts *) re->re_guts;
      69         8421 :     if (g->info & REG_UIMPOSSIBLE)
      70            1 :         return REG_NOMATCH;
      71              : 
      72              :     /*
      73              :      * This implementation considers only the search NFA for the topmost regex
      74              :      * tree node.  Therefore, constraints such as backrefs are not fully
      75              :      * applied, which is allowed per the function's API spec.
      76              :      */
      77              :     assert(g->tree != NULL);
      78         8420 :     cnfa = &g->tree->cnfa;
      79              : 
      80              :     /* matchall NFAs never have a fixed prefix */
      81         8420 :     if (cnfa->flags & MATCHALL)
      82            6 :         return REG_NOMATCH;
      83              : 
      84              :     /*
      85              :      * Since a correct NFA should never contain any exit-free loops, it should
      86              :      * not be possible for our traversal to return to a previously visited NFA
      87              :      * state.  Hence we need at most nstates chrs in the output string.
      88              :      */
      89         8414 :     *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
      90         8414 :     if (*string == NULL)
      91            0 :         return REG_ESPACE;
      92              : 
      93              :     /* do it */
      94         8414 :     st = findprefix(cnfa, &g->cmap, *string, slength);
      95              : 
      96              :     assert(*slength <= cnfa->nstates);
      97              : 
      98              :     /* clean up */
      99         8414 :     if (st != REG_PREFIX && st != REG_EXACT)
     100              :     {
     101          377 :         FREE(*string);
     102          377 :         *string = NULL;
     103          377 :         *slength = 0;
     104              :     }
     105              : 
     106         8414 :     return st;
     107              : }
     108              : 
     109              : /*
     110              :  * findprefix - extract common prefix from cNFA
     111              :  *
     112              :  * Results are returned into the preallocated chr array string[], with
     113              :  * *slength (which must be preset to zero) incremented for each chr.
     114              :  */
     115              : static int                      /* regprefix return code */
     116         8414 : findprefix(struct cnfa *cnfa,
     117              :            struct colormap *cm,
     118              :            chr *string,
     119              :            size_t *slength)
     120              : {
     121              :     int         st;
     122              :     int         nextst;
     123              :     color       thiscolor;
     124              :     chr         c;
     125              :     struct carc *ca;
     126              : 
     127              :     /*
     128              :      * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
     129              :      * anchored left.  If we have both BOS and BOL, they must go to the same
     130              :      * next state.
     131              :      */
     132         8414 :     st = cnfa->pre;
     133         8414 :     nextst = -1;
     134        16565 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     135              :     {
     136         8420 :         if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     137              :         {
     138         8157 :             if (nextst == -1)
     139         8151 :                 nextst = ca->to;
     140            6 :             else if (nextst != ca->to)
     141            6 :                 return REG_NOMATCH;
     142              :         }
     143              :         else
     144          263 :             return REG_NOMATCH;
     145              :     }
     146         8145 :     if (nextst == -1)
     147            0 :         return REG_NOMATCH;
     148              : 
     149              :     /*
     150              :      * Scan through successive states, stopping as soon as we find one with
     151              :      * more than one acceptable transition character (either multiple colors
     152              :      * on out-arcs, or a color with more than one member chr).
     153              :      *
     154              :      * We could find a state with multiple out-arcs that are all labeled with
     155              :      * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
     156              :      * In that case we add the chr "c" to the output string but then exit the
     157              :      * loop with nextst == -1.  This leaves a little bit on the table: if the
     158              :      * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
     159              :      * to the prefix.  But chasing multiple parallel state chains doesn't seem
     160              :      * worth the trouble.
     161              :      */
     162              :     do
     163              :     {
     164       101809 :         st = nextst;
     165       101809 :         nextst = -1;
     166       101809 :         thiscolor = COLORLESS;
     167       195567 :         for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     168              :         {
     169              :             /* We can ignore BOS/BOL arcs */
     170       101848 :             if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     171            0 :                 continue;
     172              : 
     173              :             /*
     174              :              * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
     175              :              * and LACONs
     176              :              */
     177       101848 :             if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
     178        95341 :                 ca->co == RAINBOW || ca->co >= cnfa->ncolors)
     179              :             {
     180         8063 :                 thiscolor = COLORLESS;
     181         8063 :                 break;
     182              :             }
     183        93785 :             if (thiscolor == COLORLESS)
     184              :             {
     185              :                 /* First plain outarc */
     186        93752 :                 thiscolor = ca->co;
     187        93752 :                 nextst = ca->to;
     188              :             }
     189           33 :             else if (thiscolor == ca->co)
     190              :             {
     191              :                 /* Another plain outarc for same color */
     192            6 :                 nextst = -1;
     193              :             }
     194              :             else
     195              :             {
     196              :                 /* More than one plain outarc color terminates the search */
     197           27 :                 thiscolor = COLORLESS;
     198           27 :                 break;
     199              :             }
     200              :         }
     201              :         /* Done if we didn't find exactly one color on plain outarcs */
     202       101809 :         if (thiscolor == COLORLESS)
     203         8090 :             break;
     204              :         /* The color must be a singleton */
     205        93719 :         if (cm->cd[thiscolor].nschrs != 1)
     206           49 :             break;
     207              :         /* Must not have any high-color-map entries */
     208        93670 :         if (cm->cd[thiscolor].nuchrs != 0)
     209            0 :             break;
     210              : 
     211              :         /*
     212              :          * Identify the color's sole member chr and add it to the prefix
     213              :          * string.  In general the colormap data structure doesn't provide a
     214              :          * way to find color member chrs, except by trying GETCOLOR() on each
     215              :          * possible chr value, which won't do at all.  However, for the cases
     216              :          * we care about it should be sufficient to test the "firstchr" value,
     217              :          * that is the first chr ever added to the color.  There are cases
     218              :          * where this might no longer be a member of the color (so we do need
     219              :          * to test), but none of them are likely to arise for a character that
     220              :          * is a member of a common prefix.  If we do hit such a corner case,
     221              :          * we just fall out without adding anything to the prefix string.
     222              :          */
     223        93670 :         c = cm->cd[thiscolor].firstchr;
     224        93670 :         if (GETCOLOR(cm, c) != thiscolor)
     225            0 :             break;
     226              : 
     227        93670 :         string[(*slength)++] = c;
     228              : 
     229              :         /* Advance to next state, but only if we have a unique next state */
     230        93670 :     } while (nextst != -1);
     231              : 
     232              :     /*
     233              :      * If we ended at a state that only has EOS/EOL outarcs leading to the
     234              :      * "post" state, then we have an exact-match string.  Note this is true
     235              :      * even if the string is of zero length.
     236              :      */
     237         8145 :     nextst = -1;
     238        14652 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     239              :     {
     240         8145 :         if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
     241              :         {
     242         6507 :             if (nextst == -1)
     243         6507 :                 nextst = ca->to;
     244            0 :             else if (nextst != ca->to)
     245              :             {
     246            0 :                 nextst = -1;
     247            0 :                 break;
     248              :             }
     249              :         }
     250              :         else
     251              :         {
     252         1638 :             nextst = -1;
     253         1638 :             break;
     254              :         }
     255              :     }
     256         8145 :     if (nextst == cnfa->post)
     257         6507 :         return REG_EXACT;
     258              : 
     259              :     /*
     260              :      * Otherwise, if we were unable to identify any prefix characters, say
     261              :      * NOMATCH --- the pattern is anchored left, but doesn't specify any
     262              :      * particular first character.
     263              :      */
     264         1638 :     if (*slength > 0)
     265         1530 :         return REG_PREFIX;
     266              : 
     267          108 :     return REG_NOMATCH;
     268              : }
        

Generated by: LCOV version 2.0-1