LCOV - code coverage report
Current view: top level - src/backend/regex - regprefix.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 69 80 86.2 %
Date: 2025-01-18 03:14:54 Functions: 2 2 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regprefix.c
       4             :  *    Extract a common prefix, if any, from a compiled regex.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 2012-2025, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1998, 1999 Henry Spencer
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/regex/regprefix.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "regex/regguts.h"
      17             : 
      18             : 
      19             : /*
      20             :  * forward declarations
      21             :  */
      22             : static int  findprefix(struct cnfa *cnfa, struct colormap *cm,
      23             :                        chr *string, size_t *slength);
      24             : 
      25             : 
      26             : /*
      27             :  * pg_regprefix - get common prefix for regular expression
      28             :  *
      29             :  * Returns one of:
      30             :  *  REG_NOMATCH: there is no common prefix of strings matching the regex
      31             :  *  REG_PREFIX: there is a common prefix of strings matching the regex
      32             :  *  REG_EXACT: all strings satisfying the regex must match the same string
      33             :  *  or a REG_XXX error code
      34             :  *
      35             :  * In the non-failure cases, *string is set to a palloc'd string containing
      36             :  * the common prefix or exact value, of length *slength (measured in chrs
      37             :  * not bytes!).
      38             :  *
      39             :  * This function does not analyze all complex cases (such as lookaround
      40             :  * constraints) exactly.  Therefore it is possible that some strings matching
      41             :  * the reported prefix or exact-match string do not satisfy the regex.  But
      42             :  * it should never be the case that a string satisfying the regex does not
      43             :  * match the reported prefix or exact-match string.
      44             :  */
      45             : int
      46       14830 : pg_regprefix(regex_t *re,
      47             :              chr **string,
      48             :              size_t *slength)
      49             : {
      50             :     struct guts *g;
      51             :     struct cnfa *cnfa;
      52             :     int         st;
      53             : 
      54             :     /* sanity checks */
      55       14830 :     if (string == NULL || slength == NULL)
      56           0 :         return REG_INVARG;
      57       14830 :     *string = NULL;             /* initialize for failure cases */
      58       14830 :     *slength = 0;
      59       14830 :     if (re == NULL || re->re_magic != REMAGIC)
      60           0 :         return REG_INVARG;
      61       14830 :     if (re->re_csize != sizeof(chr))
      62           0 :         return REG_MIXED;
      63             : 
      64             :     /* Initialize locale-dependent support */
      65       14830 :     pg_set_regex_collation(re->re_collation);
      66             : 
      67             :     /* setup */
      68       14830 :     g = (struct guts *) re->re_guts;
      69       14830 :     if (g->info & REG_UIMPOSSIBLE)
      70           2 :         return REG_NOMATCH;
      71             : 
      72             :     /*
      73             :      * This implementation considers only the search NFA for the topmost regex
      74             :      * tree node.  Therefore, constraints such as backrefs are not fully
      75             :      * applied, which is allowed per the function's API spec.
      76             :      */
      77             :     assert(g->tree != NULL);
      78       14828 :     cnfa = &g->tree->cnfa;
      79             : 
      80             :     /* matchall NFAs never have a fixed prefix */
      81       14828 :     if (cnfa->flags & MATCHALL)
      82          12 :         return REG_NOMATCH;
      83             : 
      84             :     /*
      85             :      * Since a correct NFA should never contain any exit-free loops, it should
      86             :      * not be possible for our traversal to return to a previously visited NFA
      87             :      * state.  Hence we need at most nstates chrs in the output string.
      88             :      */
      89       14816 :     *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
      90       14816 :     if (*string == NULL)
      91           0 :         return REG_ESPACE;
      92             : 
      93             :     /* do it */
      94       14816 :     st = findprefix(cnfa, &g->cmap, *string, slength);
      95             : 
      96             :     assert(*slength <= cnfa->nstates);
      97             : 
      98             :     /* clean up */
      99       14816 :     if (st != REG_PREFIX && st != REG_EXACT)
     100             :     {
     101         712 :         FREE(*string);
     102         712 :         *string = NULL;
     103         712 :         *slength = 0;
     104             :     }
     105             : 
     106       14816 :     return st;
     107             : }
     108             : 
     109             : /*
     110             :  * findprefix - extract common prefix from cNFA
     111             :  *
     112             :  * Results are returned into the preallocated chr array string[], with
     113             :  * *slength (which must be preset to zero) incremented for each chr.
     114             :  */
     115             : static int                      /* regprefix return code */
     116       14816 : findprefix(struct cnfa *cnfa,
     117             :            struct colormap *cm,
     118             :            chr *string,
     119             :            size_t *slength)
     120             : {
     121             :     int         st;
     122             :     int         nextst;
     123             :     color       thiscolor;
     124             :     chr         c;
     125             :     struct carc *ca;
     126             : 
     127             :     /*
     128             :      * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
     129             :      * anchored left.  If we have both BOS and BOL, they must go to the same
     130             :      * next state.
     131             :      */
     132       14816 :     st = cnfa->pre;
     133       14816 :     nextst = -1;
     134       29112 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     135             :     {
     136       14828 :         if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     137             :         {
     138       14308 :             if (nextst == -1)
     139       14296 :                 nextst = ca->to;
     140          12 :             else if (nextst != ca->to)
     141          12 :                 return REG_NOMATCH;
     142             :         }
     143             :         else
     144         520 :             return REG_NOMATCH;
     145             :     }
     146       14284 :     if (nextst == -1)
     147           0 :         return REG_NOMATCH;
     148             : 
     149             :     /*
     150             :      * Scan through successive states, stopping as soon as we find one with
     151             :      * more than one acceptable transition character (either multiple colors
     152             :      * on out-arcs, or a color with more than one member chr).
     153             :      *
     154             :      * We could find a state with multiple out-arcs that are all labeled with
     155             :      * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
     156             :      * In that case we add the chr "c" to the output string but then exit the
     157             :      * loop with nextst == -1.  This leaves a little bit on the table: if the
     158             :      * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
     159             :      * to the prefix.  But chasing multiple parallel state chains doesn't seem
     160             :      * worth the trouble.
     161             :      */
     162             :     do
     163             :     {
     164      181892 :         st = nextst;
     165      181892 :         nextst = -1;
     166      181892 :         thiscolor = COLORLESS;
     167      349652 :         for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     168             :         {
     169             :             /* We can ignore BOS/BOL arcs */
     170      181946 :             if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     171           0 :                 continue;
     172             : 
     173             :             /*
     174             :              * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
     175             :              * and LACONs
     176             :              */
     177      181946 :             if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
     178      169740 :                 ca->co == RAINBOW || ca->co >= cnfa->ncolors)
     179             :             {
     180       14156 :                 thiscolor = COLORLESS;
     181       14156 :                 break;
     182             :             }
     183      167790 :             if (thiscolor == COLORLESS)
     184             :             {
     185             :                 /* First plain outarc */
     186      167748 :                 thiscolor = ca->co;
     187      167748 :                 nextst = ca->to;
     188             :             }
     189          42 :             else if (thiscolor == ca->co)
     190             :             {
     191             :                 /* Another plain outarc for same color */
     192          12 :                 nextst = -1;
     193             :             }
     194             :             else
     195             :             {
     196             :                 /* More than one plain outarc color terminates the search */
     197          30 :                 thiscolor = COLORLESS;
     198          30 :                 break;
     199             :             }
     200             :         }
     201             :         /* Done if we didn't find exactly one color on plain outarcs */
     202      181892 :         if (thiscolor == COLORLESS)
     203       14186 :             break;
     204             :         /* The color must be a singleton */
     205      167706 :         if (cm->cd[thiscolor].nschrs != 1)
     206          86 :             break;
     207             :         /* Must not have any high-color-map entries */
     208      167620 :         if (cm->cd[thiscolor].nuchrs != 0)
     209           0 :             break;
     210             : 
     211             :         /*
     212             :          * Identify the color's sole member chr and add it to the prefix
     213             :          * string.  In general the colormap data structure doesn't provide a
     214             :          * way to find color member chrs, except by trying GETCOLOR() on each
     215             :          * possible chr value, which won't do at all.  However, for the cases
     216             :          * we care about it should be sufficient to test the "firstchr" value,
     217             :          * that is the first chr ever added to the color.  There are cases
     218             :          * where this might no longer be a member of the color (so we do need
     219             :          * to test), but none of them are likely to arise for a character that
     220             :          * is a member of a common prefix.  If we do hit such a corner case,
     221             :          * we just fall out without adding anything to the prefix string.
     222             :          */
     223      167620 :         c = cm->cd[thiscolor].firstchr;
     224      167620 :         if (GETCOLOR(cm, c) != thiscolor)
     225           0 :             break;
     226             : 
     227      167620 :         string[(*slength)++] = c;
     228             : 
     229             :         /* Advance to next state, but only if we have a unique next state */
     230      167620 :     } while (nextst != -1);
     231             : 
     232             :     /*
     233             :      * If we ended at a state that only has EOS/EOL outarcs leading to the
     234             :      * "post" state, then we have an exact-match string.  Note this is true
     235             :      * even if the string is of zero length.
     236             :      */
     237       14284 :     nextst = -1;
     238       26490 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     239             :     {
     240       14284 :         if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
     241             :         {
     242       12206 :             if (nextst == -1)
     243       12206 :                 nextst = ca->to;
     244           0 :             else if (nextst != ca->to)
     245             :             {
     246           0 :                 nextst = -1;
     247           0 :                 break;
     248             :             }
     249             :         }
     250             :         else
     251             :         {
     252        2078 :             nextst = -1;
     253        2078 :             break;
     254             :         }
     255             :     }
     256       14284 :     if (nextst == cnfa->post)
     257       12206 :         return REG_EXACT;
     258             : 
     259             :     /*
     260             :      * Otherwise, if we were unable to identify any prefix characters, say
     261             :      * NOMATCH --- the pattern is anchored left, but doesn't specify any
     262             :      * particular first character.
     263             :      */
     264        2078 :     if (*slength > 0)
     265        1898 :         return REG_PREFIX;
     266             : 
     267         180 :     return REG_NOMATCH;
     268             : }

Generated by: LCOV version 1.14