LCOV - code coverage report
Current view: top level - src/backend/regex - regprefix.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 66 78 84.6 %
Date: 2019-11-21 14:06:36 Functions: 2 2 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regprefix.c
       4             :  *    Extract a common prefix, if any, from a compiled regex.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 2012-2019, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1998, 1999 Henry Spencer
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/regex/regprefix.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "regex/regguts.h"
      17             : 
      18             : 
      19             : /*
      20             :  * forward declarations
      21             :  */
      22             : static int  findprefix(struct cnfa *cnfa, struct colormap *cm,
      23             :                        chr *string, size_t *slength);
      24             : 
      25             : 
      26             : /*
      27             :  * pg_regprefix - get common prefix for regular expression
      28             :  *
      29             :  * Returns one of:
      30             :  *  REG_NOMATCH: there is no common prefix of strings matching the regex
      31             :  *  REG_PREFIX: there is a common prefix of strings matching the regex
      32             :  *  REG_EXACT: all strings satisfying the regex must match the same string
      33             :  *  or a REG_XXX error code
      34             :  *
      35             :  * In the non-failure cases, *string is set to a malloc'd string containing
      36             :  * the common prefix or exact value, of length *slength (measured in chrs
      37             :  * not bytes!).
      38             :  *
      39             :  * This function does not analyze all complex cases (such as lookaround
      40             :  * constraints) exactly.  Therefore it is possible that some strings matching
      41             :  * the reported prefix or exact-match string do not satisfy the regex.  But
      42             :  * it should never be the case that a string satisfying the regex does not
      43             :  * match the reported prefix or exact-match string.
      44             :  */
      45             : int
      46        5284 : pg_regprefix(regex_t *re,
      47             :              chr **string,
      48             :              size_t *slength)
      49             : {
      50             :     struct guts *g;
      51             :     struct cnfa *cnfa;
      52             :     int         st;
      53             : 
      54             :     /* sanity checks */
      55        5284 :     if (string == NULL || slength == NULL)
      56           0 :         return REG_INVARG;
      57        5284 :     *string = NULL;             /* initialize for failure cases */
      58        5284 :     *slength = 0;
      59        5284 :     if (re == NULL || re->re_magic != REMAGIC)
      60           0 :         return REG_INVARG;
      61        5284 :     if (re->re_csize != sizeof(chr))
      62           0 :         return REG_MIXED;
      63             : 
      64             :     /* Initialize locale-dependent support */
      65        5284 :     pg_set_regex_collation(re->re_collation);
      66             : 
      67             :     /* setup */
      68        5284 :     g = (struct guts *) re->re_guts;
      69        5284 :     if (g->info & REG_UIMPOSSIBLE)
      70           0 :         return REG_NOMATCH;
      71             : 
      72             :     /*
      73             :      * This implementation considers only the search NFA for the topmost regex
      74             :      * tree node.  Therefore, constraints such as backrefs are not fully
      75             :      * applied, which is allowed per the function's API spec.
      76             :      */
      77             :     assert(g->tree != NULL);
      78        5284 :     cnfa = &g->tree->cnfa;
      79             : 
      80             :     /*
      81             :      * Since a correct NFA should never contain any exit-free loops, it should
      82             :      * not be possible for our traversal to return to a previously visited NFA
      83             :      * state.  Hence we need at most nstates chrs in the output string.
      84             :      */
      85        5284 :     *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
      86        5284 :     if (*string == NULL)
      87           0 :         return REG_ESPACE;
      88             : 
      89             :     /* do it */
      90        5284 :     st = findprefix(cnfa, &g->cmap, *string, slength);
      91             : 
      92             :     assert(*slength <= cnfa->nstates);
      93             : 
      94             :     /* clean up */
      95        5284 :     if (st != REG_PREFIX && st != REG_EXACT)
      96             :     {
      97         402 :         FREE(*string);
      98         402 :         *string = NULL;
      99         402 :         *slength = 0;
     100             :     }
     101             : 
     102        5284 :     return st;
     103             : }
     104             : 
     105             : /*
     106             :  * findprefix - extract common prefix from cNFA
     107             :  *
     108             :  * Results are returned into the preallocated chr array string[], with
     109             :  * *slength (which must be preset to zero) incremented for each chr.
     110             :  */
     111             : static int                      /* regprefix return code */
     112        5284 : findprefix(struct cnfa *cnfa,
     113             :            struct colormap *cm,
     114             :            chr *string,
     115             :            size_t *slength)
     116             : {
     117             :     int         st;
     118             :     int         nextst;
     119             :     color       thiscolor;
     120             :     chr         c;
     121             :     struct carc *ca;
     122             : 
     123             :     /*
     124             :      * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
     125             :      * anchored left.  If we have both BOS and BOL, they must go to the same
     126             :      * next state.
     127             :      */
     128        5284 :     st = cnfa->pre;
     129        5284 :     nextst = -1;
     130       10214 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     131             :     {
     132        5292 :         if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     133             :         {
     134        9868 :             if (nextst == -1)
     135        4930 :                 nextst = ca->to;
     136           8 :             else if (nextst != ca->to)
     137           8 :                 return REG_NOMATCH;
     138             :         }
     139             :         else
     140         354 :             return REG_NOMATCH;
     141             :     }
     142        4922 :     if (nextst == -1)
     143           0 :         return REG_NOMATCH;
     144             : 
     145             :     /*
     146             :      * Scan through successive states, stopping as soon as we find one with
     147             :      * more than one acceptable transition character (either multiple colors
     148             :      * on out-arcs, or a color with more than one member chr).
     149             :      *
     150             :      * We could find a state with multiple out-arcs that are all labeled with
     151             :      * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
     152             :      * In that case we add the chr "c" to the output string but then exit the
     153             :      * loop with nextst == -1.  This leaves a little bit on the table: if the
     154             :      * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
     155             :      * to the prefix.  But chasing multiple parallel state chains doesn't seem
     156             :      * worth the trouble.
     157             :      */
     158             :     do
     159             :     {
     160       57012 :         st = nextst;
     161       57012 :         nextst = -1;
     162       57012 :         thiscolor = COLORLESS;
     163      109792 :         for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     164             :         {
     165             :             /* We can ignore BOS/BOL arcs */
     166       57658 :             if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     167           0 :                 continue;
     168             :             /* ... but EOS/EOL arcs terminate the search, as do LACONs */
     169      111076 :             if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
     170       53418 :                 ca->co >= cnfa->ncolors)
     171             :             {
     172        4248 :                 thiscolor = COLORLESS;
     173        4248 :                 break;
     174             :             }
     175       53410 :             if (thiscolor == COLORLESS)
     176             :             {
     177             :                 /* First plain outarc */
     178       52772 :                 thiscolor = ca->co;
     179       52772 :                 nextst = ca->to;
     180             :             }
     181         638 :             else if (thiscolor == ca->co)
     182             :             {
     183             :                 /* Another plain outarc for same color */
     184           8 :                 nextst = -1;
     185             :             }
     186             :             else
     187             :             {
     188             :                 /* More than one plain outarc color terminates the search */
     189         630 :                 thiscolor = COLORLESS;
     190         630 :                 break;
     191             :             }
     192             :         }
     193             :         /* Done if we didn't find exactly one color on plain outarcs */
     194       57012 :         if (thiscolor == COLORLESS)
     195        4878 :             break;
     196             :         /* The color must be a singleton */
     197       52134 :         if (cm->cd[thiscolor].nschrs != 1)
     198          36 :             break;
     199             :         /* Must not have any high-color-map entries */
     200       52098 :         if (cm->cd[thiscolor].nuchrs != 0)
     201           0 :             break;
     202             : 
     203             :         /*
     204             :          * Identify the color's sole member chr and add it to the prefix
     205             :          * string.  In general the colormap data structure doesn't provide a
     206             :          * way to find color member chrs, except by trying GETCOLOR() on each
     207             :          * possible chr value, which won't do at all.  However, for the cases
     208             :          * we care about it should be sufficient to test the "firstchr" value,
     209             :          * that is the first chr ever added to the color.  There are cases
     210             :          * where this might no longer be a member of the color (so we do need
     211             :          * to test), but none of them are likely to arise for a character that
     212             :          * is a member of a common prefix.  If we do hit such a corner case,
     213             :          * we just fall out without adding anything to the prefix string.
     214             :          */
     215       52098 :         c = cm->cd[thiscolor].firstchr;
     216       52098 :         if (GETCOLOR(cm, c) != thiscolor)
     217           0 :             break;
     218             : 
     219       52098 :         string[(*slength)++] = c;
     220             : 
     221             :         /* Advance to next state, but only if we have a unique next state */
     222       52098 :     } while (nextst != -1);
     223             : 
     224             :     /*
     225             :      * If we ended at a state that only has EOS/EOL outarcs leading to the
     226             :      * "post" state, then we have an exact-match string.  Note this is true
     227             :      * even if the string is of zero length.
     228             :      */
     229        4922 :     nextst = -1;
     230        9162 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     231             :     {
     232        4922 :         if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
     233             :         {
     234        8480 :             if (nextst == -1)
     235        4240 :                 nextst = ca->to;
     236           0 :             else if (nextst != ca->to)
     237             :             {
     238           0 :                 nextst = -1;
     239           0 :                 break;
     240             :             }
     241             :         }
     242             :         else
     243             :         {
     244         682 :             nextst = -1;
     245         682 :             break;
     246             :         }
     247             :     }
     248        4922 :     if (nextst == cnfa->post)
     249        4240 :         return REG_EXACT;
     250             : 
     251             :     /*
     252             :      * Otherwise, if we were unable to identify any prefix characters, say
     253             :      * NOMATCH --- the pattern is anchored left, but doesn't specify any
     254             :      * particular first character.
     255             :      */
     256         682 :     if (*slength > 0)
     257         642 :         return REG_PREFIX;
     258             : 
     259          40 :     return REG_NOMATCH;
     260             : }

Generated by: LCOV version 1.13