LCOV - code coverage report
Current view: top level - src/backend/regex - rege_dfa.c (source / functions) Hit Total Coverage
Test: PostgreSQL 15devel Lines: 404 450 89.8 %
Date: 2021-12-09 03:08:47 Functions: 13 13 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * DFA routines
       3             :  * This file is #included by regexec.c.
       4             :  *
       5             :  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
       6             :  *
       7             :  * Development of this software was funded, in part, by Cray Research Inc.,
       8             :  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
       9             :  * Corporation, none of whom are responsible for the results.  The author
      10             :  * thanks all of them.
      11             :  *
      12             :  * Redistribution and use in source and binary forms -- with or without
      13             :  * modification -- are permitted for any purpose, provided that
      14             :  * redistributions in source form retain this entire copyright notice and
      15             :  * indicate the origin and nature of any modifications.
      16             :  *
      17             :  * I'd appreciate being given credit for this package in the documentation
      18             :  * of software which uses it, but that is not a requirement.
      19             :  *
      20             :  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
      21             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      22             :  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
      23             :  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      24             :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      25             :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
      26             :  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
      27             :  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
      28             :  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
      29             :  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      30             :  *
      31             :  * src/backend/regex/rege_dfa.c
      32             :  *
      33             :  */
      34             : 
      35             : /*
      36             :  * longest - longest-preferred matching engine
      37             :  *
      38             :  * On success, returns match endpoint address.  Returns NULL on no match.
      39             :  * Internal errors also return NULL, with v->err set.
      40             :  */
      41             : static chr *
      42      832432 : longest(struct vars *v,
      43             :         struct dfa *d,
      44             :         chr *start,             /* where the match should start */
      45             :         chr *stop,              /* match must end at or before here */
      46             :         int *hitstopp)          /* record whether hit v->stop, if non-NULL */
      47             : {
      48             :     chr        *cp;
      49      832432 :     chr        *realstop = (stop == v->stop) ? stop : stop + 1;
      50             :     color       co;
      51             :     struct sset *css;
      52             :     struct sset *ss;
      53             :     chr        *post;
      54             :     int         i;
      55      832432 :     struct colormap *cm = d->cm;
      56             : 
      57             :     /* prevent "uninitialized variable" warnings */
      58      832432 :     if (hitstopp != NULL)
      59      809424 :         *hitstopp = 0;
      60             : 
      61             :     /* if this is a backref to a known string, just match against that */
      62      832432 :     if (d->backno >= 0)
      63             :     {
      64             :         assert((size_t) d->backno < v->nmatch);
      65        1180 :         if (v->pmatch[d->backno].rm_so >= 0)
      66             :         {
      67         908 :             cp = dfa_backref(v, d, start, start, stop, false);
      68         908 :             if (cp == v->stop && stop == v->stop && hitstopp != NULL)
      69           0 :                 *hitstopp = 1;
      70         908 :             return cp;
      71             :         }
      72             :     }
      73             : 
      74             :     /* fast path for matchall NFAs */
      75      831524 :     if (d->cnfa->flags & MATCHALL)
      76             :     {
      77        3692 :         size_t      nchr = stop - start;
      78        3692 :         size_t      maxmatchall = d->cnfa->maxmatchall;
      79             : 
      80        3692 :         if (nchr < d->cnfa->minmatchall)
      81         248 :             return NULL;
      82        3444 :         if (maxmatchall == DUPINF)
      83             :         {
      84        2204 :             if (stop == v->stop && hitstopp != NULL)
      85           8 :                 *hitstopp = 1;
      86             :         }
      87             :         else
      88             :         {
      89        1240 :             if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
      90         122 :                 *hitstopp = 1;
      91        1240 :             if (nchr > maxmatchall)
      92         822 :                 return start + maxmatchall;
      93             :         }
      94        2622 :         return stop;
      95             :     }
      96             : 
      97             :     /* initialize */
      98      827832 :     css = initialize(v, d, start);
      99      827832 :     if (css == NULL)
     100           0 :         return NULL;
     101      827832 :     cp = start;
     102             : 
     103             :     /* startup */
     104             :     FDEBUG(("+++ startup +++\n"));
     105      827832 :     if (cp == v->start)
     106             :     {
     107        2930 :         co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
     108             :         FDEBUG(("color %ld\n", (long) co));
     109             :     }
     110             :     else
     111             :     {
     112      824902 :         co = GETCOLOR(cm, *(cp - 1));
     113             :         FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
     114             :     }
     115      827832 :     css = miss(v, d, css, co, cp, start);
     116      827832 :     if (css == NULL)
     117         324 :         return NULL;
     118      827508 :     css->lastseen = cp;
     119             : 
     120             :     /*
     121             :      * This is the main text-scanning loop.  It seems worth having two copies
     122             :      * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
     123             :      * builds, when you're not actively tracing.
     124             :      */
     125             : #ifdef REG_DEBUG
     126             :     if (v->eflags & REG_FTRACE)
     127             :     {
     128             :         while (cp < realstop)
     129             :         {
     130             :             FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
     131             :             co = GETCOLOR(cm, *cp);
     132             :             FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     133             :             ss = css->outs[co];
     134             :             if (ss == NULL)
     135             :             {
     136             :                 ss = miss(v, d, css, co, cp + 1, start);
     137             :                 if (ss == NULL)
     138             :                     break;      /* NOTE BREAK OUT */
     139             :             }
     140             :             cp++;
     141             :             ss->lastseen = cp;
     142             :             css = ss;
     143             :         }
     144             :     }
     145             :     else
     146             : #endif
     147             :     {
     148    11326152 :         while (cp < realstop)
     149             :         {
     150    11309252 :             co = GETCOLOR(cm, *cp);
     151    11309252 :             ss = css->outs[co];
     152    11309252 :             if (ss == NULL)
     153             :             {
     154     2594428 :                 ss = miss(v, d, css, co, cp + 1, start);
     155     2594428 :                 if (ss == NULL)
     156      810608 :                     break;      /* NOTE BREAK OUT */
     157             :             }
     158    10498644 :             cp++;
     159    10498644 :             ss->lastseen = cp;
     160    10498644 :             css = ss;
     161             :         }
     162             :     }
     163             : 
     164      827508 :     if (ISERR())
     165           0 :         return NULL;
     166             : 
     167             :     /* shutdown */
     168             :     FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
     169      827508 :     if (cp == v->stop && stop == v->stop)
     170             :     {
     171        9126 :         if (hitstopp != NULL)
     172        4120 :             *hitstopp = 1;
     173        9126 :         co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
     174             :         FDEBUG(("color %ld\n", (long) co));
     175        9126 :         ss = miss(v, d, css, co, cp, start);
     176        9126 :         if (ISERR())
     177           0 :             return NULL;
     178             :         /* special case:  match ended at eol? */
     179        9126 :         if (ss != NULL && (ss->flags & POSTSTATE))
     180        4772 :             return cp;
     181        4354 :         else if (ss != NULL)
     182           0 :             ss->lastseen = cp;   /* to be tidy */
     183             :     }
     184             : 
     185             :     /* find last match, if any */
     186      822736 :     post = d->lastpost;
     187     4242058 :     for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
     188     3419322 :         if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
     189        9606 :             (post == NULL || post < ss->lastseen))
     190      828776 :             post = ss->lastseen;
     191      822736 :     if (post != NULL)           /* found one */
     192      819772 :         return post - 1;
     193             : 
     194        2964 :     return NULL;
     195             : }
     196             : 
     197             : /*
     198             :  * shortest - shortest-preferred matching engine
     199             :  *
     200             :  * On success, returns match endpoint address.  Returns NULL on no match.
     201             :  * Internal errors also return NULL, with v->err set.
     202             :  */
     203             : static chr *
     204     1506484 : shortest(struct vars *v,
     205             :          struct dfa *d,
     206             :          chr *start,            /* where the match should start */
     207             :          chr *min,              /* match must end at or after here */
     208             :          chr *max,              /* match must end at or before here */
     209             :          chr **coldp,           /* store coldstart pointer here, if non-NULL */
     210             :          int *hitstopp)         /* record whether hit v->stop, if non-NULL */
     211             : {
     212             :     chr        *cp;
     213     1506484 :     chr        *realmin = (min == v->stop) ? min : min + 1;
     214     1506484 :     chr        *realmax = (max == v->stop) ? max : max + 1;
     215             :     color       co;
     216             :     struct sset *css;
     217             :     struct sset *ss;
     218     1506484 :     struct colormap *cm = d->cm;
     219             : 
     220             :     /* prevent "uninitialized variable" warnings */
     221     1506484 :     if (coldp != NULL)
     222     1505296 :         *coldp = NULL;
     223     1506484 :     if (hitstopp != NULL)
     224         224 :         *hitstopp = 0;
     225             : 
     226             :     /* if this is a backref to a known string, just match against that */
     227     1506484 :     if (d->backno >= 0)
     228             :     {
     229             :         assert((size_t) d->backno < v->nmatch);
     230           0 :         if (v->pmatch[d->backno].rm_so >= 0)
     231             :         {
     232           0 :             cp = dfa_backref(v, d, start, min, max, true);
     233           0 :             if (cp != NULL && coldp != NULL)
     234           0 :                 *coldp = start;
     235             :             /* there is no case where we should set *hitstopp */
     236           0 :             return cp;
     237             :         }
     238             :     }
     239             : 
     240             :     /* fast path for matchall NFAs */
     241     1506484 :     if (d->cnfa->flags & MATCHALL)
     242             :     {
     243        1366 :         size_t      nchr = min - start;
     244             : 
     245        1366 :         if (d->cnfa->maxmatchall != DUPINF &&
     246          16 :             nchr > d->cnfa->maxmatchall)
     247           0 :             return NULL;
     248        1366 :         if ((max - start) < d->cnfa->minmatchall)
     249          12 :             return NULL;
     250        1354 :         if (nchr < d->cnfa->minmatchall)
     251          96 :             min = start + d->cnfa->minmatchall;
     252        1354 :         if (coldp != NULL)
     253         608 :             *coldp = start;
     254             :         /* there is no case where we should set *hitstopp */
     255        1354 :         return min;
     256             :     }
     257             : 
     258             :     /* initialize */
     259     1505118 :     css = initialize(v, d, start);
     260     1505118 :     if (css == NULL)
     261           0 :         return NULL;
     262     1505118 :     cp = start;
     263             : 
     264             :     /* startup */
     265             :     FDEBUG(("--- startup ---\n"));
     266     1505118 :     if (cp == v->start)
     267             :     {
     268      701586 :         co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
     269             :         FDEBUG(("color %ld\n", (long) co));
     270             :     }
     271             :     else
     272             :     {
     273      803532 :         co = GETCOLOR(cm, *(cp - 1));
     274             :         FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
     275             :     }
     276     1505118 :     css = miss(v, d, css, co, cp, start);
     277     1505118 :     if (css == NULL)
     278          10 :         return NULL;
     279     1505108 :     css->lastseen = cp;
     280     1505108 :     ss = css;
     281             : 
     282             :     /*
     283             :      * This is the main text-scanning loop.  It seems worth having two copies
     284             :      * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
     285             :      * builds, when you're not actively tracing.
     286             :      */
     287             : #ifdef REG_DEBUG
     288             :     if (v->eflags & REG_FTRACE)
     289             :     {
     290             :         while (cp < realmax)
     291             :         {
     292             :             FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
     293             :             co = GETCOLOR(cm, *cp);
     294             :             FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     295             :             ss = css->outs[co];
     296             :             if (ss == NULL)
     297             :             {
     298             :                 ss = miss(v, d, css, co, cp + 1, start);
     299             :                 if (ss == NULL)
     300             :                     break;      /* NOTE BREAK OUT */
     301             :             }
     302             :             cp++;
     303             :             ss->lastseen = cp;
     304             :             css = ss;
     305             :             if ((ss->flags & POSTSTATE) && cp >= realmin)
     306             :                 break;          /* NOTE BREAK OUT */
     307             :         }
     308             :     }
     309             :     else
     310             : #endif
     311             :     {
     312    33062322 :         while (cp < realmax)
     313             :         {
     314    32665442 :             co = GETCOLOR(cm, *cp);
     315    32665442 :             ss = css->outs[co];
     316    32665442 :             if (ss == NULL)
     317             :             {
     318     5330374 :                 ss = miss(v, d, css, co, cp + 1, start);
     319     5330374 :                 if (ss == NULL)
     320       84272 :                     break;      /* NOTE BREAK OUT */
     321             :             }
     322    32581170 :             cp++;
     323    32581170 :             ss->lastseen = cp;
     324    32581170 :             css = ss;
     325    32581170 :             if ((ss->flags & POSTSTATE) && cp >= realmin)
     326     1023956 :                 break;          /* NOTE BREAK OUT */
     327             :         }
     328             :     }
     329             : 
     330     1505108 :     if (ss == NULL)
     331       84272 :         return NULL;
     332             : 
     333     1420836 :     if (coldp != NULL)          /* report last no-progress state set, if any */
     334     1420446 :         *coldp = lastcold(v, d);
     335             : 
     336     1420836 :     if ((ss->flags & POSTSTATE) && cp > min)
     337             :     {
     338             :         assert(cp >= realmin);
     339     1023932 :         cp--;
     340             :     }
     341      396904 :     else if (cp == v->stop && max == v->stop)
     342             :     {
     343      396904 :         co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
     344             :         FDEBUG(("color %ld\n", (long) co));
     345      396904 :         ss = miss(v, d, css, co, cp, start);
     346             :         /* match might have ended at eol */
     347      396904 :         if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
     348           8 :             *hitstopp = 1;
     349             :     }
     350             : 
     351     1420836 :     if (ss == NULL || !(ss->flags & POSTSTATE))
     352      383762 :         return NULL;
     353             : 
     354     1037074 :     return cp;
     355             : }
     356             : 
     357             : /*
     358             :  * matchuntil - incremental matching engine
     359             :  *
     360             :  * This is meant for use with a search-style NFA (that is, the pattern is
     361             :  * known to act as though it had a leading .*).  We determine whether a
     362             :  * match exists starting at v->start and ending at probe.  Multiple calls
     363             :  * require only O(N) time not O(N^2) so long as the probe values are
     364             :  * nondecreasing.  *lastcss and *lastcp must be initialized to NULL before
     365             :  * starting a series of calls.
     366             :  *
     367             :  * Returns 1 if a match exists, 0 if not.
     368             :  * Internal errors also return 0, with v->err set.
     369             :  */
     370             : static int
     371          92 : matchuntil(struct vars *v,
     372             :            struct dfa *d,
     373             :            chr *probe,          /* we want to know if a match ends here */
     374             :            struct sset **lastcss,   /* state storage across calls */
     375             :            chr **lastcp)        /* state storage across calls */
     376             : {
     377          92 :     chr        *cp = *lastcp;
     378             :     color       co;
     379          92 :     struct sset *css = *lastcss;
     380             :     struct sset *ss;
     381          92 :     struct colormap *cm = d->cm;
     382             : 
     383             :     /* fast path for matchall NFAs */
     384          92 :     if (d->cnfa->flags & MATCHALL)
     385             :     {
     386          36 :         size_t      nchr = probe - v->start;
     387             : 
     388          36 :         if (nchr < d->cnfa->minmatchall)
     389          18 :             return 0;
     390             :         /* maxmatchall will always be infinity, cf. makesearch() */
     391             :         assert(d->cnfa->maxmatchall == DUPINF);
     392          18 :         return 1;
     393             :     }
     394             : 
     395             :     /* initialize and startup, or restart, if necessary */
     396          56 :     if (cp == NULL || cp > probe)
     397             :     {
     398          16 :         cp = v->start;
     399          16 :         css = initialize(v, d, cp);
     400          16 :         if (css == NULL)
     401           0 :             return 0;
     402             : 
     403             :         FDEBUG((">>> startup >>>\n"));
     404          16 :         co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
     405             :         FDEBUG(("color %ld\n", (long) co));
     406             : 
     407          16 :         css = miss(v, d, css, co, cp, v->start);
     408          16 :         if (css == NULL)
     409           0 :             return 0;
     410          16 :         css->lastseen = cp;
     411             :     }
     412          40 :     else if (css == NULL)
     413             :     {
     414             :         /* we previously found that no match is possible beyond *lastcp */
     415           0 :         return 0;
     416             :     }
     417          56 :     ss = css;
     418             : 
     419             :     /*
     420             :      * This is the main text-scanning loop.  It seems worth having two copies
     421             :      * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
     422             :      * builds, when you're not actively tracing.
     423             :      */
     424             : #ifdef REG_DEBUG
     425             :     if (v->eflags & REG_FTRACE)
     426             :     {
     427             :         while (cp < probe)
     428             :         {
     429             :             FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
     430             :             co = GETCOLOR(cm, *cp);
     431             :             FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     432             :             ss = css->outs[co];
     433             :             if (ss == NULL)
     434             :             {
     435             :                 ss = miss(v, d, css, co, cp + 1, v->start);
     436             :                 if (ss == NULL)
     437             :                     break;      /* NOTE BREAK OUT */
     438             :             }
     439             :             cp++;
     440             :             ss->lastseen = cp;
     441             :             css = ss;
     442             :         }
     443             :     }
     444             :     else
     445             : #endif
     446             :     {
     447         120 :         while (cp < probe)
     448             :         {
     449          64 :             co = GETCOLOR(cm, *cp);
     450          64 :             ss = css->outs[co];
     451          64 :             if (ss == NULL)
     452             :             {
     453           8 :                 ss = miss(v, d, css, co, cp + 1, v->start);
     454           8 :                 if (ss == NULL)
     455           0 :                     break;      /* NOTE BREAK OUT */
     456             :             }
     457          64 :             cp++;
     458          64 :             ss->lastseen = cp;
     459          64 :             css = ss;
     460             :         }
     461             :     }
     462             : 
     463          56 :     *lastcss = ss;
     464          56 :     *lastcp = cp;
     465             : 
     466          56 :     if (ss == NULL)
     467           0 :         return 0;               /* impossible match, or internal error */
     468             : 
     469             :     /* We need to process one more chr, or the EOS symbol, to check match */
     470          56 :     if (cp < v->stop)
     471             :     {
     472             :         FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
     473          56 :         co = GETCOLOR(cm, *cp);
     474             :         FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     475          56 :         ss = css->outs[co];
     476          56 :         if (ss == NULL)
     477          36 :             ss = miss(v, d, css, co, cp + 1, v->start);
     478             :     }
     479             :     else
     480             :     {
     481             :         assert(cp == v->stop);
     482           0 :         co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
     483             :         FDEBUG(("color %ld\n", (long) co));
     484           0 :         ss = miss(v, d, css, co, cp, v->start);
     485             :     }
     486             : 
     487          56 :     if (ss == NULL || !(ss->flags & POSTSTATE))
     488          40 :         return 0;
     489             : 
     490          16 :     return 1;
     491             : }
     492             : 
     493             : /*
     494             :  * dfa_backref - find best match length for a known backref string
     495             :  *
     496             :  * When the backref's referent is already available, we can deliver an exact
     497             :  * answer with considerably less work than running the backref node's NFA.
     498             :  *
     499             :  * Return match endpoint for longest or shortest valid repeated match,
     500             :  * or NULL if there is no valid match.
     501             :  *
     502             :  * Should be in sync with cbrdissect(), although that has the different task
     503             :  * of checking a match to a predetermined section of the string.
     504             :  */
     505             : static chr *
     506         908 : dfa_backref(struct vars *v,
     507             :             struct dfa *d,
     508             :             chr *start,         /* where the match should start */
     509             :             chr *min,           /* match must end at or after here */
     510             :             chr *max,           /* match must end at or before here */
     511             :             bool shortest)
     512             : {
     513         908 :     int         n = d->backno;
     514         908 :     int         backmin = d->backmin;
     515         908 :     int         backmax = d->backmax;
     516             :     size_t      numreps;
     517             :     size_t      minreps;
     518             :     size_t      maxreps;
     519             :     size_t      brlen;
     520             :     chr        *brstring;
     521             :     chr        *p;
     522             : 
     523             :     /* get the backreferenced string (caller should have checked this) */
     524         908 :     if (v->pmatch[n].rm_so == -1)
     525           0 :         return NULL;
     526         908 :     brstring = v->start + v->pmatch[n].rm_so;
     527         908 :     brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
     528             : 
     529             :     /* special-case zero-length backreference to avoid divide by zero */
     530         908 :     if (brlen == 0)
     531             :     {
     532             :         /*
     533             :          * matches only a zero-length string, but any number of repetitions
     534             :          * can be considered to be present
     535             :          */
     536           2 :         if (min == start && backmin <= backmax)
     537           2 :             return start;
     538           0 :         return NULL;
     539             :     }
     540             : 
     541             :     /*
     542             :      * convert min and max into numbers of possible repetitions of the backref
     543             :      * string, rounding appropriately
     544             :      */
     545         906 :     if (min <= start)
     546         906 :         minreps = 0;
     547             :     else
     548           0 :         minreps = (min - start - 1) / brlen + 1;
     549         906 :     maxreps = (max - start) / brlen;
     550             : 
     551             :     /* apply bounds, then see if there is any allowed match length */
     552         906 :     if (minreps < backmin)
     553         880 :         minreps = backmin;
     554         906 :     if (backmax != DUPINF && maxreps > backmax)
     555         420 :         maxreps = backmax;
     556         906 :     if (maxreps < minreps)
     557         208 :         return NULL;
     558             : 
     559             :     /* quick exit if zero-repetitions match is valid and preferred */
     560         698 :     if (shortest && minreps == 0)
     561           0 :         return start;
     562             : 
     563             :     /* okay, compare the actual string contents */
     564         698 :     p = start;
     565         698 :     numreps = 0;
     566         840 :     while (numreps < maxreps)
     567             :     {
     568         714 :         if ((*v->g->compare) (brstring, p, brlen) != 0)
     569         572 :             break;
     570         142 :         p += brlen;
     571         142 :         numreps++;
     572         142 :         if (shortest && numreps >= minreps)
     573           0 :             break;
     574             :     }
     575             : 
     576         698 :     if (numreps >= minreps)
     577         132 :         return p;
     578         566 :     return NULL;
     579             : }
     580             : 
     581             : /*
     582             :  * lastcold - determine last point at which no progress had been made
     583             :  */
     584             : static chr *                    /* endpoint, or NULL */
     585     1420446 : lastcold(struct vars *v,
     586             :          struct dfa *d)
     587             : {
     588             :     struct sset *ss;
     589             :     chr        *nopr;
     590             :     int         i;
     591             : 
     592     1420446 :     nopr = d->lastnopr;
     593     1420446 :     if (nopr == NULL)
     594     1420442 :         nopr = v->start;
     595     7635194 :     for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
     596     6214748 :         if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
     597     2113794 :             nopr = ss->lastseen;
     598     1420446 :     return nopr;
     599             : }
     600             : 
     601             : /*
     602             :  * newdfa - set up a fresh DFA
     603             :  *
     604             :  * Returns NULL (and sets v->err) on failure.
     605             :  */
     606             : static struct dfa *
     607     2330410 : newdfa(struct vars *v,
     608             :        struct cnfa *cnfa,
     609             :        struct colormap *cm,
     610             :        struct smalldfa *sml)    /* preallocated space, may be NULL */
     611             : {
     612             :     struct dfa *d;
     613     2330410 :     size_t      nss = cnfa->nstates * 2;
     614     2330410 :     int         wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
     615     2330410 :     bool        ismalloced = false;
     616             : 
     617             :     assert(cnfa != NULL && cnfa->nstates != 0);
     618             : 
     619     2330410 :     if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
     620             :     {
     621             :         assert(wordsper == 1);
     622     2236068 :         if (sml == NULL)
     623             :         {
     624       15638 :             sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
     625       15638 :             if (sml == NULL)
     626             :             {
     627           0 :                 ERR(REG_ESPACE);
     628           0 :                 return NULL;
     629             :             }
     630       15638 :             ismalloced = true;
     631             :         }
     632     2236068 :         d = &sml->dfa;
     633     2236068 :         d->ssets = sml->ssets;
     634     2236068 :         d->statesarea = sml->statesarea;
     635     2236068 :         d->work = &d->statesarea[nss];
     636     2236068 :         d->outsarea = sml->outsarea;
     637     2236068 :         d->incarea = sml->incarea;
     638     2236068 :         d->ismalloced = ismalloced;
     639     2236068 :         d->arraysmalloced = false;   /* not separately allocated, anyway */
     640             :     }
     641             :     else
     642             :     {
     643       94342 :         d = (struct dfa *) MALLOC(sizeof(struct dfa));
     644       94342 :         if (d == NULL)
     645             :         {
     646           0 :             ERR(REG_ESPACE);
     647           0 :             return NULL;
     648             :         }
     649       94342 :         d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
     650       94342 :         d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
     651             :                                             sizeof(unsigned));
     652       94342 :         d->work = &d->statesarea[nss * wordsper];
     653       94342 :         d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
     654             :                                               sizeof(struct sset *));
     655       94342 :         d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
     656             :                                             sizeof(struct arcp));
     657       94342 :         d->ismalloced = true;
     658       94342 :         d->arraysmalloced = true;
     659             :         /* now freedfa() will behave sanely */
     660       94342 :         if (d->ssets == NULL || d->statesarea == NULL ||
     661       94342 :             d->outsarea == NULL || d->incarea == NULL)
     662             :         {
     663           0 :             freedfa(d);
     664           0 :             ERR(REG_ESPACE);
     665           0 :             return NULL;
     666             :         }
     667             :     }
     668             : 
     669     2330410 :     d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
     670     2330410 :     d->nssused = 0;
     671     2330410 :     d->nstates = cnfa->nstates;
     672     2330410 :     d->ncolors = cnfa->ncolors;
     673     2330410 :     d->wordsper = wordsper;
     674     2330410 :     d->cnfa = cnfa;
     675     2330410 :     d->cm = cm;
     676     2330410 :     d->lastpost = NULL;
     677     2330410 :     d->lastnopr = NULL;
     678     2330410 :     d->search = d->ssets;
     679     2330410 :     d->backno = -1;              /* may be set by caller */
     680     2330410 :     d->backmin = d->backmax = 0;
     681             : 
     682             :     /* initialization of sset fields is done as needed */
     683             : 
     684     2330410 :     return d;
     685             : }
     686             : 
     687             : /*
     688             :  * freedfa - free a DFA
     689             :  */
     690             : static void
     691     2330410 : freedfa(struct dfa *d)
     692             : {
     693     2330410 :     if (d->arraysmalloced)
     694             :     {
     695       94342 :         if (d->ssets != NULL)
     696       94342 :             FREE(d->ssets);
     697       94342 :         if (d->statesarea != NULL)
     698       94342 :             FREE(d->statesarea);
     699       94342 :         if (d->outsarea != NULL)
     700       94342 :             FREE(d->outsarea);
     701       94342 :         if (d->incarea != NULL)
     702       94342 :             FREE(d->incarea);
     703             :     }
     704             : 
     705     2330410 :     if (d->ismalloced)
     706      109980 :         FREE(d);
     707     2330410 : }
     708             : 
     709             : /*
     710             :  * hash - construct a hash code for a bitvector
     711             :  *
     712             :  * There are probably better ways, but they're more expensive.
     713             :  */
     714             : static unsigned
     715       22528 : hash(unsigned *uv,
     716             :      int n)
     717             : {
     718             :     int         i;
     719             :     unsigned    h;
     720             : 
     721       22528 :     h = 0;
     722      119474 :     for (i = 0; i < n; i++)
     723       96946 :         h ^= uv[i];
     724       22528 :     return h;
     725             : }
     726             : 
     727             : /*
     728             :  * initialize - hand-craft a cache entry for startup, otherwise get ready
     729             :  */
     730             : static struct sset *
     731     2332966 : initialize(struct vars *v,
     732             :            struct dfa *d,
     733             :            chr *start)
     734             : {
     735             :     struct sset *ss;
     736             :     int         i;
     737             : 
     738             :     /* is previous one still there? */
     739     2332966 :     if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
     740        4414 :         ss = &d->ssets[0];
     741             :     else
     742             :     {                           /* no, must (re)build it */
     743     2328552 :         ss = getvacant(v, d, start, start);
     744     2328552 :         if (ss == NULL)
     745           0 :             return NULL;
     746     4658088 :         for (i = 0; i < d->wordsper; i++)
     747     2329536 :             ss->states[i] = 0;
     748     2328552 :         BSET(ss->states, d->cnfa->pre);
     749     2328552 :         ss->hash = HASH(ss->states, d->wordsper);
     750             :         assert(d->cnfa->pre != d->cnfa->post);
     751     2328552 :         ss->flags = STARTER | LOCKED | NOPROGRESS;
     752             :         /* lastseen dealt with below */
     753             :     }
     754             : 
     755     4737432 :     for (i = 0; i < d->nssused; i++)
     756     2404466 :         d->ssets[i].lastseen = NULL;
     757     2332966 :     ss->lastseen = start;        /* maybe untrue, but harmless */
     758     2332966 :     d->lastpost = NULL;
     759     2332966 :     d->lastnopr = NULL;
     760     2332966 :     return ss;
     761             : }
     762             : 
     763             : /*
     764             :  * miss - handle a stateset cache miss
     765             :  *
     766             :  * css is the current stateset, co is the color of the current input character,
     767             :  * cp points to the character after that (which is where we may need to test
     768             :  * LACONs).  start does not affect matching behavior but is needed for pickss'
     769             :  * heuristics about which stateset cache entry to replace.
     770             :  *
     771             :  * Ordinarily, returns the address of the next stateset (the one that is
     772             :  * valid after consuming the input character).  Returns NULL if no valid
     773             :  * NFA states remain, ie we have a certain match failure.
     774             :  * Internal errors also return NULL, with v->err set.
     775             :  */
     776             : static struct sset *
     777    10663842 : miss(struct vars *v,
     778             :      struct dfa *d,
     779             :      struct sset *css,
     780             :      color co,
     781             :      chr *cp,                   /* next chr */
     782             :      chr *start)                /* where the attempt got started */
     783             : {
     784    10663842 :     struct cnfa *cnfa = d->cnfa;
     785             :     int         i;
     786             :     unsigned    h;
     787             :     struct carc *ca;
     788             :     struct sset *p;
     789             :     int         ispseudocolor;
     790             :     int         ispost;
     791             :     int         noprogress;
     792             :     int         gotstate;
     793             :     int         dolacons;
     794             :     int         sawlacons;
     795             : 
     796             :     /* for convenience, we can be called even if it might not be a miss */
     797    10663842 :     if (css->outs[co] != NULL)
     798             :     {
     799             :         FDEBUG(("hit\n"));
     800        3692 :         return css->outs[co];
     801             :     }
     802             :     FDEBUG(("miss\n"));
     803             : 
     804             :     /*
     805             :      * Checking for operation cancel in the inner text search loop seems
     806             :      * unduly expensive.  As a compromise, check during cache misses.
     807             :      */
     808    10660150 :     if (CANCEL_REQUESTED(v->re))
     809             :     {
     810           0 :         ERR(REG_CANCEL);
     811           0 :         return NULL;
     812             :     }
     813             : 
     814             :     /*
     815             :      * What set of states would we end up in after consuming the co character?
     816             :      * We first consider PLAIN arcs that consume the character, and then look
     817             :      * to see what LACON arcs could be traversed after consuming it.
     818             :      */
     819    21397788 :     for (i = 0; i < d->wordsper; i++)
     820    10737638 :         d->work[i] = 0;          /* build new stateset bitmap in d->work */
     821    10660150 :     ispseudocolor = d->cm->cd[co].flags & PSEUDO;
     822    10660150 :     ispost = 0;
     823    10660150 :     noprogress = 1;
     824    10660150 :     gotstate = 0;
     825    76772140 :     for (i = 0; i < d->nstates; i++)
     826    66111990 :         if (ISBSET(css->states, i))
     827    77044528 :             for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
     828    58162482 :                 if (ca->co == co ||
     829    53231218 :                     (ca->co == RAINBOW && !ispseudocolor))
     830             :                 {
     831    19980814 :                     BSET(d->work, ca->to);
     832    19980814 :                     gotstate = 1;
     833    19980814 :                     if (ca->to == cnfa->post)
     834     1878950 :                         ispost = 1;
     835    19980814 :                     if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
     836     6455096 :                         noprogress = 0;
     837             :                     FDEBUG(("%d -> %d\n", i, ca->to));
     838             :                 }
     839    10660150 :     if (!gotstate)
     840     1283330 :         return NULL;            /* character cannot reach any new state */
     841     9376820 :     dolacons = (cnfa->flags & HASLACONS);
     842     9376820 :     sawlacons = 0;
     843             :     /* outer loop handles transitive closure of reachable-by-LACON states */
     844     9377942 :     while (dolacons)
     845             :     {
     846        1122 :         dolacons = 0;
     847       10122 :         for (i = 0; i < d->nstates; i++)
     848        9000 :             if (ISBSET(d->work, i))
     849        3898 :                 for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
     850             :                 {
     851        2382 :                     if (ca->co < cnfa->ncolors)
     852        2086 :                         continue;   /* not a LACON arc */
     853         296 :                     if (ISBSET(d->work, ca->to))
     854         108 :                         continue;   /* arc would be a no-op anyway */
     855         188 :                     sawlacons = 1;  /* this LACON affects our result */
     856         188 :                     if (!lacon(v, cnfa, cp, ca->co))
     857             :                     {
     858          94 :                         if (ISERR())
     859           0 :                             return NULL;
     860          94 :                         continue;   /* LACON arc cannot be traversed */
     861             :                     }
     862          94 :                     if (ISERR())
     863           0 :                         return NULL;
     864          94 :                     BSET(d->work, ca->to);
     865          94 :                     dolacons = 1;
     866          94 :                     if (ca->to == cnfa->post)
     867           0 :                         ispost = 1;
     868          94 :                     if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
     869          94 :                         noprogress = 0;
     870             :                     FDEBUG(("%d :> %d\n", i, ca->to));
     871             :                 }
     872             :     }
     873     9376820 :     h = HASH(d->work, d->wordsper);
     874             : 
     875             :     /* Is this stateset already in the cache? */
     876    32796076 :     for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
     877    25317422 :         if (HIT(h, d->work, p, d->wordsper))
     878             :         {
     879             :             FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
     880             :             break;              /* NOTE BREAK OUT */
     881             :         }
     882     9376820 :     if (i == 0)
     883             :     {                           /* nope, need a new cache entry */
     884     7478654 :         p = getvacant(v, d, cp, start);
     885     7478654 :         if (p == NULL)
     886           0 :             return NULL;
     887             :         assert(p != css);
     888    15024822 :         for (i = 0; i < d->wordsper; i++)
     889     7546168 :             p->states[i] = d->work[i];
     890     7478654 :         p->hash = h;
     891     7478654 :         p->flags = (ispost) ? POSTSTATE : 0;
     892     7478654 :         if (noprogress)
     893     2329656 :             p->flags |= NOPROGRESS;
     894             :         /* lastseen to be dealt with by caller */
     895             :     }
     896             : 
     897             :     /*
     898             :      * Link new stateset to old, unless a LACON affected the result, in which
     899             :      * case we don't create the link.  That forces future transitions across
     900             :      * this same arc (same prior stateset and character color) to come through
     901             :      * miss() again, so that we can recheck the LACON(s), which might or might
     902             :      * not pass since context will be different.
     903             :      */
     904     9376820 :     if (!sawlacons)
     905             :     {
     906             :         FDEBUG(("c%d[%d]->c%d\n",
     907             :                 (int) (css - d->ssets), co, (int) (p - d->ssets)));
     908     9376682 :         css->outs[co] = p;
     909     9376682 :         css->inchain[co] = p->ins;
     910     9376682 :         p->ins.ss = css;
     911     9376682 :         p->ins.co = co;
     912             :     }
     913     9376820 :     return p;
     914             : }
     915             : 
     916             : /*
     917             :  * lacon - lookaround-constraint checker for miss()
     918             :  */
     919             : static int                      /* predicate:  constraint satisfied? */
     920         188 : lacon(struct vars *v,
     921             :       struct cnfa *pcnfa,       /* parent cnfa */
     922             :       chr *cp,
     923             :       color co)                 /* "color" of the lookaround constraint */
     924             : {
     925             :     int         n;
     926             :     struct subre *sub;
     927             :     struct dfa *d;
     928             :     chr        *end;
     929             :     int         satisfied;
     930             : 
     931             :     /* Since this is recursive, it could be driven to stack overflow */
     932         188 :     if (STACK_TOO_DEEP(v->re))
     933             :     {
     934           0 :         ERR(REG_ETOOBIG);
     935           0 :         return 0;
     936             :     }
     937             : 
     938         188 :     n = co - pcnfa->ncolors;
     939             :     assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
     940             :     FDEBUG(("=== testing lacon %d\n", n));
     941         188 :     sub = &v->g->lacons[n];
     942         188 :     d = getladfa(v, n);
     943         188 :     if (d == NULL)
     944           0 :         return 0;
     945         188 :     if (LATYPE_IS_AHEAD(sub->latype))
     946             :     {
     947             :         /* used to use longest() here, but shortest() could be much cheaper */
     948          96 :         end = shortest(v, d, cp, cp, v->stop,
     949             :                        (chr **) NULL, (int *) NULL);
     950          96 :         satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
     951             :     }
     952             :     else
     953             :     {
     954             :         /*
     955             :          * To avoid doing O(N^2) work when repeatedly testing a lookbehind
     956             :          * constraint in an N-character string, we use matchuntil() which can
     957             :          * cache the DFA state across calls.  We only need to restart if the
     958             :          * probe point decreases, which is not common.  The NFA we're using is
     959             :          * a search NFA, so it doesn't mind scanning over stuff before the
     960             :          * nominal match.
     961             :          */
     962          92 :         satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
     963          92 :         if (!LATYPE_IS_POS(sub->latype))
     964           0 :             satisfied = !satisfied;
     965             :     }
     966             :     FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
     967         188 :     return satisfied;
     968             : }
     969             : 
     970             : /*
     971             :  * getvacant - get a vacant state set
     972             :  *
     973             :  * This routine clears out the inarcs and outarcs, but does not otherwise
     974             :  * clear the innards of the state set -- that's up to the caller.
     975             :  */
     976             : static struct sset *
     977     9807206 : getvacant(struct vars *v,
     978             :           struct dfa *d,
     979             :           chr *cp,
     980             :           chr *start)
     981             : {
     982             :     int         i;
     983             :     struct sset *ss;
     984             :     struct sset *p;
     985             :     struct arcp ap;
     986             :     color       co;
     987             : 
     988     9807206 :     ss = pickss(v, d, cp, start);
     989     9807206 :     if (ss == NULL)
     990           0 :         return NULL;
     991             :     assert(!(ss->flags & LOCKED));
     992             : 
     993             :     /* clear out its inarcs, including self-referential ones */
     994     9807206 :     ap = ss->ins;
     995     9807230 :     while ((p = ap.ss) != NULL)
     996             :     {
     997          24 :         co = ap.co;
     998             :         FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
     999          24 :         p->outs[co] = NULL;
    1000          24 :         ap = p->inchain[co];
    1001          24 :         p->inchain[co].ss = NULL;    /* paranoia */
    1002             :     }
    1003     9807206 :     ss->ins.ss = NULL;
    1004             : 
    1005             :     /* take it off the inarc chains of the ssets reached by its outarcs */
    1006    82935378 :     for (i = 0; i < d->ncolors; i++)
    1007             :     {
    1008    73128172 :         p = ss->outs[i];
    1009             :         assert(p != ss);        /* not self-referential */
    1010    73128172 :         if (p == NULL)
    1011    73128040 :             continue;           /* NOTE CONTINUE */
    1012             :         FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
    1013         132 :         if (p->ins.ss == ss && p->ins.co == i)
    1014         120 :             p->ins = ss->inchain[i];
    1015             :         else
    1016             :         {
    1017          12 :             struct arcp lastap = {NULL, 0};
    1018             : 
    1019             :             assert(p->ins.ss != NULL);
    1020          24 :             for (ap = p->ins; ap.ss != NULL &&
    1021          24 :                  !(ap.ss == ss && ap.co == i);
    1022          12 :                  ap = ap.ss->inchain[ap.co])
    1023          12 :                 lastap = ap;
    1024             :             assert(ap.ss != NULL);
    1025          12 :             lastap.ss->inchain[lastap.co] = ss->inchain[i];
    1026             :         }
    1027         132 :         ss->outs[i] = NULL;
    1028         132 :         ss->inchain[i].ss = NULL;
    1029             :     }
    1030             : 
    1031             :     /* if ss was a success state, may need to remember location */
    1032     9807206 :     if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
    1033          36 :         (d->lastpost == NULL || d->lastpost < ss->lastseen))
    1034          36 :         d->lastpost = ss->lastseen;
    1035             : 
    1036             :     /* likewise for a no-progress state */
    1037     9807206 :     if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
    1038          12 :         (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
    1039          12 :         d->lastnopr = ss->lastseen;
    1040             : 
    1041     9807206 :     return ss;
    1042             : }
    1043             : 
    1044             : /*
    1045             :  * pickss - pick the next stateset to be used
    1046             :  */
    1047             : static struct sset *
    1048     9807206 : pickss(struct vars *v,
    1049             :        struct dfa *d,
    1050             :        chr *cp,
    1051             :        chr *start)
    1052             : {
    1053             :     int         i;
    1054             :     struct sset *ss;
    1055             :     struct sset *end;
    1056             :     chr        *ancient;
    1057             : 
    1058             :     /* shortcut for cases where cache isn't full */
    1059     9807206 :     if (d->nssused < d->nssets)
    1060             :     {
    1061     9807074 :         i = d->nssused;
    1062     9807074 :         d->nssused++;
    1063     9807074 :         ss = &d->ssets[i];
    1064             :         FDEBUG(("new c%d\n", i));
    1065             :         /* set up innards */
    1066     9807074 :         ss->states = &d->statesarea[i * d->wordsper];
    1067     9807074 :         ss->flags = 0;
    1068     9807074 :         ss->ins.ss = NULL;
    1069     9807074 :         ss->ins.co = WHITE;      /* give it some value */
    1070     9807074 :         ss->outs = &d->outsarea[i * d->ncolors];
    1071     9807074 :         ss->inchain = &d->incarea[i * d->ncolors];
    1072    82932174 :         for (i = 0; i < d->ncolors; i++)
    1073             :         {
    1074    73125100 :             ss->outs[i] = NULL;
    1075    73125100 :             ss->inchain[i].ss = NULL;
    1076             :         }
    1077     9807074 :         return ss;
    1078             :     }
    1079             : 
    1080             :     /* look for oldest, or old enough anyway */
    1081         132 :     if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
    1082         132 :         ancient = cp - d->nssets * 2 / 3;
    1083             :     else
    1084           0 :         ancient = start;
    1085         144 :     for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
    1086         132 :         if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
    1087         132 :             !(ss->flags & LOCKED))
    1088             :         {
    1089         120 :             d->search = ss + 1;
    1090             :             FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
    1091         120 :             return ss;
    1092             :         }
    1093          24 :     for (ss = d->ssets, end = d->search; ss < end; ss++)
    1094          24 :         if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
    1095          24 :             !(ss->flags & LOCKED))
    1096             :         {
    1097          12 :             d->search = ss + 1;
    1098             :             FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
    1099          12 :             return ss;
    1100             :         }
    1101             : 
    1102             :     /* nobody's old enough?!? -- something's really wrong */
    1103             :     FDEBUG(("cannot find victim to replace!\n"));
    1104           0 :     ERR(REG_ASSERT);
    1105           0 :     return NULL;
    1106             : }

Generated by: LCOV version 1.14