LCOV - code coverage report
Current view: top level - src/backend/regex - rege_dfa.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 404 448 90.2 %
Date: 2024-04-26 17:11:02 Functions: 13 13 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * DFA routines
       3             :  * This file is #included by regexec.c.
       4             :  *
       5             :  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
       6             :  *
       7             :  * Development of this software was funded, in part, by Cray Research Inc.,
       8             :  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
       9             :  * Corporation, none of whom are responsible for the results.  The author
      10             :  * thanks all of them.
      11             :  *
      12             :  * Redistribution and use in source and binary forms -- with or without
      13             :  * modification -- are permitted for any purpose, provided that
      14             :  * redistributions in source form retain this entire copyright notice and
      15             :  * indicate the origin and nature of any modifications.
      16             :  *
      17             :  * I'd appreciate being given credit for this package in the documentation
      18             :  * of software which uses it, but that is not a requirement.
      19             :  *
      20             :  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
      21             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      22             :  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
      23             :  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      24             :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      25             :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
      26             :  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
      27             :  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
      28             :  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
      29             :  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      30             :  *
      31             :  * src/backend/regex/rege_dfa.c
      32             :  *
      33             :  */
      34             : 
      35             : /*
      36             :  * longest - longest-preferred matching engine
      37             :  * Runs DFA d forward from start; the match must end at or before stop.
      38             :  * On success, returns match endpoint address.  Returns NULL on no match.
      39             :  * Internal errors also return NULL, with v->err set.
      40             :  */
      41             : static chr *
      42      937362 : longest(struct vars *v,
      43             :         struct dfa *d,
      44             :         chr *start,             /* where the match should start */
      45             :         chr *stop,              /* match must end at or before here */
      46             :         int *hitstopp)          /* record whether hit v->stop, if non-NULL */
      47             : {
      48             :     chr        *cp;
      49      937362 :     chr        *realstop = (stop == v->stop) ? stop : stop + 1;   /* scan limit: stop + 1 unless stop is v->stop */
      50             :     color       co;
      51             :     struct sset *css;
      52             :     struct sset *ss;
      53             :     chr        *post;
      54             :     int         i;
      55      937362 :     struct colormap *cm = d->cm;
      56             : 
      57             :     /* prevent "uninitialized variable" warnings */
      58      937362 :     if (hitstopp != NULL)
      59      904276 :         *hitstopp = 0;
      60             : 
      61             :     /* if this is a backref to a known string, just match against that */
      62      937362 :     if (d->backno >= 0)
      63             :     {
      64             :         assert((size_t) d->backno < v->nmatch);
      65        1586 :         if (v->pmatch[d->backno].rm_so >= 0)
      66             :         {
      67        1232 :             cp = dfa_backref(v, d, start, start, stop, false);
      68        1232 :             if (cp == v->stop && stop == v->stop && hitstopp != NULL)
      69           0 :                 *hitstopp = 1;   /* backref match reached v->stop */
      70        1232 :             return cp;
      71             :         }
      72             :     }
      73             : 
      74             :     /* fast path for matchall NFAs */
      75      936130 :     if (d->cnfa->flags & MATCHALL)
      76             :     {
      77        4576 :         size_t      nchr = stop - start;
      78        4576 :         size_t      maxmatchall = d->cnfa->maxmatchall;
      79             : 
      80        4576 :         if (nchr < d->cnfa->minmatchall)
      81         330 :             return NULL;   /* fewer chrs available than minmatchall */
      82        4246 :         if (maxmatchall == DUPINF)
      83             :         {
      84        2562 :             if (stop == v->stop && hitstopp != NULL)
      85          10 :                 *hitstopp = 1;
      86             :         }
      87             :         else
      88             :         {
      89        1684 :             if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
      90         168 :                 *hitstopp = 1;
      91        1684 :             if (nchr > maxmatchall)
      92        1144 :                 return start + maxmatchall;   /* more chrs available than allowed: end at maxmatchall */
      93             :         }
      94        3102 :         return stop;   /* everything up to stop is matched */
      95             :     }
      96             : 
      97             :     /* initialize */
      98      931554 :     css = initialize(v, d, start);
      99      931554 :     if (css == NULL)
     100           0 :         return NULL;   /* initialize() returned NULL */
     101      931554 :     cp = start;
     102             : 
     103             :     /* startup */
     104             :     FDEBUG(("+++ startup +++\n"));
     105      931554 :     if (cp == v->start)
     106             :     {
     107        3282 :         co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
     108             :         FDEBUG(("color %ld\n", (long) co));
     109             :     }
     110             :     else
     111             :     {
     112      928272 :         co = GETCOLOR(cm, *(cp - 1));
     113             :         FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
     114             :     }
     115      931554 :     css = miss(v, d, css, co, cp, start);
     116      931554 :     if (css == NULL)
     117         432 :         return NULL;   /* miss() returned NULL for the startup color */
     118      931122 :     css->lastseen = cp;
     119             : 
     120             :     /*
     121             :      * This is the main text-scanning loop.  It seems worth having two copies
     122             :      * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
     123             :      * builds, when you're not actively tracing.
     124             :      */
     125             : #ifdef REG_DEBUG
     126             :     if (v->eflags & REG_FTRACE)
     127             :     {
     128             :         while (cp < realstop)
     129             :         {
     130             :             FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
     131             :             co = GETCOLOR(cm, *cp);
     132             :             FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     133             :             ss = css->outs[co];
     134             :             if (ss == NULL)
     135             :             {
     136             :                 ss = miss(v, d, css, co, cp + 1, start);
     137             :                 if (ss == NULL)
     138             :                     break;      /* NOTE BREAK OUT */
     139             :             }
     140             :             cp++;
     141             :             ss->lastseen = cp;
     142             :             css = ss;
     143             :         }
     144             :     }
     145             :     else
     146             : #endif
     147             :     {
     148    11850698 :         while (cp < realstop)
     149             :         {
     150    11825660 :             co = GETCOLOR(cm, *cp);
     151    11825660 :             ss = css->outs[co];   /* cached transition, if any */
     152    11825660 :             if (ss == NULL)
     153             :             {
     154     2970300 :                 ss = miss(v, d, css, co, cp + 1, start);
     155     2970300 :                 if (ss == NULL)
     156      906084 :                     break;      /* NOTE BREAK OUT */
     157             :             }
     158    10919576 :             cp++;
     159    10919576 :             ss->lastseen = cp;   /* recorded after advancing past the consumed chr */
     160    10919576 :             css = ss;
     161             :         }
     162             :     }
     163             : 
     164      931122 :     if (ISERR())
     165           0 :         return NULL;
     166             : 
     167             :     /* shutdown */
     168             :     FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
     169      931122 :     if (cp == v->stop && stop == v->stop)
     170             :     {
     171       13134 :         if (hitstopp != NULL)
     172        6290 :             *hitstopp = 1;
     173       13134 :         co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
     174             :         FDEBUG(("color %ld\n", (long) co));
     175       13134 :         ss = miss(v, d, css, co, cp, start);
     176       13134 :         if (ISERR())
     177           0 :             return NULL;
     178             :         /* special case:  match ended at eol? */
     179       13134 :         if (ss != NULL && (ss->flags & POSTSTATE))
     180        6980 :             return cp;
     181        6154 :         else if (ss != NULL)
     182           0 :             ss->lastseen = cp;   /* to be tidy */
     183             :     }
     184             : 
     185             :     /* find last match, if any */
     186      924142 :     post = d->lastpost;
     187     4789640 :     for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
     188     3865498 :         if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
     189       13908 :             (post == NULL || post < ss->lastseen))
     190      933630 :             post = ss->lastseen;   /* keep the latest lastseen among POSTSTATE sets */
     191      924142 :     if (post != NULL)           /* found one */
     192      920486 :         return post - 1;   /* endpoint is one chr before that lastseen value */
     193             : 
     194        3656 :     return NULL;
     195             : }
     196             : 
     197             : /*
     198             :  * shortest - shortest-preferred matching engine
     199             :  * Runs DFA d forward from start; the match must end between min and max.
     200             :  * On success, returns match endpoint address.  Returns NULL on no match.
     201             :  * Internal errors also return NULL, with v->err set.
     202             :  */
     203             : static chr *
     204     1874098 : shortest(struct vars *v,
     205             :          struct dfa *d,
     206             :          chr *start,            /* where the match should start */
     207             :          chr *min,              /* match must end at or after here */
     208             :          chr *max,              /* match must end at or before here */
     209             :          chr **coldp,           /* store coldstart pointer here, if non-NULL */
     210             :          int *hitstopp)         /* record whether hit v->stop, if non-NULL */
     211             : {
     212             :     chr        *cp;
     213     1874098 :     chr        *realmin = (min == v->stop) ? min : min + 1;   /* min + 1 unless min is v->stop */
     214     1874098 :     chr        *realmax = (max == v->stop) ? max : max + 1;   /* scan limit: max + 1 unless max is v->stop */
     215             :     color       co;
     216             :     struct sset *css;
     217             :     struct sset *ss;
     218     1874098 :     struct colormap *cm = d->cm;
     219             : 
     220             :     /* prevent "uninitialized variable" warnings */
     221     1874098 :     if (coldp != NULL)
     222     1872438 :         *coldp = NULL;
     223     1874098 :     if (hitstopp != NULL)
     224         318 :         *hitstopp = 0;
     225             : 
     226             :     /* if this is a backref to a known string, just match against that */
     227     1874098 :     if (d->backno >= 0)
     228             :     {
     229             :         assert((size_t) d->backno < v->nmatch);
     230           0 :         if (v->pmatch[d->backno].rm_so >= 0)
     231             :         {
     232           0 :             cp = dfa_backref(v, d, start, min, max, true);
     233           0 :             if (cp != NULL && coldp != NULL)
     234           0 :                 *coldp = start;
     235             :             /* there is no case where we should set *hitstopp */
     236           0 :             return cp;
     237             :         }
     238             :     }
     239             : 
     240             :     /* fast path for matchall NFAs */
     241     1874098 :     if (d->cnfa->flags & MATCHALL)
     242             :     {
     243        1972 :         size_t      nchr = min - start;
     244             : 
     245        1972 :         if (d->cnfa->maxmatchall != DUPINF &&
     246          24 :             nchr > d->cnfa->maxmatchall)
     247           0 :             return NULL;   /* min already lies beyond maxmatchall chrs */
     248        1972 :         if ((max - start) < d->cnfa->minmatchall)
     249          18 :             return NULL;   /* max allows fewer than minmatchall chrs */
     250        1954 :         if (nchr < d->cnfa->minmatchall)
     251         130 :             min = start + d->cnfa->minmatchall;   /* raise min to start + minmatchall */
     252        1954 :         if (coldp != NULL)
     253         894 :             *coldp = start;
     254             :         /* there is no case where we should set *hitstopp */
     255        1954 :         return min;
     256             :     }
     257             : 
     258             :     /* initialize */
     259     1872126 :     css = initialize(v, d, start);
     260     1872126 :     if (css == NULL)
     261           0 :         return NULL;   /* initialize() returned NULL */
     262     1872126 :     cp = start;
     263             : 
     264             :     /* startup */
     265             :     FDEBUG(("--- startup ---\n"));
     266     1872126 :     if (cp == v->start)
     267             :     {
     268      975824 :         co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
     269             :         FDEBUG(("color %ld\n", (long) co));
     270             :     }
     271             :     else
     272             :     {
     273      896302 :         co = GETCOLOR(cm, *(cp - 1));
     274             :         FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
     275             :     }
     276     1872126 :     css = miss(v, d, css, co, cp, start);
     277     1872126 :     if (css == NULL)
     278          12 :         return NULL;   /* miss() returned NULL for the startup color */
     279     1872114 :     css->lastseen = cp;
     280     1872114 :     ss = css;
     281             : 
     282             :     /*
     283             :      * This is the main text-scanning loop.  It seems worth having two copies
     284             :      * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
     285             :      * builds, when you're not actively tracing.
     286             :      */
     287             : #ifdef REG_DEBUG
     288             :     if (v->eflags & REG_FTRACE)
     289             :     {
     290             :         while (cp < realmax)
     291             :         {
     292             :             FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
     293             :             co = GETCOLOR(cm, *cp);
     294             :             FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     295             :             ss = css->outs[co];
     296             :             if (ss == NULL)
     297             :             {
     298             :                 ss = miss(v, d, css, co, cp + 1, start);
     299             :                 if (ss == NULL)
     300             :                     break;      /* NOTE BREAK OUT */
     301             :             }
     302             :             cp++;
     303             :             ss->lastseen = cp;
     304             :             css = ss;
     305             :             if ((ss->flags & POSTSTATE) && cp >= realmin)
     306             :                 break;          /* NOTE BREAK OUT */
     307             :         }
     308             :     }
     309             :     else
     310             : #endif
     311             :     {
     312    36030010 :         while (cp < realmax)
     313             :         {
     314    35552972 :             co = GETCOLOR(cm, *cp);
     315    35552972 :             ss = css->outs[co];   /* cached transition, if any */
     316    35552972 :             if (ss == NULL)
     317             :             {
     318     6332690 :                 ss = miss(v, d, css, co, cp + 1, start);
     319     6332690 :                 if (ss == NULL)
     320      268322 :                     break;      /* NOTE BREAK OUT */
     321             :             }
     322    35284650 :             cp++;
     323    35284650 :             ss->lastseen = cp;
     324    35284650 :             css = ss;
     325    35284650 :             if ((ss->flags & POSTSTATE) && cp >= realmin)
     326     1126754 :                 break;          /* NOTE BREAK OUT */
     327             :         }
     328             :     }
     329             : 
     330     1872114 :     if (ss == NULL)
     331      268322 :         return NULL;   /* loop ended because miss() returned NULL */
     332             : 
     333     1603792 :     if (coldp != NULL)          /* report last no-progress state set, if any */
     334     1603256 :         *coldp = lastcold(v, d);
     335             : 
     336     1603792 :     if ((ss->flags & POSTSTATE) && cp > min)
     337             :     {
     338             :         assert(cp >= realmin);
     339     1126722 :         cp--;   /* reported endpoint is one chr before the scan position */
     340             :     }
     341      477070 :     else if (cp == v->stop && max == v->stop)
     342             :     {
     343      477070 :         co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
     344             :         FDEBUG(("color %ld\n", (long) co));
     345      477070 :         ss = miss(v, d, css, co, cp, start);
     346             :         /* match might have ended at eol */
     347      477070 :         if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
     348          12 :             *hitstopp = 1;
     349             :     }
     350             : 
     351     1603792 :     if (ss == NULL || !(ss->flags & POSTSTATE))
     352      458412 :         return NULL;   /* final state set is not a POSTSTATE: no match */
     353             : 
     354     1145380 :     return cp;
     355             : }
     356             : 
     357             : /*
     358             :  * matchuntil - incremental matching engine
     359             :  *
     360             :  * This is meant for use with a search-style NFA (that is, the pattern is
     361             :  * known to act as though it had a leading .*).  We determine whether a
     362             :  * match exists starting at v->start and ending at probe.  Multiple calls
     363             :  * require only O(N) time not O(N^2) so long as the probe values are
     364             :  * nondecreasing.  *lastcss and *lastcp must be initialized to NULL before
     365             :  * starting a series of calls.
     366             :  *
     367             :  * Returns 1 if a match exists, 0 if not.
     368             :  * Internal errors also return 0, with v->err set.
     369             :  */
     370             : static int
     371         120 : matchuntil(struct vars *v,
     372             :            struct dfa *d,
     373             :            chr *probe,          /* we want to know if a match ends here */
     374             :            struct sset **lastcss,   /* state storage across calls */
     375             :            chr **lastcp)        /* state storage across calls */
     376             : {
     377         120 :     chr        *cp = *lastcp;   /* resume from the position saved by the previous call */
     378             :     color       co;
     379         120 :     struct sset *css = *lastcss;   /* state set saved by the previous call */
     380             :     struct sset *ss;
     381         120 :     struct colormap *cm = d->cm;
     382             : 
     383             :     /* fast path for matchall NFAs */
     384         120 :     if (d->cnfa->flags & MATCHALL)
     385             :     {
     386          36 :         size_t      nchr = probe - v->start;
     387             : 
     388          36 :         if (nchr < d->cnfa->minmatchall)
     389          18 :             return 0;   /* probe is fewer than minmatchall chrs past v->start */
     390             :         /* maxmatchall will always be infinity, cf. makesearch() */
     391             :         assert(d->cnfa->maxmatchall == DUPINF);
     392          18 :         return 1;
     393             :     }
     394             : 
     395             :     /* initialize and startup, or restart, if necessary */
     396          84 :     if (cp == NULL || cp > probe)   /* no saved position, or it lies beyond probe */
     397             :     {
     398          24 :         cp = v->start;
     399          24 :         css = initialize(v, d, cp);
     400          24 :         if (css == NULL)
     401           0 :             return 0;
     402             : 
     403             :         FDEBUG((">>> startup >>>\n"));
     404          24 :         co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
     405             :         FDEBUG(("color %ld\n", (long) co));
     406             : 
     407          24 :         css = miss(v, d, css, co, cp, v->start);
     408          24 :         if (css == NULL)
     409           0 :             return 0;
     410          24 :         css->lastseen = cp;
     411             :     }
     412          60 :     else if (css == NULL)
     413             :     {
     414             :         /* we previously found that no match is possible beyond *lastcp */
     415           0 :         return 0;
     416             :     }
     417          84 :     ss = css;
     418             : 
     419             :     /*
     420             :      * This is the main text-scanning loop.  It seems worth having two copies
     421             :      * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
     422             :      * builds, when you're not actively tracing.
     423             :      */
     424             : #ifdef REG_DEBUG
     425             :     if (v->eflags & REG_FTRACE)
     426             :     {
     427             :         while (cp < probe)
     428             :         {
     429             :             FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
     430             :             co = GETCOLOR(cm, *cp);
     431             :             FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     432             :             ss = css->outs[co];
     433             :             if (ss == NULL)
     434             :             {
     435             :                 ss = miss(v, d, css, co, cp + 1, v->start);
     436             :                 if (ss == NULL)
     437             :                     break;      /* NOTE BREAK OUT */
     438             :             }
     439             :             cp++;
     440             :             ss->lastseen = cp;
     441             :             css = ss;
     442             :         }
     443             :     }
     444             :     else
     445             : #endif
     446             :     {
     447         180 :         while (cp < probe)
     448             :         {
     449          96 :             co = GETCOLOR(cm, *cp);
     450          96 :             ss = css->outs[co];   /* cached transition, if any */
     451          96 :             if (ss == NULL)
     452             :             {
     453          12 :                 ss = miss(v, d, css, co, cp + 1, v->start);
     454          12 :                 if (ss == NULL)
     455           0 :                     break;      /* NOTE BREAK OUT */
     456             :             }
     457          96 :             cp++;
     458          96 :             ss->lastseen = cp;
     459          96 :             css = ss;
     460             :         }
     461             :     }
     462             : 
     463          84 :     *lastcss = ss;   /* save scan state for the next call (NULL if the scan broke out) */
     464          84 :     *lastcp = cp;
     465             : 
     466          84 :     if (ss == NULL)
     467           0 :         return 0;               /* impossible match, or internal error */
     468             : 
     469             :     /* We need to process one more chr, or the EOS symbol, to check match */
     470          84 :     if (cp < v->stop)
     471             :     {
     472             :         FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
     473          84 :         co = GETCOLOR(cm, *cp);
     474             :         FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     475          84 :         ss = css->outs[co];
     476          84 :         if (ss == NULL)
     477          54 :             ss = miss(v, d, css, co, cp + 1, v->start);
     478             :     }
     479             :     else
     480             :     {
     481             :         assert(cp == v->stop);
     482           0 :         co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
     483             :         FDEBUG(("color %ld\n", (long) co));
     484           0 :         ss = miss(v, d, css, co, cp, v->start);
     485             :     }
     486             : 
     487          84 :     if (ss == NULL || !(ss->flags & POSTSTATE))
     488          60 :         return 0;   /* lookahead step did not yield a POSTSTATE set */
     489             : 
     490          24 :     return 1;
     491             : }
     492             : 
     493             : /*
     494             :  * dfa_backref - find best match length for a known backref string
     495             :  *
     496             :  * When the backref's referent is already available, we can deliver an exact
     497             :  * answer with considerably less work than running the backref node's NFA.
     498             :  *
     499             :  * Return match endpoint for longest or shortest valid repeated match,
     500             :  * or NULL if there is no valid match.
     501             :  *
     502             :  * Should be in sync with cbrdissect(), although that has the different task
     503             :  * of checking a match to a predetermined section of the string.
     504             :  */
     505             : static chr *
     506        1232 : dfa_backref(struct vars *v,
     507             :             struct dfa *d,
     508             :             chr *start,         /* where the match should start */
     509             :             chr *min,           /* match must end at or after here */
     510             :             chr *max,           /* match must end at or before here */
     511             :             bool shortest)      /* true: stop at first acceptable repetition count */
     512             : {
     513        1232 :     int         n = d->backno;
     514        1232 :     int         backmin = d->backmin;
     515        1232 :     int         backmax = d->backmax;
     516             :     size_t      numreps;
     517             :     size_t      minreps;
     518             :     size_t      maxreps;
     519             :     size_t      brlen;
     520             :     chr        *brstring;
     521             :     chr        *p;
     522             : 
     523             :     /* get the backreferenced string (caller should have checked this) */
     524        1232 :     if (v->pmatch[n].rm_so == -1)
     525           0 :         return NULL;
     526        1232 :     brstring = v->start + v->pmatch[n].rm_so;
     527        1232 :     brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;   /* referent length, in chrs */
     528             : 
     529             :     /* special-case zero-length backreference to avoid divide by zero */
     530        1232 :     if (brlen == 0)
     531             :     {
     532             :         /*
     533             :          * matches only a zero-length string, but any number of repetitions
     534             :          * can be considered to be present
     535             :          */
     536           2 :         if (min == start && backmin <= backmax)
     537           2 :             return start;
     538           0 :         return NULL;
     539             :     }
     540             : 
     541             :     /*
     542             :      * convert min and max into numbers of possible repetitions of the backref
     543             :      * string, rounding appropriately
     544             :      */
     545        1230 :     if (min <= start)
     546        1230 :         minreps = 0;
     547             :     else
     548           0 :         minreps = (min - start - 1) / brlen + 1;   /* (min - start) / brlen, rounded up */
     549        1230 :     maxreps = (max - start) / brlen;   /* rounded down: whole repetitions only */
     550             : 
     551             :     /* apply bounds, then see if there is any allowed match length */
     552        1230 :     if (minreps < backmin)
     553        1194 :         minreps = backmin;
     554        1230 :     if (backmax != DUPINF && maxreps > backmax)
     555         594 :         maxreps = backmax;
     556        1230 :     if (maxreps < minreps)
     557         268 :         return NULL;
     558             : 
     559             :     /* quick exit if zero-repetitions match is valid and preferred */
     560         962 :     if (shortest && minreps == 0)
     561           0 :         return start;
     562             : 
     563             :     /* okay, compare the actual string contents */
     564         962 :     p = start;
     565         962 :     numreps = 0;
     566        1140 :     while (numreps < maxreps)
     567             :     {
     568         984 :         if ((*v->g->compare) (brstring, p, brlen) != 0)
     569         806 :             break;   /* compare callback returned nonzero: stop counting */
     570         178 :         p += brlen;
     571         178 :         numreps++;
     572         178 :         if (shortest && numreps >= minreps)
     573           0 :             break;   /* shortest acceptable repetition count reached */
     574             :     }
     575             : 
     576         962 :     if (numreps >= minreps)
     577         164 :         return p;   /* end of the last repetition counted */
     578         798 :     return NULL;   /* fewer repetitions found than required */
     579             : }
     580             : 
     581             : /*
     582             :  * lastcold - determine last point at which no progress had been made
     583             :  */
     584             : static chr *                    /* endpoint, or NULL */
     585     1603256 : lastcold(struct vars *v,
     586             :          struct dfa *d)
     587             : {
     588             :     struct sset *ss;
     589             :     chr        *nopr;
     590             :     int         i;
     591             : 
     592     1603256 :     nopr = d->lastnopr;
     593     1603256 :     if (nopr == NULL)
     594     1603252 :         nopr = v->start;
     595     8587324 :     for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
     596     6984068 :         if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
     597     2323616 :             nopr = ss->lastseen;
     598     1603256 :     return nopr;
     599             : }
     600             : 
     601             : /*
     602             :  * newdfa - set up a fresh DFA
     603             :  *
     604             :  * Returns NULL (and sets v->err) on failure.
     605             :  */
     606             : static struct dfa *
     607     2800806 : newdfa(struct vars *v,
     608             :        struct cnfa *cnfa,
     609             :        struct colormap *cm,
     610             :        struct smalldfa *sml)    /* preallocated space, may be NULL */
     611             : {
     612             :     struct dfa *d;
     613     2800806 :     size_t      nss = cnfa->nstates * 2;
     614     2800806 :     int         wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
     615     2800806 :     bool        ismalloced = false;
     616             : 
     617             :     assert(cnfa != NULL && cnfa->nstates != 0);
     618             : 
     619     2800806 :     if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
     620             :     {
     621             :         assert(wordsper == 1);
     622     2541208 :         if (sml == NULL)
     623             :         {
     624       23246 :             sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
     625       23246 :             if (sml == NULL)
     626             :             {
     627           0 :                 ERR(REG_ESPACE);
     628           0 :                 return NULL;
     629             :             }
     630       23246 :             ismalloced = true;
     631             :         }
     632     2541208 :         d = &sml->dfa;
     633     2541208 :         d->ssets = sml->ssets;
     634     2541208 :         d->statesarea = sml->statesarea;
     635     2541208 :         d->work = &d->statesarea[nss];
     636     2541208 :         d->outsarea = sml->outsarea;
     637     2541208 :         d->incarea = sml->incarea;
     638     2541208 :         d->ismalloced = ismalloced;
     639     2541208 :         d->arraysmalloced = false;   /* not separately allocated, anyway */
     640             :     }
     641             :     else
     642             :     {
     643      259598 :         d = (struct dfa *) MALLOC(sizeof(struct dfa));
     644      259598 :         if (d == NULL)
     645             :         {
     646           0 :             ERR(REG_ESPACE);
     647           0 :             return NULL;
     648             :         }
     649      259598 :         d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
     650      259598 :         d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
     651             :                                             sizeof(unsigned));
     652      259598 :         d->work = &d->statesarea[nss * wordsper];
     653      259598 :         d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
     654             :                                               sizeof(struct sset *));
     655      259598 :         d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
     656             :                                             sizeof(struct arcp));
     657      259598 :         d->ismalloced = true;
     658      259598 :         d->arraysmalloced = true;
     659             :         /* now freedfa() will behave sanely */
     660      259598 :         if (d->ssets == NULL || d->statesarea == NULL ||
     661      259598 :             d->outsarea == NULL || d->incarea == NULL)
     662             :         {
     663           0 :             freedfa(d);
     664           0 :             ERR(REG_ESPACE);
     665           0 :             return NULL;
     666             :         }
     667             :     }
     668             : 
     669     2800806 :     d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
     670     2800806 :     d->nssused = 0;
     671     2800806 :     d->nstates = cnfa->nstates;
     672     2800806 :     d->ncolors = cnfa->ncolors;
     673     2800806 :     d->wordsper = wordsper;
     674     2800806 :     d->cnfa = cnfa;
     675     2800806 :     d->cm = cm;
     676     2800806 :     d->lastpost = NULL;
     677     2800806 :     d->lastnopr = NULL;
     678     2800806 :     d->search = d->ssets;
     679     2800806 :     d->backno = -1;              /* may be set by caller */
     680     2800806 :     d->backmin = d->backmax = 0;
     681             : 
     682             :     /* initialization of sset fields is done as needed */
     683             : 
     684     2800806 :     return d;
     685             : }
     686             : 
     687             : /*
     688             :  * freedfa - free a DFA
     689             :  */
     690             : static void
     691     2800806 : freedfa(struct dfa *d)
     692             : {
     693     2800806 :     if (d->arraysmalloced)
     694             :     {
     695      259598 :         if (d->ssets != NULL)
     696      259598 :             FREE(d->ssets);
     697      259598 :         if (d->statesarea != NULL)
     698      259598 :             FREE(d->statesarea);
     699      259598 :         if (d->outsarea != NULL)
     700      259598 :             FREE(d->outsarea);
     701      259598 :         if (d->incarea != NULL)
     702      259598 :             FREE(d->incarea);
     703             :     }
     704             : 
     705     2800806 :     if (d->ismalloced)
     706      282844 :         FREE(d);
     707     2800806 : }
     708             : 
     709             : /*
     710             :  * hash - construct a hash code for a bitvector
     711             :  *
     712             :  * There are probably better ways, but they're more expensive.
     713             :  */
     714             : static unsigned
     715       30742 : hash(unsigned *uv,
     716             :      int n)
     717             : {
     718             :     int         i;
     719             :     unsigned    h;
     720             : 
     721       30742 :     h = 0;
     722      148146 :     for (i = 0; i < n; i++)
     723      117404 :         h ^= uv[i];
     724       30742 :     return h;
     725             : }
     726             : 
     727             : /*
     728             :  * initialize - hand-craft a cache entry for startup, otherwise get ready
     729             :  */
     730             : static struct sset *
     731     2803704 : initialize(struct vars *v,
     732             :            struct dfa *d,
     733             :            chr *start)
     734             : {
     735             :     struct sset *ss;
     736             :     int         i;
     737             : 
     738             :     /* is previous one still there? */
     739     2803704 :     if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
     740        5488 :         ss = &d->ssets[0];
     741             :     else
     742             :     {                           /* no, must (re)build it */
     743     2798216 :         ss = getvacant(v, d, start, start);
     744     2798216 :         if (ss == NULL)
     745           0 :             return NULL;
     746     5598732 :         for (i = 0; i < d->wordsper; i++)
     747     2800516 :             ss->states[i] = 0;
     748     2798216 :         BSET(ss->states, d->cnfa->pre);
     749     2798216 :         ss->hash = HASH(ss->states, d->wordsper);
     750             :         assert(d->cnfa->pre != d->cnfa->post);
     751     2798216 :         ss->flags = STARTER | LOCKED | NOPROGRESS;
     752             :         /* lastseen dealt with below */
     753             :     }
     754             : 
     755     5682368 :     for (i = 0; i < d->nssused; i++)
     756     2878664 :         d->ssets[i].lastseen = NULL;
     757     2803704 :     ss->lastseen = start;        /* maybe untrue, but harmless */
     758     2803704 :     d->lastpost = NULL;
     759     2803704 :     d->lastnopr = NULL;
     760     2803704 :     return ss;
     761             : }
     762             : 
     763             : /*
     764             :  * miss - handle a stateset cache miss
     765             :  *
     766             :  * css is the current stateset, co is the color of the current input character,
     767             :  * cp points to the character after that (which is where we may need to test
     768             :  * LACONs).  start does not affect matching behavior but is needed for pickss'
     769             :  * heuristics about which stateset cache entry to replace.
     770             :  *
     771             :  * Ordinarily, returns the address of the next stateset (the one that is
     772             :  * valid after consuming the input character).  Returns NULL if no valid
     773             :  * NFA states remain, ie we have a certain match failure.
     774             :  * Internal errors also return NULL, with v->err set.
     775             :  */
     776             : static struct sset *
     777    12596964 : miss(struct vars *v,
     778             :      struct dfa *d,
     779             :      struct sset *css,
     780             :      color co,
     781             :      chr *cp,                   /* next chr */
     782             :      chr *start)                /* where the attempt got started */
     783             : {
     784    12596964 :     struct cnfa *cnfa = d->cnfa;
     785             :     int         i;
     786             :     unsigned    h;
     787             :     struct carc *ca;
     788             :     struct sset *p;
     789             :     int         ispseudocolor;
     790             :     int         ispost;
     791             :     int         noprogress;
     792             :     int         gotstate;
     793             :     int         dolacons;
     794             :     int         sawlacons;
     795             : 
     796             :     /* for convenience, we can be called even if it might not be a miss */
     797    12596964 :     if (css->outs[co] != NULL)
     798             :     {
     799             :         FDEBUG(("hit\n"));
     800        4492 :         return css->outs[co];
     801             :     }
     802             :     FDEBUG(("miss\n"));
     803             : 
     804             :     /*
     805             :      * Checking for operation cancel in the inner text search loop seems
     806             :      * unduly expensive.  As a compromise, check during cache misses.
     807             :      */
     808    12592472 :     INTERRUPT(v->re);
     809             : 
     810             :     /*
     811             :      * What set of states would we end up in after consuming the co character?
     812             :      * We first consider PLAIN arcs that consume the character, and then look
     813             :      * to see what LACON arcs could be traversed after consuming it.
     814             :      */
     815    25274634 :     for (i = 0; i < d->wordsper; i++)
     816    12682162 :         d->work[i] = 0;          /* build new stateset bitmap in d->work */
     817    12592472 :     ispseudocolor = d->cm->cd[co].flags & PSEUDO;
     818    12592472 :     ispost = 0;
     819    12592472 :     noprogress = 1;
     820    12592472 :     gotstate = 0;
     821    97867778 :     for (i = 0; i < d->nstates; i++)
     822    85275306 :         if (ISBSET(css->states, i))
     823    87284672 :             for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
     824    65512584 :                 if (ca->co == co ||
     825    59768928 :                     (ca->co == RAINBOW && !ispseudocolor))
     826             :                 {
     827    22732324 :                     BSET(d->work, ca->to);
     828    22732324 :                     gotstate = 1;
     829    22732324 :                     if (ca->to == cnfa->post)
     830     2099310 :                         ispost = 1;
     831    22732324 :                     if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
     832     7630160 :                         noprogress = 0;
     833             :                     FDEBUG(("%d -> %d\n", i, ca->to));
     834             :                 }
     835    12592472 :     if (!gotstate)
     836     1639416 :         return NULL;            /* character cannot reach any new state */
     837    10953056 :     dolacons = (cnfa->flags & HASLACONS);
     838    10953056 :     sawlacons = 0;
     839             :     /* outer loop handles transitive closure of reachable-by-LACON states */
     840    10954660 :     while (dolacons)
     841             :     {
     842        1604 :         dolacons = 0;
     843       14692 :         for (i = 0; i < d->nstates; i++)
     844       13088 :             if (ISBSET(d->work, i))
     845        5200 :                 for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
     846             :                 {
     847        3120 :                     if (ca->co < cnfa->ncolors)
     848        2748 :                         continue;   /* not a LACON arc */
     849         372 :                     if (ISBSET(d->work, ca->to))
     850         132 :                         continue;   /* arc would be a no-op anyway */
     851         240 :                     sawlacons = 1;  /* this LACON affects our result */
     852         240 :                     if (!lacon(v, cnfa, cp, ca->co))
     853             :                     {
     854         126 :                         if (ISERR())
     855           0 :                             return NULL;
     856         126 :                         continue;   /* LACON arc cannot be traversed */
     857             :                     }
     858         114 :                     if (ISERR())
     859           0 :                         return NULL;
     860         114 :                     BSET(d->work, ca->to);
     861         114 :                     dolacons = 1;
     862         114 :                     if (ca->to == cnfa->post)
     863           0 :                         ispost = 1;
     864         114 :                     if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
     865         114 :                         noprogress = 0;
     866             :                     FDEBUG(("%d :> %d\n", i, ca->to));
     867             :                 }
     868             :     }
     869    10953056 :     h = HASH(d->work, d->wordsper);
     870             : 
     871             :     /* Is this stateset already in the cache? */
     872    38328488 :     for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
     873    29682010 :         if (HIT(h, d->work, p, d->wordsper))
     874             :         {
     875             :             FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
     876             :             break;              /* NOTE BREAK OUT */
     877             :         }
     878    10953056 :     if (i == 0)
     879             :     {                           /* nope, need a new cache entry */
     880     8646478 :         p = getvacant(v, d, cp, start);
     881     8646478 :         if (p == NULL)
     882           0 :             return NULL;
     883             :         assert(p != css);
     884    17368564 :         for (i = 0; i < d->wordsper; i++)
     885     8722086 :             p->states[i] = d->work[i];
     886     8646478 :         p->hash = h;
     887     8646478 :         p->flags = (ispost) ? POSTSTATE : 0;
     888     8646478 :         if (noprogress)
     889     2801294 :             p->flags |= NOPROGRESS;
     890             :         /* lastseen to be dealt with by caller */
     891             :     }
     892             : 
     893             :     /*
     894             :      * Link new stateset to old, unless a LACON affected the result, in which
     895             :      * case we don't create the link.  That forces future transitions across
     896             :      * this same arc (same prior stateset and character color) to come through
     897             :      * miss() again, so that we can recheck the LACON(s), which might or might
     898             :      * not pass since context will be different.
     899             :      */
     900    10953056 :     if (!sawlacons)
     901             :     {
     902             :         FDEBUG(("c%d[%d]->c%d\n",
     903             :                 (int) (css - d->ssets), co, (int) (p - d->ssets)));
     904    10952880 :         css->outs[co] = p;
     905    10952880 :         css->inchain[co] = p->ins;
     906    10952880 :         p->ins.ss = css;
     907    10952880 :         p->ins.co = co;
     908             :     }
     909    10953056 :     return p;
     910             : }
     911             : 
     912             : /*
     913             :  * lacon - lookaround-constraint checker for miss()
     914             :  */
     915             : static int                      /* predicate:  constraint satisfied? */
     916         240 : lacon(struct vars *v,
     917             :       struct cnfa *pcnfa,       /* parent cnfa */
     918             :       chr *cp,
     919             :       color co)                 /* "color" of the lookaround constraint */
     920             : {
     921             :     int         n;
     922             :     struct subre *sub;
     923             :     struct dfa *d;
     924             :     chr        *end;
     925             :     int         satisfied;
     926             : 
     927             :     /* Since this is recursive, it could be driven to stack overflow */
     928         240 :     if (STACK_TOO_DEEP(v->re))
     929             :     {
     930           0 :         ERR(REG_ETOOBIG);
     931           0 :         return 0;
     932             :     }
     933             : 
     934         240 :     n = co - pcnfa->ncolors;
     935             :     assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
     936             :     FDEBUG(("=== testing lacon %d\n", n));
     937         240 :     sub = &v->g->lacons[n];
     938         240 :     d = getladfa(v, n);
     939         240 :     if (d == NULL)
     940           0 :         return 0;
     941         240 :     if (LATYPE_IS_AHEAD(sub->latype))
     942             :     {
     943             :         /* used to use longest() here, but shortest() could be much cheaper */
     944         120 :         end = shortest(v, d, cp, cp, v->stop,
     945             :                        (chr **) NULL, (int *) NULL);
     946         120 :         satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
     947             :     }
     948             :     else
     949             :     {
     950             :         /*
     951             :          * To avoid doing O(N^2) work when repeatedly testing a lookbehind
     952             :          * constraint in an N-character string, we use matchuntil() which can
     953             :          * cache the DFA state across calls.  We only need to restart if the
     954             :          * probe point decreases, which is not common.  The NFA we're using is
     955             :          * a search NFA, so it doesn't mind scanning over stuff before the
     956             :          * nominal match.
     957             :          */
     958         120 :         satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
     959         120 :         if (!LATYPE_IS_POS(sub->latype))
     960           0 :             satisfied = !satisfied;
     961             :     }
     962             :     FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
     963         240 :     return satisfied;
     964             : }
     965             : 
     966             : /*
     967             :  * getvacant - get a vacant state set
     968             :  *
     969             :  * This routine clears out the inarcs and outarcs, but does not otherwise
     970             :  * clear the innards of the state set -- that's up to the caller.
     971             :  */
     972             : static struct sset *
     973    11444694 : getvacant(struct vars *v,
     974             :           struct dfa *d,
     975             :           chr *cp,
     976             :           chr *start)
     977             : {
     978             :     int         i;
     979             :     struct sset *ss;
     980             :     struct sset *p;
     981             :     struct arcp ap;
     982             :     color       co;
     983             : 
     984    11444694 :     ss = pickss(v, d, cp, start);
     985    11444694 :     if (ss == NULL)
     986           0 :         return NULL;
     987             :     assert(!(ss->flags & LOCKED));
     988             : 
     989             :     /* clear out its inarcs, including self-referential ones */
     990    11444694 :     ap = ss->ins;
     991    11444718 :     while ((p = ap.ss) != NULL)
     992             :     {
     993          24 :         co = ap.co;
     994             :         FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
     995          24 :         p->outs[co] = NULL;
     996          24 :         ap = p->inchain[co];
     997          24 :         p->inchain[co].ss = NULL;    /* paranoia */
     998             :     }
     999    11444694 :     ss->ins.ss = NULL;
    1000             : 
    1001             :     /* take it off the inarc chains of the ssets reached by its outarcs */
    1002    99926822 :     for (i = 0; i < d->ncolors; i++)
    1003             :     {
    1004    88482128 :         p = ss->outs[i];
    1005             :         assert(p != ss);        /* not self-referential */
    1006    88482128 :         if (p == NULL)
    1007    88481996 :             continue;           /* NOTE CONTINUE */
    1008             :         FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
    1009         132 :         if (p->ins.ss == ss && p->ins.co == i)
    1010         120 :             p->ins = ss->inchain[i];
    1011             :         else
    1012             :         {
    1013          12 :             struct arcp lastap = {NULL, 0};
    1014             : 
    1015             :             assert(p->ins.ss != NULL);
    1016          24 :             for (ap = p->ins; ap.ss != NULL &&
    1017          24 :                  !(ap.ss == ss && ap.co == i);
    1018          12 :                  ap = ap.ss->inchain[ap.co])
    1019          12 :                 lastap = ap;
    1020             :             assert(ap.ss != NULL);
    1021          12 :             lastap.ss->inchain[lastap.co] = ss->inchain[i];
    1022             :         }
    1023         132 :         ss->outs[i] = NULL;
    1024         132 :         ss->inchain[i].ss = NULL;
    1025             :     }
    1026             : 
    1027             :     /* if ss was a success state, may need to remember location */
    1028    11444694 :     if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
    1029          36 :         (d->lastpost == NULL || d->lastpost < ss->lastseen))
    1030          36 :         d->lastpost = ss->lastseen;
    1031             : 
    1032             :     /* likewise for a no-progress state */
    1033    11444694 :     if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
    1034          12 :         (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
    1035          12 :         d->lastnopr = ss->lastseen;
    1036             : 
    1037    11444694 :     return ss;
    1038             : }
    1039             : 
    1040             : /*
    1041             :  * pickss - pick the next stateset to be used
    1042             :  */
    1043             : static struct sset *
    1044    11444694 : pickss(struct vars *v,
    1045             :        struct dfa *d,
    1046             :        chr *cp,
    1047             :        chr *start)
    1048             : {
    1049             :     int         i;
    1050             :     struct sset *ss;
    1051             :     struct sset *end;
    1052             :     chr        *ancient;
    1053             : 
    1054             :     /* shortcut for cases where cache isn't full */
    1055    11444694 :     if (d->nssused < d->nssets)
    1056             :     {
    1057    11444562 :         i = d->nssused;
    1058    11444562 :         d->nssused++;
    1059    11444562 :         ss = &d->ssets[i];
    1060             :         FDEBUG(("new c%d\n", i));
    1061             :         /* set up innards */
    1062    11444562 :         ss->states = &d->statesarea[i * d->wordsper];
    1063    11444562 :         ss->flags = 0;
    1064    11444562 :         ss->ins.ss = NULL;
    1065    11444562 :         ss->ins.co = WHITE;      /* give it some value */
    1066    11444562 :         ss->outs = &d->outsarea[i * d->ncolors];
    1067    11444562 :         ss->inchain = &d->incarea[i * d->ncolors];
    1068    99923618 :         for (i = 0; i < d->ncolors; i++)
    1069             :         {
    1070    88479056 :             ss->outs[i] = NULL;
    1071    88479056 :             ss->inchain[i].ss = NULL;
    1072             :         }
    1073    11444562 :         return ss;
    1074             :     }
    1075             : 
    1076             :     /* look for oldest, or old enough anyway */
    1077         132 :     if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
    1078         132 :         ancient = cp - d->nssets * 2 / 3;
    1079             :     else
    1080           0 :         ancient = start;
    1081         144 :     for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
    1082         132 :         if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
    1083         132 :             !(ss->flags & LOCKED))
    1084             :         {
    1085         120 :             d->search = ss + 1;
    1086             :             FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
    1087         120 :             return ss;
    1088             :         }
    1089          24 :     for (ss = d->ssets, end = d->search; ss < end; ss++)
    1090          24 :         if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
    1091          24 :             !(ss->flags & LOCKED))
    1092             :         {
    1093          12 :             d->search = ss + 1;
    1094             :             FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
    1095          12 :             return ss;
    1096             :         }
    1097             : 
    1098             :     /* nobody's old enough?!? -- something's really wrong */
    1099             :     FDEBUG(("cannot find victim to replace!\n"));
    1100           0 :     ERR(REG_ASSERT);
    1101           0 :     return NULL;
    1102             : }

Generated by: LCOV version 1.14