LCOV - code coverage report
Current view: top level - src/backend/regex - rege_dfa.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 90.2 % 448 404
Test Date: 2026-03-02 08:16:13 Functions: 100.0 % 13 13
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*
       2              :  * DFA routines
       3              :  * This file is #included by regexec.c.
       4              :  *
       5              :  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
       6              :  *
       7              :  * Development of this software was funded, in part, by Cray Research Inc.,
       8              :  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
       9              :  * Corporation, none of whom are responsible for the results.  The author
      10              :  * thanks all of them.
      11              :  *
      12              :  * Redistribution and use in source and binary forms -- with or without
      13              :  * modification -- are permitted for any purpose, provided that
      14              :  * redistributions in source form retain this entire copyright notice and
      15              :  * indicate the origin and nature of any modifications.
      16              :  *
      17              :  * I'd appreciate being given credit for this package in the documentation
      18              :  * of software which uses it, but that is not a requirement.
      19              :  *
      20              :  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
      21              :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      22              :  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
      23              :  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      24              :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      25              :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
      26              :  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
      27              :  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
      28              :  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
      29              :  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      30              :  *
      31              :  * src/backend/regex/rege_dfa.c
      32              :  *
      33              :  */
      34              : 
      35              : /*
      36              :  * longest - longest-preferred matching engine
      37              :  *
      38              :  * On success, returns match endpoint address.  Returns NULL on no match.
      39              :  * Internal errors also return NULL, with v->err set.
      40              :  */
      41              : static chr *
      42       470015 : longest(struct vars *v,
      43              :         struct dfa *d,
      44              :         chr *start,             /* where the match should start */
      45              :         chr *stop,              /* match must end at or before here */
      46              :         int *hitstopp)          /* record whether hit v->stop, if non-NULL */
      47              : {
      48              :     chr        *cp;
      49       470015 :     chr        *realstop = (stop == v->stop) ? stop : stop + 1;
      50              :     color       co;
      51              :     struct sset *css;
      52              :     struct sset *ss;
      53              :     chr        *post;
      54              :     int         i;
      55       470015 :     struct colormap *cm = d->cm;
      56              : 
      57              :     /* prevent "uninitialized variable" warnings */
      58       470015 :     if (hitstopp != NULL)
      59       453356 :         *hitstopp = 0;
      60              : 
      61              :     /* if this is a backref to a known string, just match against that */
      62       470015 :     if (d->backno >= 0)
      63              :     {
      64              :         assert((size_t) d->backno < v->nmatch);
      65          793 :         if (v->pmatch[d->backno].rm_so >= 0)
      66              :         {
      67          616 :             cp = dfa_backref(v, d, start, start, stop, false);
      68          616 :             if (cp == v->stop && stop == v->stop && hitstopp != NULL)
      69            0 :                 *hitstopp = 1;
      70          616 :             return cp;
      71              :         }
      72              :     }
      73              : 
      74              :     /* fast path for matchall NFAs */
      75       469399 :     if (d->cnfa->flags & MATCHALL)
      76              :     {
      77         2295 :         size_t      nchr = stop - start;
      78         2295 :         size_t      maxmatchall = d->cnfa->maxmatchall;
      79              : 
      80         2295 :         if (nchr < d->cnfa->minmatchall)
      81          165 :             return NULL;
      82         2130 :         if (maxmatchall == DUPINF)
      83              :         {
      84         1278 :             if (stop == v->stop && hitstopp != NULL)
      85            5 :                 *hitstopp = 1;
      86              :         }
      87              :         else
      88              :         {
      89          852 :             if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
      90           84 :                 *hitstopp = 1;
      91          852 :             if (nchr > maxmatchall)
      92          576 :                 return start + maxmatchall;
      93              :         }
      94         1554 :         return stop;
      95              :     }
      96              : 
      97              :     /* initialize */
      98       467104 :     css = initialize(v, d, start);
      99       467104 :     if (css == NULL)
     100            0 :         return NULL;
     101       467104 :     cp = start;
     102              : 
     103              :     /* startup */
     104              :     FDEBUG(("+++ startup +++\n"));
     105       467104 :     if (cp == v->start)
     106              :     {
     107         1675 :         co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
     108              :         FDEBUG(("color %ld\n", (long) co));
     109              :     }
     110              :     else
     111              :     {
     112       465429 :         co = GETCOLOR(cm, *(cp - 1));
     113              :         FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
     114              :     }
     115       467104 :     css = miss(v, d, css, co, cp, start);
     116       467104 :     if (css == NULL)
     117          216 :         return NULL;
     118       466888 :     css->lastseen = cp;
     119              : 
     120              :     /*
     121              :      * This is the main text-scanning loop.  It seems worth having two copies
     122              :      * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
     123              :      * builds, when you're not actively tracing.
     124              :      */
     125              : #ifdef REG_DEBUG
     126              :     if (v->eflags & REG_FTRACE)
     127              :     {
     128              :         while (cp < realstop)
     129              :         {
     130              :             FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
     131              :             co = GETCOLOR(cm, *cp);
     132              :             FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     133              :             ss = css->outs[co];
     134              :             if (ss == NULL)
     135              :             {
     136              :                 ss = miss(v, d, css, co, cp + 1, start);
     137              :                 if (ss == NULL)
     138              :                     break;      /* NOTE BREAK OUT */
     139              :             }
     140              :             cp++;
     141              :             ss->lastseen = cp;
     142              :             css = ss;
     143              :         }
     144              :     }
     145              :     else
     146              : #endif
     147              :     {
     148      5952717 :         while (cp < realstop)
     149              :         {
     150      5939649 :             co = GETCOLOR(cm, *cp);
     151      5939649 :             ss = css->outs[co];
     152      5939649 :             if (ss == NULL)
     153              :             {
     154      1496964 :                 ss = miss(v, d, css, co, cp + 1, start);
     155      1496964 :                 if (ss == NULL)
     156       453820 :                     break;      /* NOTE BREAK OUT */
     157              :             }
     158      5485829 :             cp++;
     159      5485829 :             ss->lastseen = cp;
     160      5485829 :             css = ss;
     161              :         }
     162              :     }
     163              : 
     164       466888 :     if (ISERR())
     165            0 :         return NULL;
     166              : 
     167              :     /* shutdown */
     168              :     FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
     169       466888 :     if (cp == v->stop && stop == v->stop)
     170              :     {
     171         7062 :         if (hitstopp != NULL)
     172         3615 :             *hitstopp = 1;
     173         7062 :         co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
     174              :         FDEBUG(("color %ld\n", (long) co));
     175         7062 :         ss = miss(v, d, css, co, cp, start);
     176         7062 :         if (ISERR())
     177            0 :             return NULL;
     178              :         /* special case:  match ended at eol? */
     179         7062 :         if (ss != NULL && (ss->flags & POSTSTATE))
     180         3836 :             return cp;
     181         3226 :         else if (ss != NULL)
     182            0 :             ss->lastseen = cp;   /* to be tidy */
     183              :     }
     184              : 
     185              :     /* find last match, if any */
     186       463052 :     post = d->lastpost;
     187      2402998 :     for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
     188      1939946 :         if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
     189         7080 :             (post == NULL || post < ss->lastseen))
     190       467910 :             post = ss->lastseen;
     191       463052 :     if (post != NULL)           /* found one */
     192       461218 :         return post - 1;
     193              : 
     194         1834 :     return NULL;
     195              : }
     196              : 
     197              : /*
     198              :  * shortest - shortest-preferred matching engine
     199              :  *
     200              :  * On success, returns match endpoint address.  Returns NULL on no match.
     201              :  * Internal errors also return NULL, with v->err set.
     202              :  */
     203              : static chr *
     204      4154107 : shortest(struct vars *v,
     205              :          struct dfa *d,
     206              :          chr *start,            /* where the match should start */
     207              :          chr *min,              /* match must end at or after here */
     208              :          chr *max,              /* match must end at or before here */
     209              :          chr **coldp,           /* store coldstart pointer here, if non-NULL */
     210              :          int *hitstopp)         /* record whether hit v->stop, if non-NULL */
     211              : {
     212              :     chr        *cp;
     213      4154107 :     chr        *realmin = (min == v->stop) ? min : min + 1;
     214      4154107 :     chr        *realmax = (max == v->stop) ? max : max + 1;
     215              :     color       co;
     216              :     struct sset *css;
     217              :     struct sset *ss;
     218      4154107 :     struct colormap *cm = d->cm;
     219              : 
     220              :     /* prevent "uninitialized variable" warnings */
     221      4154107 :     if (coldp != NULL)
     222      4153277 :         *coldp = NULL;
     223      4154107 :     if (hitstopp != NULL)
     224          159 :         *hitstopp = 0;
     225              : 
     226              :     /* if this is a backref to a known string, just match against that */
     227      4154107 :     if (d->backno >= 0)
     228              :     {
     229              :         assert((size_t) d->backno < v->nmatch);
     230            0 :         if (v->pmatch[d->backno].rm_so >= 0)
     231              :         {
     232            0 :             cp = dfa_backref(v, d, start, min, max, true);
     233            0 :             if (cp != NULL && coldp != NULL)
     234            0 :                 *coldp = start;
     235              :             /* there is no case where we should set *hitstopp */
     236            0 :             return cp;
     237              :         }
     238              :     }
     239              : 
     240              :     /* fast path for matchall NFAs */
     241      4154107 :     if (d->cnfa->flags & MATCHALL)
     242              :     {
     243          987 :         size_t      nchr = min - start;
     244              : 
     245          987 :         if (d->cnfa->maxmatchall != DUPINF &&
     246           12 :             nchr > d->cnfa->maxmatchall)
     247            0 :             return NULL;
     248          987 :         if ((max - start) < d->cnfa->minmatchall)
     249            9 :             return NULL;
     250          978 :         if (nchr < d->cnfa->minmatchall)
     251           65 :             min = start + d->cnfa->minmatchall;
     252          978 :         if (coldp != NULL)
     253          448 :             *coldp = start;
     254              :         /* there is no case where we should set *hitstopp */
     255          978 :         return min;
     256              :     }
     257              : 
     258              :     /* initialize */
     259      4153120 :     css = initialize(v, d, start);
     260      4153120 :     if (css == NULL)
     261            0 :         return NULL;
     262      4153120 :     cp = start;
     263              : 
     264              :     /* startup */
     265              :     FDEBUG(("--- startup ---\n"));
     266      4153120 :     if (cp == v->start)
     267              :     {
     268      3704236 :         co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
     269              :         FDEBUG(("color %ld\n", (long) co));
     270              :     }
     271              :     else
     272              :     {
     273       448884 :         co = GETCOLOR(cm, *(cp - 1));
     274              :         FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
     275              :     }
     276      4153120 :     css = miss(v, d, css, co, cp, start);
     277      4153120 :     if (css == NULL)
     278            6 :         return NULL;
     279      4153114 :     css->lastseen = cp;
     280      4153114 :     ss = css;
     281              : 
     282              :     /*
     283              :      * This is the main text-scanning loop.  It seems worth having two copies
     284              :      * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
     285              :      * builds, when you're not actively tracing.
     286              :      */
     287              : #ifdef REG_DEBUG
     288              :     if (v->eflags & REG_FTRACE)
     289              :     {
     290              :         while (cp < realmax)
     291              :         {
     292              :             FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
     293              :             co = GETCOLOR(cm, *cp);
     294              :             FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     295              :             ss = css->outs[co];
     296              :             if (ss == NULL)
     297              :             {
     298              :                 ss = miss(v, d, css, co, cp + 1, start);
     299              :                 if (ss == NULL)
     300              :                     break;      /* NOTE BREAK OUT */
     301              :             }
     302              :             cp++;
     303              :             ss->lastseen = cp;
     304              :             css = ss;
     305              :             if ((ss->flags & POSTSTATE) && cp >= realmin)
     306              :                 break;          /* NOTE BREAK OUT */
     307              :         }
     308              :     }
     309              :     else
     310              : #endif
     311              :     {
     312     22619259 :         while (cp < realmax)
     313              :         {
     314     22358339 :             co = GETCOLOR(cm, *cp);
     315     22358339 :             ss = css->outs[co];
     316     22358339 :             if (ss == NULL)
     317              :             {
     318      7012100 :                 ss = miss(v, d, css, co, cp + 1, start);
     319      7012100 :                 if (ss == NULL)
     320      3323463 :                     break;      /* NOTE BREAK OUT */
     321              :             }
     322     19034876 :             cp++;
     323     19034876 :             ss->lastseen = cp;
     324     19034876 :             css = ss;
     325     19034876 :             if ((ss->flags & POSTSTATE) && cp >= realmin)
     326       568731 :                 break;          /* NOTE BREAK OUT */
     327              :         }
     328              :     }
     329              : 
     330      4153114 :     if (ss == NULL)
     331      3323463 :         return NULL;
     332              : 
     333       829651 :     if (coldp != NULL)          /* report last no-progress state set, if any */
     334       829383 :         *coldp = lastcold(v, d);
     335              : 
     336       829651 :     if ((ss->flags & POSTSTATE) && cp > min)
     337              :     {
     338              :         assert(cp >= realmin);
     339       568715 :         cp--;
     340              :     }
     341       260936 :     else if (cp == v->stop && max == v->stop)
     342              :     {
     343       260936 :         co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
     344              :         FDEBUG(("color %ld\n", (long) co));
     345       260936 :         ss = miss(v, d, css, co, cp, start);
     346              :         /* match might have ended at eol */
     347       260936 :         if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
     348            6 :             *hitstopp = 1;
     349              :     }
     350              : 
     351       829651 :     if (ss == NULL || !(ss->flags & POSTSTATE))
     352       251150 :         return NULL;
     353              : 
     354       578501 :     return cp;
     355              : }
     356              : 
     357              : /*
     358              :  * matchuntil - incremental matching engine
     359              :  *
     360              :  * This is meant for use with a search-style NFA (that is, the pattern is
     361              :  * known to act as though it had a leading .*).  We determine whether a
     362              :  * match exists starting at v->start and ending at probe.  Multiple calls
     363              :  * require only O(N) time not O(N^2) so long as the probe values are
     364              :  * nondecreasing.  *lastcss and *lastcp must be initialized to NULL before
     365              :  * starting a series of calls.
     366              :  *
     367              :  * Returns 1 if a match exists, 0 if not.
     368              :  * Internal errors also return 0, with v->err set.
     369              :  */
     370              : static int
     371           60 : matchuntil(struct vars *v,
     372              :            struct dfa *d,
     373              :            chr *probe,          /* we want to know if a match ends here */
     374              :            struct sset **lastcss,   /* state storage across calls */
     375              :            chr **lastcp)        /* state storage across calls */
     376              : {
     377           60 :     chr        *cp = *lastcp;
     378              :     color       co;
     379           60 :     struct sset *css = *lastcss;
     380              :     struct sset *ss;
     381           60 :     struct colormap *cm = d->cm;
     382              : 
     383              :     /* fast path for matchall NFAs */
     384           60 :     if (d->cnfa->flags & MATCHALL)
     385              :     {
     386           18 :         size_t      nchr = probe - v->start;
     387              : 
     388           18 :         if (nchr < d->cnfa->minmatchall)
     389            9 :             return 0;
     390              :         /* maxmatchall will always be infinity, cf. makesearch() */
     391              :         assert(d->cnfa->maxmatchall == DUPINF);
     392            9 :         return 1;
     393              :     }
     394              : 
     395              :     /* initialize and startup, or restart, if necessary */
     396           42 :     if (cp == NULL || cp > probe)
     397              :     {
     398           12 :         cp = v->start;
     399           12 :         css = initialize(v, d, cp);
     400           12 :         if (css == NULL)
     401            0 :             return 0;
     402              : 
     403              :         FDEBUG((">>> startup >>>\n"));
     404           12 :         co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
     405              :         FDEBUG(("color %ld\n", (long) co));
     406              : 
     407           12 :         css = miss(v, d, css, co, cp, v->start);
     408           12 :         if (css == NULL)
     409            0 :             return 0;
     410           12 :         css->lastseen = cp;
     411              :     }
     412           30 :     else if (css == NULL)
     413              :     {
     414              :         /* we previously found that no match is possible beyond *lastcp */
     415            0 :         return 0;
     416              :     }
     417           42 :     ss = css;
     418              : 
     419              :     /*
     420              :      * This is the main text-scanning loop.  It seems worth having two copies
     421              :      * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
     422              :      * builds, when you're not actively tracing.
     423              :      */
     424              : #ifdef REG_DEBUG
     425              :     if (v->eflags & REG_FTRACE)
     426              :     {
     427              :         while (cp < probe)
     428              :         {
     429              :             FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
     430              :             co = GETCOLOR(cm, *cp);
     431              :             FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     432              :             ss = css->outs[co];
     433              :             if (ss == NULL)
     434              :             {
     435              :                 ss = miss(v, d, css, co, cp + 1, v->start);
     436              :                 if (ss == NULL)
     437              :                     break;      /* NOTE BREAK OUT */
     438              :             }
     439              :             cp++;
     440              :             ss->lastseen = cp;
     441              :             css = ss;
     442              :         }
     443              :     }
     444              :     else
     445              : #endif
     446              :     {
     447           90 :         while (cp < probe)
     448              :         {
     449           48 :             co = GETCOLOR(cm, *cp);
     450           48 :             ss = css->outs[co];
     451           48 :             if (ss == NULL)
     452              :             {
     453            6 :                 ss = miss(v, d, css, co, cp + 1, v->start);
     454            6 :                 if (ss == NULL)
     455            0 :                     break;      /* NOTE BREAK OUT */
     456              :             }
     457           48 :             cp++;
     458           48 :             ss->lastseen = cp;
     459           48 :             css = ss;
     460              :         }
     461              :     }
     462              : 
     463           42 :     *lastcss = ss;
     464           42 :     *lastcp = cp;
     465              : 
     466           42 :     if (ss == NULL)
     467            0 :         return 0;               /* impossible match, or internal error */
     468              : 
     469              :     /* We need to process one more chr, or the EOS symbol, to check match */
     470           42 :     if (cp < v->stop)
     471              :     {
     472              :         FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
     473           42 :         co = GETCOLOR(cm, *cp);
     474              :         FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
     475           42 :         ss = css->outs[co];
     476           42 :         if (ss == NULL)
     477           27 :             ss = miss(v, d, css, co, cp + 1, v->start);
     478              :     }
     479              :     else
     480              :     {
     481              :         assert(cp == v->stop);
     482            0 :         co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
     483              :         FDEBUG(("color %ld\n", (long) co));
     484            0 :         ss = miss(v, d, css, co, cp, v->start);
     485              :     }
     486              : 
     487           42 :     if (ss == NULL || !(ss->flags & POSTSTATE))
     488           30 :         return 0;
     489              : 
     490           12 :     return 1;
     491              : }
     492              : 
     493              : /*
     494              :  * dfa_backref - find best match length for a known backref string
     495              :  *
     496              :  * When the backref's referent is already available, we can deliver an exact
     497              :  * answer with considerably less work than running the backref node's NFA.
     498              :  *
     499              :  * Return match endpoint for longest or shortest valid repeated match,
     500              :  * or NULL if there is no valid match.
     501              :  *
     502              :  * Should be in sync with cbrdissect(), although that has the different task
     503              :  * of checking a match to a predetermined section of the string.
     504              :  */
     505              : static chr *
     506          616 : dfa_backref(struct vars *v,
     507              :             struct dfa *d,
     508              :             chr *start,         /* where the match should start */
     509              :             chr *min,           /* match must end at or after here */
     510              :             chr *max,           /* match must end at or before here */
     511              :             bool shortest)
     512              : {
     513          616 :     int         n = d->backno;
     514          616 :     int         backmin = d->backmin;
     515          616 :     int         backmax = d->backmax;
     516              :     size_t      numreps;
     517              :     size_t      minreps;
     518              :     size_t      maxreps;
     519              :     size_t      brlen;
     520              :     chr        *brstring;
     521              :     chr        *p;
     522              : 
     523              :     /* get the backreferenced string (caller should have checked this) */
     524          616 :     if (v->pmatch[n].rm_so == -1)
     525            0 :         return NULL;
     526          616 :     brstring = v->start + v->pmatch[n].rm_so;
     527          616 :     brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
     528              : 
     529              :     /* special-case zero-length backreference to avoid divide by zero */
     530          616 :     if (brlen == 0)
     531              :     {
     532              :         /*
     533              :          * matches only a zero-length string, but any number of repetitions
     534              :          * can be considered to be present
     535              :          */
     536            1 :         if (min == start && backmin <= backmax)
     537            1 :             return start;
     538            0 :         return NULL;
     539              :     }
     540              : 
     541              :     /*
     542              :      * convert min and max into numbers of possible repetitions of the backref
     543              :      * string, rounding appropriately
     544              :      */
     545          615 :     if (min <= start)
     546          615 :         minreps = 0;
     547              :     else
     548            0 :         minreps = (min - start - 1) / brlen + 1;
     549          615 :     maxreps = (max - start) / brlen;
     550              : 
     551              :     /* apply bounds, then see if there is any allowed match length */
     552          615 :     if (minreps < backmin)
     553          597 :         minreps = backmin;
     554          615 :     if (backmax != DUPINF && maxreps > backmax)
     555          297 :         maxreps = backmax;
     556          615 :     if (maxreps < minreps)
     557          134 :         return NULL;
     558              : 
     559              :     /* quick exit if zero-repetitions match is valid and preferred */
     560          481 :     if (shortest && minreps == 0)
     561            0 :         return start;
     562              : 
     563              :     /* okay, compare the actual string contents */
     564          481 :     p = start;
     565          481 :     numreps = 0;
     566          570 :     while (numreps < maxreps)
     567              :     {
     568          492 :         if ((*v->g->compare) (brstring, p, brlen) != 0)
     569          403 :             break;
     570           89 :         p += brlen;
     571           89 :         numreps++;
     572           89 :         if (shortest && numreps >= minreps)
     573            0 :             break;
     574              :     }
     575              : 
     576          481 :     if (numreps >= minreps)
     577           82 :         return p;
     578          399 :     return NULL;
     579              : }
     580              : 
     581              : /*
     582              :  * lastcold - determine last point at which no progress had been made
     583              :  */
     584              : static chr *                    /* endpoint, or NULL */
     585       829383 : lastcold(struct vars *v,
     586              :          struct dfa *d)
     587              : {
     588              :     struct sset *ss;
     589              :     chr        *nopr;
     590              :     int         i;
     591              : 
     592       829383 :     nopr = d->lastnopr;
     593       829383 :     if (nopr == NULL)
     594       829381 :         nopr = v->start;
     595      4450107 :     for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
     596      3620724 :         if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
     597      1182628 :             nopr = ss->lastseen;
     598       829383 :     return nopr;
     599              : }
     600              : 
     601              : /*
     602              :  * newdfa - set up a fresh DFA
     603              :  *
     604              :  * Returns NULL (and sets v->err) on failure.
     605              :  */
     606              : static struct dfa *
     607      4618783 : newdfa(struct vars *v,
     608              :        struct cnfa *cnfa,
     609              :        struct colormap *cm,
     610              :        struct smalldfa *sml)    /* preallocated space, may be NULL */
     611              : {
     612              :     struct dfa *d;
     613      4618783 :     size_t      nss = cnfa->nstates * 2;
     614      4618783 :     int         wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
     615      4618783 :     bool        ismalloced = false;
     616              : 
     617              :     assert(cnfa != NULL && cnfa->nstates != 0);
     618              : 
     619      4618783 :     if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
     620              :     {
     621              :         assert(wordsper == 1);
     622      2280277 :         if (sml == NULL)
     623              :         {
     624        11729 :             sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
     625        11729 :             if (sml == NULL)
     626              :             {
     627            0 :                 ERR(REG_ESPACE);
     628            0 :                 return NULL;
     629              :             }
     630        11729 :             ismalloced = true;
     631              :         }
     632      2280277 :         d = &sml->dfa;
     633      2280277 :         d->ssets = sml->ssets;
     634      2280277 :         d->statesarea = sml->statesarea;
     635      2280277 :         d->work = &d->statesarea[nss];
     636      2280277 :         d->outsarea = sml->outsarea;
     637      2280277 :         d->incarea = sml->incarea;
     638      2280277 :         d->ismalloced = ismalloced;
     639      2280277 :         d->arraysmalloced = false;   /* not separately allocated, anyway */
     640              :     }
     641              :     else
     642              :     {
     643      2338506 :         d = (struct dfa *) MALLOC(sizeof(struct dfa));
     644      2338506 :         if (d == NULL)
     645              :         {
     646            0 :             ERR(REG_ESPACE);
     647            0 :             return NULL;
     648              :         }
     649      2338506 :         d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
     650      2338506 :         d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
     651              :                                             sizeof(unsigned));
     652      2338506 :         d->work = &d->statesarea[nss * wordsper];
     653      2338506 :         d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
     654              :                                               sizeof(struct sset *));
     655      2338506 :         d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
     656              :                                             sizeof(struct arcp));
     657      2338506 :         d->ismalloced = true;
     658      2338506 :         d->arraysmalloced = true;
     659              :         /* now freedfa() will behave sanely */
     660      2338506 :         if (d->ssets == NULL || d->statesarea == NULL ||
     661      2338506 :             d->outsarea == NULL || d->incarea == NULL)
     662              :         {
     663            0 :             freedfa(d);
     664            0 :             ERR(REG_ESPACE);
     665            0 :             return NULL;
     666              :         }
     667              :     }
     668              : 
     669      4618783 :     d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
     670      4618783 :     d->nssused = 0;
     671      4618783 :     d->nstates = cnfa->nstates;
     672      4618783 :     d->ncolors = cnfa->ncolors;
     673      4618783 :     d->wordsper = wordsper;
     674      4618783 :     d->cnfa = cnfa;
     675      4618783 :     d->cm = cm;
     676      4618783 :     d->lastpost = NULL;
     677      4618783 :     d->lastnopr = NULL;
     678      4618783 :     d->search = d->ssets;
     679      4618783 :     d->backno = -1;              /* may be set by caller */
     680      4618783 :     d->backmin = d->backmax = 0;
     681              : 
     682              :     /* initialization of sset fields is done as needed */
     683              : 
     684      4618783 :     return d;
     685              : }
     686              : 
     687              : /*
     688              :  * freedfa - free a DFA
     689              :  */
     690              : static void
     691      4618783 : freedfa(struct dfa *d)
     692              : {
     693      4618783 :     if (d->arraysmalloced)
     694              :     {
     695      2338506 :         if (d->ssets != NULL)
     696      2338506 :             FREE(d->ssets);
     697      2338506 :         if (d->statesarea != NULL)
     698      2338506 :             FREE(d->statesarea);
     699      2338506 :         if (d->outsarea != NULL)
     700      2338506 :             FREE(d->outsarea);
     701      2338506 :         if (d->incarea != NULL)
     702      2338506 :             FREE(d->incarea);
     703              :     }
     704              : 
     705      4618783 :     if (d->ismalloced)
     706      2350235 :         FREE(d);
     707      4618783 : }
     708              : 
     709              : /*
     710              :  * hash - construct a hash code for a bitvector
     711              :  *
     712              :  * There are probably better ways, but they're more expensive.
     713              :  */
     714              : static unsigned
     715        16630 : hash(unsigned *uv,
     716              :      int n)
     717              : {
     718              :     int         i;
     719              :     unsigned    h;
     720              : 
     721        16630 :     h = 0;
     722        76698 :     for (i = 0; i < n; i++)
     723        60068 :         h ^= uv[i];
     724        16630 :     return h;
     725              : }
     726              : 
     727              : /*
     728              :  * initialize - hand-craft a cache entry for startup, otherwise get ready
     729              :  */
     730              : static struct sset *
     731      4620236 : initialize(struct vars *v,
     732              :            struct dfa *d,
     733              :            chr *start)
     734              : {
     735              :     struct sset *ss;
     736              :     int         i;
     737              : 
     738              :     /* is previous one still there? */
     739      4620236 :     if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
     740         2756 :         ss = &d->ssets[0];
     741              :     else
     742              :     {                           /* no, must (re)build it */
     743      4617480 :         ss = getvacant(v, d, start, start);
     744      4617480 :         if (ss == NULL)
     745            0 :             return NULL;
     746      9236345 :         for (i = 0; i < d->wordsper; i++)
     747      4618865 :             ss->states[i] = 0;
     748      4617480 :         BSET(ss->states, d->cnfa->pre);
     749      4617480 :         ss->hash = HASH(ss->states, d->wordsper);
     750              :         assert(d->cnfa->pre != d->cnfa->post);
     751      4617480 :         ss->flags = STARTER | LOCKED | NOPROGRESS;
     752              :         /* lastseen dealt with below */
     753              :     }
     754              : 
     755      9277976 :     for (i = 0; i < d->nssused; i++)
     756      4657740 :         d->ssets[i].lastseen = NULL;
     757      4620236 :     ss->lastseen = start;        /* maybe untrue, but harmless */
     758      4620236 :     d->lastpost = NULL;
     759      4620236 :     d->lastnopr = NULL;
     760      4620236 :     return ss;
     761              : }
     762              : 
     763              : /*
     764              :  * miss - handle a stateset cache miss
     765              :  *
     766              :  * css is the current stateset, co is the color of the current input character,
     767              :  * cp points to the character after that (which is where we may need to test
     768              :  * LACONs).  start does not affect matching behavior but is needed for pickss'
     769              :  * heuristics about which stateset cache entry to replace.
     770              :  *
     771              :  * Ordinarily, returns the address of the next stateset (the one that is
     772              :  * valid after consuming the input character).  Returns NULL if no valid
     773              :  * NFA states remain, ie we have a certain match failure.
     774              :  * Internal errors also return NULL, with v->err set.
     775              :  */
     776              : static struct sset *
     777     13397331 : miss(struct vars *v,
     778              :      struct dfa *d,
     779              :      struct sset *css,
     780              :      color co,
     781              :      chr *cp,                   /* next chr */
     782              :      chr *start)                /* where the attempt got started */
     783              : {
     784     13397331 :     struct cnfa *cnfa = d->cnfa;
     785              :     int         i;
     786              :     unsigned    h;
     787              :     struct carc *ca;
     788              :     struct sset *p;
     789              :     int         ispseudocolor;
     790              :     int         ispost;
     791              :     int         noprogress;
     792              :     int         gotstate;
     793              :     int         dolacons;
     794              :     int         sawlacons;
     795              : 
     796              :     /* for convenience, we can be called even if it might not be a miss */
     797     13397331 :     if (css->outs[co] != NULL)
     798              :     {
     799              :         FDEBUG(("hit\n"));
     800         2252 :         return css->outs[co];
     801              :     }
     802              :     FDEBUG(("miss\n"));
     803              : 
     804              :     /*
     805              :      * Checking for operation cancel in the inner text search loop seems
     806              :      * unduly expensive.  As a compromise, check during cache misses.
     807              :      */
     808     13395079 :     INTERRUPT(v->re);
     809              : 
     810              :     /*
     811              :      * What set of states would we end up in after consuming the co character?
     812              :      * We first consider PLAIN arcs that consume the character, and then look
     813              :      * to see what LACON arcs could be traversed after consuming it.
     814              :      */
     815     26835106 :     for (i = 0; i < d->wordsper; i++)
     816     13440027 :         d->work[i] = 0;          /* build new stateset bitmap in d->work */
     817     13395079 :     ispseudocolor = d->cm->cd[co].flags & PSEUDO;
     818     13395079 :     ispost = 0;
     819     13395079 :     noprogress = 1;
     820     13395079 :     gotstate = 0;
     821    157785817 :     for (i = 0; i < d->nstates; i++)
     822    144390738 :         if (ISBSET(css->states, i))
     823     59076462 :             for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
     824     40936999 :                 if (ca->co == co ||
     825     34273782 :                     (ca->co == RAINBOW && !ispseudocolor))
     826              :                 {
     827     15402901 :                     BSET(d->work, ca->to);
     828     15402901 :                     gotstate = 1;
     829     15402901 :                     if (ca->to == cnfa->post)
     830      1057275 :                         ispost = 1;
     831     15402901 :                     if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
     832      4351472 :                         noprogress = 0;
     833              :                     FDEBUG(("%d -> %d\n", i, ca->to));
     834              :                 }
     835     13395079 :     if (!gotstate)
     836      4031881 :         return NULL;            /* character cannot reach any new state */
     837      9363198 :     dolacons = (cnfa->flags & HASLACONS);
     838      9363198 :     sawlacons = 0;
     839              :     /* outer loop handles transitive closure of reachable-by-LACON states */
     840      9364003 :     while (dolacons)
     841              :     {
     842          805 :         dolacons = 0;
     843         7376 :         for (i = 0; i < d->nstates; i++)
     844         6571 :             if (ISBSET(d->work, i))
     845         2606 :                 for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
     846              :                 {
     847         1563 :                     if (ca->co < cnfa->ncolors)
     848         1377 :                         continue;   /* not a LACON arc */
     849          186 :                     if (ISBSET(d->work, ca->to))
     850           66 :                         continue;   /* arc would be a no-op anyway */
     851          120 :                     sawlacons = 1;  /* this LACON affects our result */
     852          120 :                     if (!lacon(v, cnfa, cp, ca->co))
     853              :                     {
     854           63 :                         if (ISERR())
     855            0 :                             return NULL;
     856           63 :                         continue;   /* LACON arc cannot be traversed */
     857              :                     }
     858           57 :                     if (ISERR())
     859            0 :                         return NULL;
     860           57 :                     BSET(d->work, ca->to);
     861           57 :                     dolacons = 1;
     862           57 :                     if (ca->to == cnfa->post)
     863            0 :                         ispost = 1;
     864           57 :                     if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
     865           57 :                         noprogress = 0;
     866              :                     FDEBUG(("%d :> %d\n", i, ca->to));
     867              :                 }
     868              :     }
     869      9363198 :     h = HASH(d->work, d->wordsper);
     870              : 
     871              :     /* Is this stateset already in the cache? */
     872     28583740 :     for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
     873     20489260 :         if (HIT(h, d->work, p, d->wordsper))
     874              :         {
     875              :             FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
     876              :             break;              /* NOTE BREAK OUT */
     877              :         }
     878      9363198 :     if (i == 0)
     879              :     {                           /* nope, need a new cache entry */
     880      8094480 :         p = getvacant(v, d, cp, start);
     881      8094480 :         if (p == NULL)
     882            0 :             return NULL;
     883              :         assert(p != css);
     884     16225844 :         for (i = 0; i < d->wordsper; i++)
     885      8131364 :             p->states[i] = d->work[i];
     886      8094480 :         p->hash = h;
     887      8094480 :         p->flags = (ispost) ? POSTSTATE : 0;
     888      8094480 :         if (noprogress)
     889      4619624 :             p->flags |= NOPROGRESS;
     890              :         /* lastseen to be dealt with by caller */
     891              :     }
     892              : 
     893              :     /*
     894              :      * Link new stateset to old, unless a LACON affected the result, in which
     895              :      * case we don't create the link.  That forces future transitions across
     896              :      * this same arc (same prior stateset and character color) to come through
     897              :      * miss() again, so that we can recheck the LACON(s), which might or might
     898              :      * not pass since context will be different.
     899              :      */
     900      9363198 :     if (!sawlacons)
     901              :     {
     902              :         FDEBUG(("c%d[%d]->c%d\n",
     903              :                 (int) (css - d->ssets), co, (int) (p - d->ssets)));
     904      9363110 :         css->outs[co] = p;
     905      9363110 :         css->inchain[co] = p->ins;
     906      9363110 :         p->ins.ss = css;
     907      9363110 :         p->ins.co = co;
     908              :     }
     909      9363198 :     return p;
     910              : }
     911              : 
     912              : /*
     913              :  * lacon - lookaround-constraint checker for miss()
     914              :  */
     915              : static int                      /* predicate:  constraint satisfied? */
     916          120 : lacon(struct vars *v,
     917              :       struct cnfa *pcnfa,       /* parent cnfa */
     918              :       chr *cp,
     919              :       color co)                 /* "color" of the lookaround constraint */
     920              : {
     921              :     int         n;
     922              :     struct subre *sub;
     923              :     struct dfa *d;
     924              :     chr        *end;
     925              :     int         satisfied;
     926              : 
     927              :     /* Since this is recursive, it could be driven to stack overflow */
     928          120 :     if (STACK_TOO_DEEP(v->re))
     929              :     {
     930            0 :         ERR(REG_ETOOBIG);
     931            0 :         return 0;
     932              :     }
     933              : 
     934          120 :     n = co - pcnfa->ncolors;
     935              :     assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
     936              :     FDEBUG(("=== testing lacon %d\n", n));
     937          120 :     sub = &v->g->lacons[n];
     938          120 :     d = getladfa(v, n);
     939          120 :     if (d == NULL)
     940            0 :         return 0;
     941          120 :     if (LATYPE_IS_AHEAD(sub->latype))
     942              :     {
     943              :         /* used to use longest() here, but shortest() could be much cheaper */
     944           60 :         end = shortest(v, d, cp, cp, v->stop,
     945              :                        (chr **) NULL, (int *) NULL);
     946           60 :         satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
     947              :     }
     948              :     else
     949              :     {
     950              :         /*
     951              :          * To avoid doing O(N^2) work when repeatedly testing a lookbehind
     952              :          * constraint in an N-character string, we use matchuntil() which can
     953              :          * cache the DFA state across calls.  We only need to restart if the
     954              :          * probe point decreases, which is not common.  The NFA we're using is
     955              :          * a search NFA, so it doesn't mind scanning over stuff before the
     956              :          * nominal match.
     957              :          */
     958           60 :         satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
     959           60 :         if (!LATYPE_IS_POS(sub->latype))
     960            0 :             satisfied = !satisfied;
     961              :     }
     962              :     FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
     963          120 :     return satisfied;
     964              : }
     965              : 
     966              : /*
     967              :  * getvacant - get a vacant state set
     968              :  *
     969              :  * This routine clears out the inarcs and outarcs, but does not otherwise
     970              :  * clear the innards of the state set -- that's up to the caller.
     971              :  */
     972              : static struct sset *
     973     12711960 : getvacant(struct vars *v,
     974              :           struct dfa *d,
     975              :           chr *cp,
     976              :           chr *start)
     977              : {
     978              :     int         i;
     979              :     struct sset *ss;
     980              :     struct sset *p;
     981              :     struct arcp ap;
     982              :     color       co;
     983              : 
     984     12711960 :     ss = pickss(v, d, cp, start);
     985     12711960 :     if (ss == NULL)
     986            0 :         return NULL;
     987              :     assert(!(ss->flags & LOCKED));
     988              : 
     989              :     /* clear out its inarcs, including self-referential ones */
     990     12711960 :     ap = ss->ins;
     991     12711972 :     while ((p = ap.ss) != NULL)
     992              :     {
     993           12 :         co = ap.co;
     994              :         FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
     995           12 :         p->outs[co] = NULL;
     996           12 :         ap = p->inchain[co];
     997           12 :         p->inchain[co].ss = NULL;    /* paranoia */
     998              :     }
     999     12711960 :     ss->ins.ss = NULL;
    1000              : 
    1001              :     /* take it off the inarc chains of the ssets reached by its outarcs */
    1002    149874780 :     for (i = 0; i < d->ncolors; i++)
    1003              :     {
    1004    137162820 :         p = ss->outs[i];
    1005              :         assert(p != ss);        /* not self-referential */
    1006    137162820 :         if (p == NULL)
    1007    137162754 :             continue;           /* NOTE CONTINUE */
    1008              :         FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
    1009           66 :         if (p->ins.ss == ss && p->ins.co == i)
    1010           60 :             p->ins = ss->inchain[i];
    1011              :         else
    1012              :         {
    1013            6 :             struct arcp lastap = {NULL, 0};
    1014              : 
    1015              :             assert(p->ins.ss != NULL);
    1016           12 :             for (ap = p->ins; ap.ss != NULL &&
    1017           12 :                  !(ap.ss == ss && ap.co == i);
    1018            6 :                  ap = ap.ss->inchain[ap.co])
    1019            6 :                 lastap = ap;
    1020              :             assert(ap.ss != NULL);
    1021            6 :             lastap.ss->inchain[lastap.co] = ss->inchain[i];
    1022              :         }
    1023           66 :         ss->outs[i] = NULL;
    1024           66 :         ss->inchain[i].ss = NULL;
    1025              :     }
    1026              : 
    1027              :     /* if ss was a success state, may need to remember location */
    1028     12711960 :     if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
    1029           18 :         (d->lastpost == NULL || d->lastpost < ss->lastseen))
    1030           18 :         d->lastpost = ss->lastseen;
    1031              : 
    1032              :     /* likewise for a no-progress state */
    1033     12711960 :     if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
    1034            6 :         (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
    1035            6 :         d->lastnopr = ss->lastseen;
    1036              : 
    1037     12711960 :     return ss;
    1038              : }
    1039              : 
    1040              : /*
    1041              :  * pickss - pick the next stateset to be used
    1042              :  */
    1043              : static struct sset *
    1044     12711960 : pickss(struct vars *v,
    1045              :        struct dfa *d,
    1046              :        chr *cp,
    1047              :        chr *start)
    1048              : {
    1049              :     int         i;
    1050              :     struct sset *ss;
    1051              :     struct sset *end;
    1052              :     chr        *ancient;
    1053              : 
    1054              :     /* shortcut for cases where cache isn't full */
    1055     12711960 :     if (d->nssused < d->nssets)
    1056              :     {
    1057     12711894 :         i = d->nssused;
    1058     12711894 :         d->nssused++;
    1059     12711894 :         ss = &d->ssets[i];
    1060              :         FDEBUG(("new c%d\n", i));
    1061              :         /* set up innards */
    1062     12711894 :         ss->states = &d->statesarea[i * d->wordsper];
    1063     12711894 :         ss->flags = 0;
    1064     12711894 :         ss->ins.ss = NULL;
    1065     12711894 :         ss->ins.co = WHITE;      /* give it some value */
    1066     12711894 :         ss->outs = &d->outsarea[i * d->ncolors];
    1067     12711894 :         ss->inchain = &d->incarea[i * d->ncolors];
    1068    149873178 :         for (i = 0; i < d->ncolors; i++)
    1069              :         {
    1070    137161284 :             ss->outs[i] = NULL;
    1071    137161284 :             ss->inchain[i].ss = NULL;
    1072              :         }
    1073     12711894 :         return ss;
    1074              :     }
    1075              : 
    1076              :     /* look for oldest, or old enough anyway */
    1077           66 :     if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
    1078           66 :         ancient = cp - d->nssets * 2 / 3;
    1079              :     else
    1080            0 :         ancient = start;
    1081           72 :     for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
    1082           66 :         if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
    1083           66 :             !(ss->flags & LOCKED))
    1084              :         {
    1085           60 :             d->search = ss + 1;
    1086              :             FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
    1087           60 :             return ss;
    1088              :         }
    1089           12 :     for (ss = d->ssets, end = d->search; ss < end; ss++)
    1090           12 :         if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
    1091           12 :             !(ss->flags & LOCKED))
    1092              :         {
    1093            6 :             d->search = ss + 1;
    1094              :             FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
    1095            6 :             return ss;
    1096              :         }
    1097              : 
    1098              :     /* nobody's old enough?!? -- something's really wrong */
    1099              :     FDEBUG(("cannot find victim to replace!\n"));
    1100            0 :     ERR(REG_ASSERT);
    1101            0 :     return NULL;
    1102              : }
        

Generated by: LCOV version 2.0-1