LCOV - code coverage report
Current view: top level - src/backend/regex - regexec.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 431 506 85.2 %
Date: 2025-01-18 03:14:54 Functions: 16 16 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * re_*exec and friends - match REs
       3             :  *
       4             :  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
       5             :  *
       6             :  * Development of this software was funded, in part, by Cray Research Inc.,
       7             :  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
       8             :  * Corporation, none of whom are responsible for the results.  The author
       9             :  * thanks all of them.
      10             :  *
      11             :  * Redistribution and use in source and binary forms -- with or without
      12             :  * modification -- are permitted for any purpose, provided that
      13             :  * redistributions in source form retain this entire copyright notice and
      14             :  * indicate the origin and nature of any modifications.
      15             :  *
      16             :  * I'd appreciate being given credit for this package in the documentation
      17             :  * of software which uses it, but that is not a requirement.
      18             :  *
      19             :  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
      20             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      21             :  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
      22             :  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      23             :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      24             :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
      25             :  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
      26             :  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
      27             :  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
      28             :  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      29             :  *
      30             :  * src/backend/regex/regexec.c
      31             :  *
      32             :  */
      33             : 
      34             : #include "regex/regguts.h"
      35             : 
      36             : 
      37             : 
      38             : /* lazy-DFA representation */
      39             : struct arcp
      40             : {                               /* "pointer" to an outarc */
      41             :     struct sset *ss;
      42             :     color       co;
      43             : };
      44             : 
      45             : struct sset
      46             : {                               /* state set */
      47             :     unsigned   *states;         /* pointer to bitvector */
      48             :     unsigned    hash;           /* hash of bitvector */
      49             : #define  HASH(bv, nw)    (((nw) == 1) ? *(bv) : hash(bv, nw))
      50             : #define  HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
      51             :         memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
      52             :     int         flags;
      53             : #define  STARTER     01         /* the initial state set */
      54             : #define  POSTSTATE   02         /* includes the goal state */
      55             : #define  LOCKED      04         /* locked in cache */
      56             : #define  NOPROGRESS  010        /* zero-progress state set */
      57             :     struct arcp ins;            /* chain of inarcs pointing here */
      58             :     chr        *lastseen;       /* last entered on arrival here */
      59             :     struct sset **outs;         /* outarc vector indexed by color */
      60             :     struct arcp *inchain;       /* chain-pointer vector for outarcs */
      61             : };
      62             : 
      63             : struct dfa
      64             : {
      65             :     int         nssets;         /* size of cache */
      66             :     int         nssused;        /* how many entries occupied yet */
      67             :     int         nstates;        /* number of states */
      68             :     int         ncolors;        /* length of outarc and inchain vectors */
      69             :     int         wordsper;       /* length of state-set bitvectors */
      70             :     struct sset *ssets;         /* state-set cache */
      71             :     unsigned   *statesarea;     /* bitvector storage */
      72             :     unsigned   *work;           /* pointer to work area within statesarea */
      73             :     struct sset **outsarea;     /* outarc-vector storage */
      74             :     struct arcp *incarea;       /* inchain storage */
      75             :     struct cnfa *cnfa;
      76             :     struct colormap *cm;
      77             :     chr        *lastpost;       /* location of last cache-flushed success */
      78             :     chr        *lastnopr;       /* location of last cache-flushed NOPROGRESS */
      79             :     struct sset *search;        /* replacement-search-pointer memory */
      80             :     int         backno;         /* if DFA for a backref, subno it refers to */
      81             :     short       backmin;        /* min repetitions for backref */
      82             :     short       backmax;        /* max repetitions for backref */
      83             :     bool        ismalloced;     /* should this struct dfa be freed? */
      84             :     bool        arraysmalloced; /* should its subsidiary arrays be freed? */
      85             : };
      86             : 
      87             : #define WORK    1               /* number of work bitvectors needed */
      88             : 
      89             : /* setup for non-malloc allocation for small cases */
      90             : #define FEWSTATES   20          /* must be less than UBITS */
      91             : #define FEWCOLORS   15
      92             : struct smalldfa
      93             : {
      94             :     struct dfa  dfa;            /* must be first */
      95             :     struct sset ssets[FEWSTATES * 2];
      96             :     unsigned    statesarea[FEWSTATES * 2 + WORK];
      97             :     struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
      98             :     struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
      99             : };
     100             : 
     101             : #define DOMALLOC    ((struct smalldfa *)NULL)   /* force malloc */
     102             : 
     103             : 
     104             : 
     105             : /* internal variables, bundled for easy passing around */
     106             : struct vars
     107             : {
     108             :     regex_t    *re;
     109             :     struct guts *g;
     110             :     int         eflags;         /* copies of arguments */
     111             :     size_t      nmatch;
     112             :     regmatch_t *pmatch;
     113             :     rm_detail_t *details;
     114             :     chr        *start;          /* start of string */
     115             :     chr        *search_start;   /* search start of string */
     116             :     chr        *stop;           /* just past end of string */
     117             :     int         err;            /* error code if any (0 none) */
     118             :     struct dfa **subdfas;       /* per-tree-subre DFAs */
     119             :     struct dfa **ladfas;        /* per-lacon-subre DFAs */
     120             :     struct sset **lblastcss;    /* per-lacon-subre lookbehind restart data */
     121             :     chr       **lblastcp;       /* per-lacon-subre lookbehind restart data */
     122             :     struct smalldfa dfa1;
     123             :     struct smalldfa dfa2;
     124             : };
     125             : 
     126             : #define VISERR(vv)  ((vv)->err != 0) /* have we seen an error yet? */
     127             : #define ISERR() VISERR(v)
     128             : #define VERR(vv,e)  ((vv)->err = ((vv)->err ? (vv)->err : (e)))
     129             : #define ERR(e)  VERR(v, e)      /* record an error */
     130             : #define NOERR() {if (ISERR()) return v->err;}    /* if error seen, return it */
     131             : #define OFF(p)  ((p) - v->start)
     132             : #define LOFF(p) ((long)OFF(p))
     133             : 
     134             : 
     135             : 
     136             : /*
     137             :  * forward declarations
     138             :  */
     139             : /* === regexec.c === */
     140             : static struct dfa *getsubdfa(struct vars *v, struct subre *t);
     141             : static struct dfa *getladfa(struct vars *v, int n);
     142             : static int  find(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
     143             : static int  cfind(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
     144             : static int  cfindloop(struct vars *v, struct cnfa *cnfa, struct colormap *cm,
     145             :                       struct dfa *d, struct dfa *s, chr **coldp);
     146             : static void zapallsubs(regmatch_t *p, size_t n);
     147             : static void zaptreesubs(struct vars *v, struct subre *t);
     148             : static void subset(struct vars *v, struct subre *sub, chr *begin, chr *end);
     149             : static int  cdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     150             : static int  ccondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     151             : static int  crevcondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     152             : static int  cbrdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     153             : static int  caltdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     154             : static int  citerdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     155             : static int  creviterdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     156             : 
     157             : /* === rege_dfa.c === */
     158             : static chr *longest(struct vars *v, struct dfa *d,
     159             :                     chr *start, chr *stop, int *hitstopp);
     160             : static chr *shortest(struct vars *v, struct dfa *d, chr *start, chr *min,
     161             :                      chr *max, chr **coldp, int *hitstopp);
     162             : static int  matchuntil(struct vars *v, struct dfa *d, chr *probe,
     163             :                        struct sset **lastcss, chr **lastcp);
     164             : static chr *dfa_backref(struct vars *v, struct dfa *d, chr *start,
     165             :                         chr *min, chr *max, bool shortest);
     166             : static chr *lastcold(struct vars *v, struct dfa *d);
     167             : static struct dfa *newdfa(struct vars *v, struct cnfa *cnfa,
     168             :                           struct colormap *cm, struct smalldfa *sml);
     169             : static void freedfa(struct dfa *d);
     170             : static unsigned hash(unsigned *uv, int n);
     171             : static struct sset *initialize(struct vars *v, struct dfa *d, chr *start);
     172             : static struct sset *miss(struct vars *v, struct dfa *d, struct sset *css,
     173             :                          color co, chr *cp, chr *start);
     174             : static int  lacon(struct vars *v, struct cnfa *pcnfa, chr *cp, color co);
     175             : static struct sset *getvacant(struct vars *v, struct dfa *d, chr *cp,
     176             :                               chr *start);
     177             : static struct sset *pickss(struct vars *v, struct dfa *d, chr *cp,
     178             :                            chr *start);
     179             : 
     180             : 
     181             : /*
     182             :  * pg_regexec - match regular expression
     183             :  */
     184             : int
     185     7424762 : pg_regexec(regex_t *re,
     186             :            const chr *string,
     187             :            size_t len,
     188             :            size_t search_start,
     189             :            rm_detail_t *details,
     190             :            size_t nmatch,
     191             :            regmatch_t pmatch[],
     192             :            int flags)
     193             : {
     194             :     struct vars var;
     195     7424762 :     struct vars *v = &var;
     196             :     int         st;
     197             :     size_t      n;
     198             :     size_t      i;
     199             :     int         backref;
     200             : 
     201             : #define  LOCALMAT    20
     202             :     regmatch_t  mat[LOCALMAT];
     203             : 
     204             : #define  LOCALDFAS   40
     205             :     struct dfa *subdfas[LOCALDFAS];
     206             : 
     207             :     /* sanity checks */
     208     7424762 :     if (re == NULL || string == NULL || re->re_magic != REMAGIC)
     209           0 :         return REG_INVARG;
     210     7424762 :     if (re->re_csize != sizeof(chr))
     211           0 :         return REG_MIXED;
     212     7424762 :     if (search_start > len)
     213           6 :         return REG_NOMATCH;
     214             : 
     215             :     /* Initialize locale-dependent support */
     216     7424756 :     pg_set_regex_collation(re->re_collation);
     217             : 
     218             :     /* setup */
     219     7424756 :     v->re = re;
     220     7424756 :     v->g = (struct guts *) re->re_guts;
     221     7424756 :     if ((v->g->cflags & REG_EXPECT) && details == NULL)
     222           0 :         return REG_INVARG;
     223     7424756 :     if (v->g->info & REG_UIMPOSSIBLE)
     224        1432 :         return REG_NOMATCH;
     225     7423324 :     backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
     226     7423324 :     v->eflags = flags;
     227     7423324 :     if (backref && nmatch <= v->g->nsub)
     228             :     {
     229             :         /* need larger work area */
     230         182 :         v->nmatch = v->g->nsub + 1;
     231         182 :         if (v->nmatch <= LOCALMAT)
     232         180 :             v->pmatch = mat;
     233             :         else
     234           2 :             v->pmatch = (regmatch_t *) MALLOC(v->nmatch * sizeof(regmatch_t));
     235         182 :         if (v->pmatch == NULL)
     236           0 :             return REG_ESPACE;
     237         182 :         zapallsubs(v->pmatch, v->nmatch);
     238             :     }
     239             :     else
     240             :     {
     241             :         /* we can store results directly in caller's array */
     242     7423142 :         v->pmatch = pmatch;
     243             :         /* ensure any extra entries in caller's array are filled with -1 */
     244     7423142 :         if (nmatch > 0)
     245     1116684 :             zapallsubs(pmatch, nmatch);
     246             :         /* then forget about extra entries, to avoid useless work in find() */
     247     7423142 :         if (nmatch > v->g->nsub + 1)
     248        1002 :             nmatch = v->g->nsub + 1;
     249     7423142 :         v->nmatch = nmatch;
     250             :     }
     251     7423324 :     v->details = details;
     252     7423324 :     v->start = (chr *) string;
     253     7423324 :     v->search_start = (chr *) string + search_start;
     254     7423324 :     v->stop = (chr *) string + len;
     255     7423324 :     v->err = 0;
     256     7423324 :     v->subdfas = NULL;
     257     7423324 :     v->ladfas = NULL;
     258     7423324 :     v->lblastcss = NULL;
     259     7423324 :     v->lblastcp = NULL;
     260             :     /* below this point, "goto cleanup" will behave sanely */
     261             : 
     262             :     assert(v->g->ntree >= 0);
     263     7423324 :     n = (size_t) v->g->ntree;
     264     7423324 :     if (n <= LOCALDFAS)
     265     7423320 :         v->subdfas = subdfas;
     266             :     else
     267             :     {
     268           4 :         v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
     269           4 :         if (v->subdfas == NULL)
     270             :         {
     271           0 :             st = REG_ESPACE;
     272           0 :             goto cleanup;
     273             :         }
     274             :     }
     275    22302168 :     for (i = 0; i < n; i++)
     276    14878844 :         v->subdfas[i] = NULL;
     277             : 
     278             :     assert(v->g->nlacons >= 0);
     279     7423324 :     n = (size_t) v->g->nlacons;
     280     7423324 :     if (n > 0)
     281             :     {
     282        1260 :         v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
     283        1260 :         if (v->ladfas == NULL)
     284             :         {
     285           0 :             st = REG_ESPACE;
     286           0 :             goto cleanup;
     287             :         }
     288        3844 :         for (i = 0; i < n; i++)
     289        2584 :             v->ladfas[i] = NULL;
     290        1260 :         v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
     291        1260 :         v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
     292        1260 :         if (v->lblastcss == NULL || v->lblastcp == NULL)
     293             :         {
     294           0 :             st = REG_ESPACE;
     295           0 :             goto cleanup;
     296             :         }
     297        3844 :         for (i = 0; i < n; i++)
     298             :         {
     299        2584 :             v->lblastcss[i] = NULL;
     300        2584 :             v->lblastcp[i] = NULL;
     301             :         }
     302             :     }
     303             : 
     304             :     /* do it */
     305             :     assert(v->g->tree != NULL);
     306     7423324 :     if (backref)
     307         290 :         st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
     308             :     else
     309     7423034 :         st = find(v, &v->g->tree->cnfa, &v->g->cmap);
     310             : 
     311             :     /* on success, ensure caller's match vector is filled correctly */
     312     7423324 :     if (st == REG_OKAY && nmatch > 0)
     313             :     {
     314      904390 :         if (v->pmatch != pmatch)
     315             :         {
     316             :             /* copy portion of match vector over from (larger) work area */
     317             :             assert(nmatch <= v->nmatch);
     318          42 :             memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
     319             :         }
     320      904390 :         if (v->g->cflags & REG_NOSUB)
     321             :         {
     322             :             /* don't expose possibly-partial sub-match results to caller */
     323      898756 :             zapallsubs(pmatch, nmatch);
     324             :         }
     325             :     }
     326             : 
     327             :     /* clean up */
     328     6524568 : cleanup:
     329     7423324 :     if (v->pmatch != pmatch && v->pmatch != mat)
     330           2 :         FREE(v->pmatch);
     331     7423324 :     if (v->subdfas != NULL)
     332             :     {
     333     7423324 :         n = (size_t) v->g->ntree;
     334    22302168 :         for (i = 0; i < n; i++)
     335             :         {
     336    14878844 :             if (v->subdfas[i] != NULL)
     337       24752 :                 freedfa(v->subdfas[i]);
     338             :         }
     339     7423324 :         if (v->subdfas != subdfas)
     340           4 :             FREE(v->subdfas);
     341             :     }
     342     7423324 :     if (v->ladfas != NULL)
     343             :     {
     344        1260 :         n = (size_t) v->g->nlacons;
     345        3844 :         for (i = 0; i < n; i++)
     346             :         {
     347        2584 :             if (v->ladfas[i] != NULL)
     348         106 :                 freedfa(v->ladfas[i]);
     349             :         }
     350        1260 :         FREE(v->ladfas);
     351             :     }
     352     7423324 :     if (v->lblastcss != NULL)
     353        1260 :         FREE(v->lblastcss);
     354     7423324 :     if (v->lblastcp != NULL)
     355        1260 :         FREE(v->lblastcp);
     356             : 
     357             : #ifdef REG_DEBUG
     358             :     if (v->eflags & (REG_FTRACE | REG_MTRACE))
     359             :         fflush(stdout);
     360             : #endif
     361             : 
     362     7423324 :     return st;
     363             : }
     364             : 
     365             : /*
     366             :  * getsubdfa - create or re-fetch the DFA for a tree subre node
     367             :  *
     368             :  * We only need to create the DFA once per overall regex execution.
     369             :  * The DFA will be freed by the cleanup step in pg_regexec().
     370             :  */
     371             : static struct dfa *
     372       26124 : getsubdfa(struct vars *v,
     373             :           struct subre *t)
     374             : {
     375       26124 :     struct dfa *d = v->subdfas[t->id];
     376             : 
     377       26124 :     if (d == NULL)
     378             :     {
     379       24752 :         d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
     380       24752 :         if (d == NULL)
     381           0 :             return NULL;
     382             :         /* set up additional info if this is a backref node */
     383       24752 :         if (t->op == 'b')
     384             :         {
     385         280 :             d->backno = t->backno;
     386         280 :             d->backmin = t->min;
     387         280 :             d->backmax = t->max;
     388             :         }
     389       24752 :         v->subdfas[t->id] = d;
     390             :     }
     391       26124 :     return d;
     392             : }
     393             : 
     394             : /*
     395             :  * getladfa - create or re-fetch the DFA for a LACON subre node
     396             :  *
     397             :  * Same as above, but for LACONs.
     398             :  */
     399             : static struct dfa *
     400         240 : getladfa(struct vars *v,
     401             :          int n)
     402             : {
     403             :     assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
     404             : 
     405         240 :     if (v->ladfas[n] == NULL)
     406             :     {
     407         106 :         struct subre *sub = &v->g->lacons[n];
     408             : 
     409         106 :         v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
     410             :         /* a LACON can't contain a backref, so nothing else to do */
     411             :     }
     412         240 :     return v->ladfas[n];
     413             : }
     414             : 
     415             : /*
     416             :  * find - find a match for the main NFA (no-complications case)
     417             :  */
     418             : static int
     419     7423034 : find(struct vars *v,
     420             :      struct cnfa *cnfa,
     421             :      struct colormap *cm)
     422             : {
     423             :     struct dfa *s;
     424             :     struct dfa *d;
     425             :     chr        *begin;
     426     7423034 :     chr        *end = NULL;
     427             :     chr        *cold;
     428             :     chr        *open;           /* open and close of range of possible starts */
     429             :     chr        *close;
     430             :     int         hitend;
     431     7423034 :     int         shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
     432             : 
     433             :     /* first, a shot with the search RE */
     434     7423034 :     s = newdfa(v, &v->g->search, cm, &v->dfa1);
     435     7423034 :     if (s == NULL)
     436           0 :         return v->err;
     437             :     MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
     438     7423034 :     cold = NULL;
     439     7423034 :     close = shortest(v, s, v->search_start, v->search_start, v->stop,
     440             :                      &cold, (int *) NULL);
     441     7423034 :     freedfa(s);
     442     7423034 :     NOERR();
     443     7423034 :     if (v->g->cflags & REG_EXPECT)
     444             :     {
     445             :         assert(v->details != NULL);
     446          54 :         if (cold != NULL)
     447          54 :             v->details->rm_extend.rm_so = OFF(cold);
     448             :         else
     449           0 :             v->details->rm_extend.rm_so = OFF(v->stop);
     450          54 :         v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
     451             :     }
     452     7423034 :     if (close == NULL)          /* not found */
     453     6275462 :         return REG_NOMATCH;
     454     1147572 :     if (v->nmatch == 0)          /* found, don't need exact location */
     455      243298 :         return REG_OKAY;
     456             : 
     457             :     /* find starting point and match */
     458             :     assert(cold != NULL);
     459      904274 :     open = cold;
     460      904274 :     cold = NULL;
     461             :     MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
     462      904274 :     d = newdfa(v, cnfa, cm, &v->dfa1);
     463      904274 :     if (d == NULL)
     464           0 :         return v->err;
     465      904332 :     for (begin = open; begin <= close; begin++)
     466             :     {
     467             :         MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
     468      904332 :         if (shorter)
     469         124 :             end = shortest(v, d, begin, begin, v->stop,
     470             :                            (chr **) NULL, &hitend);
     471             :         else
     472      904208 :             end = longest(v, d, begin, v->stop, &hitend);
     473      904332 :         if (ISERR())
     474             :         {
     475           0 :             freedfa(d);
     476           0 :             return v->err;
     477             :         }
     478      904332 :         if (hitend && cold == NULL)
     479        6444 :             cold = begin;
     480      904332 :         if (end != NULL)
     481      904274 :             break;              /* NOTE BREAK OUT */
     482             :     }
     483             :     assert(end != NULL);        /* search RE succeeded so loop should */
     484      904274 :     freedfa(d);
     485             : 
     486             :     /* and pin down details */
     487             :     assert(v->nmatch > 0);
     488      904274 :     v->pmatch[0].rm_so = OFF(begin);
     489      904274 :     v->pmatch[0].rm_eo = OFF(end);
     490      904274 :     if (v->g->cflags & REG_EXPECT)
     491             :     {
     492          20 :         if (cold != NULL)
     493          10 :             v->details->rm_extend.rm_so = OFF(cold);
     494             :         else
     495          10 :             v->details->rm_extend.rm_so = OFF(v->stop);
     496          20 :         v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
     497             :     }
     498      904274 :     if (v->nmatch == 1)          /* no need for submatches */
     499      899870 :         return REG_OKAY;
     500             : 
     501             :     /* find submatches */
     502        4404 :     return cdissect(v, v->g->tree, begin, end);
     503             : }
     504             : 
     505             : /*
     506             :  * cfind - find a match for the main NFA (with complications)
     507             :  */
     508             : static int
     509         290 : cfind(struct vars *v,
     510             :       struct cnfa *cnfa,
     511             :       struct colormap *cm)
     512             : {
     513             :     struct dfa *s;
     514             :     struct dfa *d;
     515             :     chr        *cold;
     516             :     int         ret;
     517             : 
     518         290 :     s = newdfa(v, &v->g->search, cm, &v->dfa1);
     519         290 :     if (s == NULL)
     520           0 :         return v->err;
     521         290 :     d = newdfa(v, cnfa, cm, &v->dfa2);
     522         290 :     if (d == NULL)
     523             :     {
     524           0 :         freedfa(s);
     525           0 :         return v->err;
     526             :     }
     527             : 
     528         290 :     ret = cfindloop(v, cnfa, cm, d, s, &cold);
     529             : 
     530         290 :     freedfa(d);
     531         290 :     freedfa(s);
     532         290 :     NOERR();
     533         290 :     if (v->g->cflags & REG_EXPECT)
     534             :     {
     535             :         assert(v->details != NULL);
     536           0 :         if (cold != NULL)
     537           0 :             v->details->rm_extend.rm_so = OFF(cold);
     538             :         else
     539           0 :             v->details->rm_extend.rm_so = OFF(v->stop);
     540           0 :         v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
     541             :     }
     542         290 :     return ret;
     543             : }
     544             : 
     545             : /*
     546             :  * cfindloop - the heart of cfind
     547             :  */
     548             : static int
     549         290 : cfindloop(struct vars *v,
     550             :           struct cnfa *cnfa,
     551             :           struct colormap *cm,
     552             :           struct dfa *d,
     553             :           struct dfa *s,
     554             :           chr **coldp)          /* where to put coldstart pointer */
     555             : {
     556             :     chr        *begin;
     557             :     chr        *end;
     558             :     chr        *cold;
     559             :     chr        *open;           /* open and close of range of possible starts */
     560             :     chr        *close;
     561             :     chr        *estart;
     562             :     chr        *estop;
     563             :     int         er;
     564         290 :     int         shorter = v->g->tree->flags & SHORTER;
     565             :     int         hitend;
     566             : 
     567             :     assert(d != NULL && s != NULL);
     568         290 :     cold = NULL;
     569         290 :     close = v->search_start;
     570             :     do
     571             :     {
     572             :         /* Search with the search RE for match range at/beyond "close" */
     573             :         MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
     574         322 :         close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
     575         322 :         if (ISERR())
     576             :         {
     577           0 :             *coldp = cold;
     578           0 :             return v->err;
     579             :         }
     580         322 :         if (close == NULL)
     581          30 :             break;              /* no more possible match anywhere */
     582             :         assert(cold != NULL);
     583         292 :         open = cold;
     584         292 :         cold = NULL;
     585             :         /* Search for matches starting between "open" and "close" inclusive */
     586             :         MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
     587        1026 :         for (begin = open; begin <= close; begin++)
     588             :         {
     589             :             MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
     590         904 :             estart = begin;
     591         904 :             estop = v->stop;
     592             :             for (;;)
     593             :             {
     594             :                 /* Here we use the top node's detailed RE */
     595        1292 :                 if (shorter)
     596         194 :                     end = shortest(v, d, begin, estart,
     597             :                                    estop, (chr **) NULL, &hitend);
     598             :                 else
     599        1098 :                     end = longest(v, d, begin, estop,
     600             :                                   &hitend);
     601        1292 :                 if (ISERR())
     602             :                 {
     603           0 :                     *coldp = cold;
     604           0 :                     return v->err;
     605             :                 }
     606        1292 :                 if (hitend && cold == NULL)
     607         208 :                     cold = begin;
     608        1292 :                 if (end == NULL)
     609         698 :                     break;      /* no match with this begin point, try next */
     610             :                 MDEBUG(("tentative end %ld\n", LOFF(end)));
     611             :                 /* Dissect the potential match to see if it really matches */
     612         594 :                 er = cdissect(v, v->g->tree, begin, end);
     613         594 :                 if (er == REG_OKAY)
     614             :                 {
     615         170 :                     if (v->nmatch > 0)
     616             :                     {
     617         170 :                         v->pmatch[0].rm_so = OFF(begin);
     618         170 :                         v->pmatch[0].rm_eo = OFF(end);
     619             :                     }
     620         170 :                     *coldp = cold;
     621         170 :                     return REG_OKAY;
     622             :                 }
     623         424 :                 if (er != REG_NOMATCH)
     624             :                 {
     625           0 :                     ERR(er);
     626           0 :                     *coldp = cold;
     627           0 :                     return er;
     628             :                 }
     629             :                 /* Try next longer/shorter match with same begin point */
     630         424 :                 if (shorter)
     631             :                 {
     632         162 :                     if (end == estop)
     633          24 :                         break;  /* no more, so try next begin point */
     634         138 :                     estart = end + 1;
     635             :                 }
     636             :                 else
     637             :                 {
     638         262 :                     if (end == begin)
     639          12 :                         break;  /* no more, so try next begin point */
     640         250 :                     estop = end - 1;
     641             :                 }
     642             :             }                   /* end loop over endpoint positions */
     643             :         }                       /* end loop over beginning positions */
     644             : 
     645             :         /*
     646             :          * If we get here, there is no possible match starting at or before
     647             :          * "close", so consider matches beyond that.  We'll do a fresh search
     648             :          * with the search RE to find a new promising match range.
     649             :          */
     650         122 :         close++;
     651         122 :     } while (close < v->stop);
     652             : 
     653         120 :     *coldp = cold;
     654         120 :     return REG_NOMATCH;
     655             : }
     656             : 
     657             : /*
     658             :  * zapallsubs - initialize all subexpression matches to "no match"
     659             :  *
     660             :  * Note that p[0], the overall-match location, is not touched.
     661             :  */
     662             : static void
     663     2015622 : zapallsubs(regmatch_t *p,
     664             :            size_t n)
     665             : {
     666             :     size_t      i;
     667             : 
     668     2032030 :     for (i = n - 1; i > 0; i--)
     669             :     {
     670       16408 :         p[i].rm_so = -1;
     671       16408 :         p[i].rm_eo = -1;
     672             :     }
     673     2015622 : }
     674             : 
     675             : /*
     676             :  * zaptreesubs - initialize subexpressions within subtree to "no match"
     677             :  */
     678             : static void
     679        1040 : zaptreesubs(struct vars *v,
     680             :             struct subre *t)
     681             : {
     682        1040 :     int         n = t->capno;
     683             :     struct subre *t2;
     684             : 
     685        1040 :     if (n > 0)
     686             :     {
     687         486 :         if ((size_t) n < v->nmatch)
     688             :         {
     689         486 :             v->pmatch[n].rm_so = -1;
     690         486 :             v->pmatch[n].rm_eo = -1;
     691             :         }
     692             :     }
     693             : 
     694        1390 :     for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
     695         350 :         zaptreesubs(v, t2);
     696        1040 : }
     697             : 
     698             : /*
     699             :  * subset - set subexpression match data for a successful subre
     700             :  */
     701             : static void
     702        8110 : subset(struct vars *v,
     703             :        struct subre *sub,
     704             :        chr *begin,
     705             :        chr *end)
     706             : {
     707        8110 :     int         n = sub->capno;
     708             : 
     709             :     assert(n > 0);
     710        8110 :     if ((size_t) n >= v->nmatch)
     711          18 :         return;
     712             : 
     713             :     MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, n, LOFF(begin), LOFF(end)));
     714        8092 :     v->pmatch[n].rm_so = OFF(begin);
     715        8092 :     v->pmatch[n].rm_eo = OFF(end);
     716             : }
     717             : 
     718             : /*
     719             :  * cdissect - check backrefs and determine subexpression matches
     720             :  *
     721             :  * cdissect recursively processes a subre tree to check matching of backrefs
     722             :  * and/or identify submatch boundaries for capture nodes.  The proposed match
     723             :  * runs from "begin" to "end" (not including "end"), and we are basically
     724             :  * "dissecting" it to see where the submatches are.
     725             :  *
     726             :  * Before calling any level of cdissect, the caller must have run the node's
     727             :  * DFA and found that the proposed substring satisfies the DFA.  (We make
     728             :  * the caller do that because in concatenation and iteration nodes, it's
     729             :  * much faster to check all the substrings against the child DFAs before we
     730             :  * recurse.)
     731             :  *
     732             :  * A side-effect of a successful match is to save match locations for
     733             :  * capturing subexpressions in v->pmatch[].  This is a little bit tricky,
     734             :  * so we make the following rules:
     735             :  * 1. Before initial entry to cdissect, all match data must have been
     736             :  *    cleared (this is seen to by zapallsubs).
     737             :  * 2. Before any recursive entry to cdissect, the match data for that
     738             :  *    subexpression tree must be guaranteed clear (see zaptreesubs).
     739             :  * 3. When returning REG_OKAY, each level of cdissect will have saved
     740             :  *    any relevant match locations.
     741             :  * 4. When returning REG_NOMATCH, each level of cdissect will guarantee
     742             :  *    that its subexpression match locations are again clear.
     743             :  * 5. No guarantees are made for error cases (i.e., other result codes).
     744             :  * 6. When a level of cdissect abandons a successful sub-match, it will
     745             :  *    clear that subtree's match locations with zaptreesubs before trying
     746             :  *    any new DFA match or cdissect call for that subtree or any subtree
     747             :  *    to its right (that is, any subtree that could have a backref into the
     748             :  *    abandoned match).
     749             :  * This may seem overly complicated, but it's difficult to simplify it
     750             :  * because of the provision that match locations must be reset before
     751             :  * any fresh DFA match (a rule that is needed to make dfa_backref safe).
     752             :  * That means it won't work to just reset relevant match locations at the
     753             :  * start of each cdissect level.
     754             :  */
     755             : static int                      /* regexec return code */
     756       30480 : cdissect(struct vars *v,
     757             :          struct subre *t,
     758             :          chr *begin,            /* beginning of relevant substring */
     759             :          chr *end)              /* end of same */
     760             : {
     761             :     int         er;
     762             : 
     763             :     assert(t != NULL);
     764             :     MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));
     765             : 
     766             :     /* handy place to check for operation cancel */
     767       30480 :     INTERRUPT(v->re);
     768             :     /* ... and stack overrun */
     769       30480 :     if (STACK_TOO_DEEP(v->re))
     770           0 :         return REG_ETOOBIG;
     771             : 
     772       30480 :     switch (t->op)
     773             :     {
     774       16766 :         case '=':               /* terminal node */
     775             :             assert(t->child == NULL);
     776       16766 :             er = REG_OKAY;      /* no action, parent did the work */
     777       16766 :             break;
     778         418 :         case 'b':               /* back reference */
     779             :             assert(t->child == NULL);
     780         418 :             er = cbrdissect(v, t, begin, end);
     781         418 :             break;
     782       12828 :         case '.':               /* concatenation */
     783             :             assert(t->child != NULL);
     784       12828 :             if (t->child->flags & SHORTER)    /* reverse scan */
     785         328 :                 er = crevcondissect(v, t, begin, end);
     786             :             else
     787       12500 :                 er = ccondissect(v, t, begin, end);
     788       12828 :             break;
     789         160 :         case '|':               /* alternation */
     790             :             assert(t->child != NULL);
     791         160 :             er = caltdissect(v, t, begin, end);
     792         160 :             break;
     793         248 :         case '*':               /* iteration */
     794             :             assert(t->child != NULL);
     795         248 :             if (t->child->flags & SHORTER)    /* reverse scan */
     796          14 :                 er = creviterdissect(v, t, begin, end);
     797             :             else
     798         234 :                 er = citerdissect(v, t, begin, end);
     799         248 :             break;
     800          60 :         case '(':               /* no-op capture node */
     801             :             assert(t->child != NULL);
     802          60 :             er = cdissect(v, t->child, begin, end);
     803          60 :             break;
     804           0 :         default:
     805           0 :             er = REG_ASSERT;
     806           0 :             break;
     807             :     }
     808             : 
     809             :     /*
     810             :      * We should never have a match failure unless backrefs lurk below;
     811             :      * otherwise, either caller failed to check the DFA, or there's some
     812             :      * inconsistency between the DFA and the node's innards.
     813             :      */
     814             :     assert(er != REG_NOMATCH || (t->flags & BACKR));
     815             : 
     816             :     /*
     817             :      * If this node is marked as capturing, save successful match's location.
     818             :      */
     819       30480 :     if (t->capno > 0 && er == REG_OKAY)
     820        8110 :         subset(v, t, begin, end);
     821             : 
     822       30480 :     return er;
     823             : }
     824             : 
     825             : /*
     826             :  * ccondissect - dissect match for concatenation node
     827             :  */
     828             : static int                      /* regexec return code */
     829       12500 : ccondissect(struct vars *v,
     830             :             struct subre *t,
     831             :             chr *begin,         /* beginning of relevant substring */
     832             :             chr *end)           /* end of same */
     833             : {
     834       12500 :     struct subre *left = t->child;
     835       12500 :     struct subre *right = left->sibling;
     836             :     struct dfa *d;
     837             :     struct dfa *d2;
     838             :     chr        *mid;
     839             :     int         er;
     840             : 
     841             :     assert(t->op == '.');
     842             :     assert(left != NULL && left->cnfa.nstates > 0);
     843             :     assert(right != NULL && right->cnfa.nstates > 0);
     844             :     assert(right->sibling == NULL);
     845             :     assert(!(left->flags & SHORTER));
     846             : 
     847       12500 :     d = getsubdfa(v, left);
     848       12500 :     NOERR();
     849       12500 :     d2 = getsubdfa(v, right);
     850       12500 :     NOERR();
     851             :     MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
     852             : 
     853             :     /* pick a tentative midpoint */
     854       12500 :     mid = longest(v, d, begin, end, (int *) NULL);
     855       12500 :     NOERR();
     856       12500 :     if (mid == NULL)
     857          16 :         return REG_NOMATCH;
     858             :     MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
     859             : 
     860             :     /* iterate until satisfaction or failure */
     861             :     for (;;)
     862             :     {
     863             :         /* try this midpoint on for size */
     864       15144 :         if (longest(v, d2, mid, end, (int *) NULL) == end)
     865             :         {
     866       12402 :             er = cdissect(v, left, begin, mid);
     867       12402 :             if (er == REG_OKAY)
     868             :             {
     869       12302 :                 er = cdissect(v, right, mid, end);
     870       12302 :                 if (er == REG_OKAY)
     871             :                 {
     872             :                     /* satisfaction */
     873             :                     MDEBUG(("%d: successful\n", t->id));
     874       11806 :                     return REG_OKAY;
     875             :                 }
     876             :                 /* Reset left's matches (right should have done so itself) */
     877         496 :                 zaptreesubs(v, left);
     878             :             }
     879         596 :             if (er != REG_NOMATCH)
     880           0 :                 return er;
     881             :         }
     882        3338 :         NOERR();
     883             : 
     884             :         /* that midpoint didn't work, find a new one */
     885        3338 :         if (mid == begin)
     886             :         {
     887             :             /* all possibilities exhausted */
     888             :             MDEBUG(("%d: no midpoint\n", t->id));
     889         150 :             return REG_NOMATCH;
     890             :         }
     891        3188 :         mid = longest(v, d, begin, mid - 1, (int *) NULL);
     892        3188 :         NOERR();
     893        3188 :         if (mid == NULL)
     894             :         {
     895             :             /* failed to find a new one */
     896             :             MDEBUG(("%d: failed midpoint\n", t->id));
     897         528 :             return REG_NOMATCH;
     898             :         }
     899             :         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
     900             :     }
     901             : 
     902             :     /* can't get here */
     903             :     return REG_ASSERT;
     904             : }
     905             : 
     906             : /*
     907             :  * crevcondissect - dissect match for concatenation node, shortest-first
     908             :  */
     909             : static int                      /* regexec return code */
     910         328 : crevcondissect(struct vars *v,
     911             :                struct subre *t,
     912             :                chr *begin,      /* beginning of relevant substring */
     913             :                chr *end)        /* end of same */
     914             : {
     915         328 :     struct subre *left = t->child;
     916         328 :     struct subre *right = left->sibling;
     917             :     struct dfa *d;
     918             :     struct dfa *d2;
     919             :     chr        *mid;
     920             :     int         er;
     921             : 
     922             :     assert(t->op == '.');
     923             :     assert(left != NULL && left->cnfa.nstates > 0);
     924             :     assert(right != NULL && right->cnfa.nstates > 0);
     925             :     assert(right->sibling == NULL);
     926             :     assert(left->flags & SHORTER);
     927             : 
     928         328 :     d = getsubdfa(v, left);
     929         328 :     NOERR();
     930         328 :     d2 = getsubdfa(v, right);
     931         328 :     NOERR();
     932             :     MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
     933             : 
     934             :     /* pick a tentative midpoint */
     935         328 :     mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
     936         328 :     NOERR();
     937         328 :     if (mid == NULL)
     938           0 :         return REG_NOMATCH;
     939             :     MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
     940             : 
     941             :     /* iterate until satisfaction or failure */
     942             :     for (;;)
     943             :     {
     944             :         /* try this midpoint on for size */
     945        1210 :         if (longest(v, d2, mid, end, (int *) NULL) == end)
     946             :         {
     947         186 :             er = cdissect(v, left, begin, mid);
     948         186 :             if (er == REG_OKAY)
     949             :             {
     950         186 :                 er = cdissect(v, right, mid, end);
     951         186 :                 if (er == REG_OKAY)
     952             :                 {
     953             :                     /* satisfaction */
     954             :                     MDEBUG(("%d: successful\n", t->id));
     955         162 :                     return REG_OKAY;
     956             :                 }
     957             :                 /* Reset left's matches (right should have done so itself) */
     958          24 :                 zaptreesubs(v, left);
     959             :             }
     960          24 :             if (er != REG_NOMATCH)
     961           0 :                 return er;
     962             :         }
     963        1048 :         NOERR();
     964             : 
     965             :         /* that midpoint didn't work, find a new one */
     966        1048 :         if (mid == end)
     967             :         {
     968             :             /* all possibilities exhausted */
     969             :             MDEBUG(("%d: no midpoint\n", t->id));
     970         166 :             return REG_NOMATCH;
     971             :         }
     972         882 :         mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
     973         882 :         NOERR();
     974         882 :         if (mid == NULL)
     975             :         {
     976             :             /* failed to find a new one */
     977             :             MDEBUG(("%d: failed midpoint\n", t->id));
     978           0 :             return REG_NOMATCH;
     979             :         }
     980             :         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
     981             :     }
     982             : 
     983             :     /* can't get here */
     984             :     return REG_ASSERT;
     985             : }
     986             : 
     987             : /*
     988             :  * cbrdissect - dissect match for backref node
     989             :  *
     990             :  * The backref match might already have been verified by dfa_backref(),
     991             :  * but we don't know that for sure so must check it here.
     992             :  */
     993             : static int                      /* regexec return code */
     994         418 : cbrdissect(struct vars *v,
     995             :            struct subre *t,
     996             :            chr *begin,          /* beginning of relevant substring */
     997             :            chr *end)            /* end of same */
     998             : {
     999         418 :     int         n = t->backno;
    1000             :     size_t      numreps;
    1001             :     size_t      tlen;
    1002             :     size_t      brlen;
    1003             :     chr        *brstring;
    1004             :     chr        *p;
    1005         418 :     int         min = t->min;
    1006         418 :     int         max = t->max;
    1007             : 
    1008             :     assert(t != NULL);
    1009             :     assert(t->op == 'b');
    1010             :     assert(n >= 0);
    1011             :     assert((size_t) n < v->nmatch);
    1012             : 
    1013             :     MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, n, min, max,
    1014             :             LOFF(begin), LOFF(end)));
    1015             : 
    1016             :     /* get the backreferenced string */
    1017         418 :     if (v->pmatch[n].rm_so == -1)
    1018         110 :         return REG_NOMATCH;
    1019         308 :     brstring = v->start + v->pmatch[n].rm_so;
    1020         308 :     brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
    1021             : 
    1022             :     /* special cases for zero-length strings */
    1023         308 :     if (brlen == 0)
    1024             :     {
    1025             :         /*
    1026             :          * matches only if target is zero length, but any number of
    1027             :          * repetitions can be considered to be present
    1028             :          */
    1029          14 :         if (begin == end && min <= max)
    1030             :         {
    1031             :             MDEBUG(("%d: backref matched trivially\n", t->id));
    1032          14 :             return REG_OKAY;
    1033             :         }
    1034           0 :         return REG_NOMATCH;
    1035             :     }
    1036         294 :     if (begin == end)
    1037             :     {
    1038             :         /* matches only if zero repetitions are okay */
    1039          12 :         if (min == 0)
    1040             :         {
    1041             :             MDEBUG(("%d: backref matched trivially\n", t->id));
    1042          10 :             return REG_OKAY;
    1043             :         }
    1044           2 :         return REG_NOMATCH;
    1045             :     }
    1046             : 
    1047             :     /*
    1048             :      * check target length to see if it could possibly be an allowed number of
    1049             :      * repetitions of brstring
    1050             :      */
    1051             :     assert(end > begin);
    1052         282 :     tlen = end - begin;
    1053         282 :     if (tlen % brlen != 0)
    1054          10 :         return REG_NOMATCH;
    1055         272 :     numreps = tlen / brlen;
    1056         272 :     if (numreps < min || (numreps > max && max != DUPINF))
    1057           6 :         return REG_NOMATCH;
    1058             : 
    1059             :     /* okay, compare the actual string contents */
    1060         266 :     p = begin;
    1061         472 :     while (numreps-- > 0)
    1062             :     {
    1063         302 :         if ((*v->g->compare) (brstring, p, brlen) != 0)
    1064          96 :             return REG_NOMATCH;
    1065         206 :         p += brlen;
    1066             :     }
    1067             : 
    1068             :     MDEBUG(("%d: backref matched\n", t->id));
    1069         170 :     return REG_OKAY;
    1070             : }
    1071             : 
    1072             : /*
    1073             :  * caltdissect - dissect match for alternation node
    1074             :  */
    1075             : static int                      /* regexec return code */
    1076         160 : caltdissect(struct vars *v,
    1077             :             struct subre *t,
    1078             :             chr *begin,         /* beginning of relevant substring */
    1079             :             chr *end)           /* end of same */
    1080             : {
    1081             :     struct dfa *d;
    1082             :     int         er;
    1083             : 
    1084             :     assert(t->op == '|');
    1085             : 
    1086         160 :     t = t->child;
    1087             :     /* there should be at least 2 alternatives */
    1088             :     assert(t != NULL && t->sibling != NULL);
    1089             : 
    1090         274 :     while (t != NULL)
    1091             :     {
    1092             :         assert(t->cnfa.nstates > 0);
    1093             : 
    1094             :         MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
    1095             : 
    1096         222 :         d = getsubdfa(v, t);
    1097         222 :         NOERR();
    1098         222 :         if (longest(v, d, begin, end, (int *) NULL) == end)
    1099             :         {
    1100             :             MDEBUG(("%d: caltdissect matched\n", t->id));
    1101         176 :             er = cdissect(v, t, begin, end);
    1102         176 :             if (er != REG_NOMATCH)
    1103         108 :                 return er;
    1104             :         }
    1105         114 :         NOERR();
    1106             : 
    1107         114 :         t = t->sibling;
    1108             :     }
    1109             : 
    1110          52 :     return REG_NOMATCH;
    1111             : }
    1112             : 
    1113             : /*
    1114             :  * citerdissect - dissect match for iteration node
    1115             :  */
    1116             : static int                      /* regexec return code */
    1117         234 : citerdissect(struct vars *v,
    1118             :              struct subre *t,
    1119             :              chr *begin,        /* beginning of relevant substring */
    1120             :              chr *end)          /* end of same */
    1121             : {
    1122             :     struct dfa *d;
    1123             :     chr       **endpts;
    1124             :     chr        *limit;
    1125             :     int         min_matches;
    1126             :     size_t      max_matches;
    1127             :     int         nverified;
    1128             :     int         k;
    1129             :     int         i;
    1130             :     int         er;
    1131             : 
    1132             :     assert(t->op == '*');
    1133             :     assert(t->child != NULL && t->child->cnfa.nstates > 0);
    1134             :     assert(!(t->child->flags & SHORTER));
    1135             :     assert(begin <= end);
    1136             : 
    1137             :     MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
    1138             : 
    1139             :     /*
    1140             :      * For the moment, assume the minimum number of matches is 1.  If zero
    1141             :      * matches are allowed, and the target string is empty, we are allowed to
    1142             :      * match regardless of the contents of the iter node --- but we would
    1143             :      * prefer to match once, so that capturing parens get set.  (An example of
    1144             :      * the concern here is a pattern like "()*\1", which historically this
    1145             :      * code has allowed to succeed.)  Therefore, we deal with the zero-matches
    1146             :      * case at the bottom, after failing to find any other way to match.
    1147             :      */
    1148         234 :     min_matches = t->min;
    1149         234 :     if (min_matches <= 0)
    1150         162 :         min_matches = 1;
    1151             : 
    1152             :     /*
    1153             :      * We need workspace to track the endpoints of each sub-match.  Normally
    1154             :      * we consider only nonzero-length sub-matches, so there can be at most
    1155             :      * end-begin of them.  However, if min is larger than that, we will also
    1156             :      * consider zero-length sub-matches in order to find enough matches.
    1157             :      *
    1158             :      * For convenience, endpts[0] contains the "begin" pointer and we store
    1159             :      * sub-match endpoints in endpts[1..max_matches].
    1160             :      */
    1161         234 :     max_matches = end - begin;
    1162         234 :     if (max_matches > t->max && t->max != DUPINF)
    1163           0 :         max_matches = t->max;
    1164         234 :     if (max_matches < min_matches)
    1165         146 :         max_matches = min_matches;
    1166         234 :     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
    1167         234 :     if (endpts == NULL)
    1168           0 :         return REG_ESPACE;
    1169         234 :     endpts[0] = begin;
    1170             : 
    1171         234 :     d = getsubdfa(v, t->child);
    1172         234 :     if (ISERR())
    1173             :     {
    1174           0 :         FREE(endpts);
    1175           0 :         return v->err;
    1176             :     }
    1177             : 
    1178             :     /*
    1179             :      * Our strategy is to first find a set of sub-match endpoints that are
    1180             :      * valid according to the child node's DFA, and then recursively dissect
    1181             :      * each sub-match to confirm validity.  If any validity check fails,
    1182             :      * backtrack that sub-match and try again.  And, when we next try for a
    1183             :      * validity check, we need not recheck any successfully verified
    1184             :      * sub-matches that we didn't move the endpoints of.  nverified remembers
    1185             :      * how many sub-matches are currently known okay.
    1186             :      */
    1187             : 
    1188             :     /* initialize to consider first sub-match */
    1189         234 :     nverified = 0;
    1190         234 :     k = 1;
    1191         234 :     limit = end;
    1192             : 
    1193             :     /* iterate until satisfaction or failure */
    1194        1006 :     while (k > 0)
    1195             :     {
    1196             :         /* try to find an endpoint for the k'th sub-match */
    1197         814 :         endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
    1198         814 :         if (ISERR())
    1199             :         {
    1200           0 :             FREE(endpts);
    1201           0 :             return v->err;
    1202             :         }
    1203         814 :         if (endpts[k] == NULL)
    1204             :         {
    1205             :             /* no match possible, so see if we can shorten previous one */
    1206         424 :             k--;
    1207         424 :             goto backtrack;
    1208             :         }
    1209             :         MDEBUG(("%d: working endpoint %d: %ld\n",
    1210             :                 t->id, k, LOFF(endpts[k])));
    1211             : 
    1212             :         /* k'th sub-match can no longer be considered verified */
    1213         390 :         if (nverified >= k)
    1214          16 :             nverified = k - 1;
    1215             : 
    1216         390 :         if (endpts[k] != end)
    1217             :         {
    1218             :             /* haven't reached end yet, try another iteration if allowed */
    1219         268 :             if (k >= max_matches)
    1220             :             {
    1221             :                 /* must try to shorten some previous match */
    1222           0 :                 k--;
    1223           0 :                 goto backtrack;
    1224             :             }
    1225             : 
    1226             :             /* reject zero-length match unless necessary to achieve min */
    1227         268 :             if (endpts[k] == endpts[k - 1] &&
    1228           0 :                 (k >= min_matches || min_matches - k < end - endpts[k]))
    1229           0 :                 goto backtrack;
    1230             : 
    1231         268 :             k++;
    1232         268 :             limit = end;
    1233         268 :             continue;
    1234             :         }
    1235             : 
    1236             :         /*
    1237             :          * We've identified a way to divide the string into k sub-matches that
    1238             :          * works so far as the child DFA can tell.  If k is an allowed number
    1239             :          * of matches, start the slow part: recurse to verify each sub-match.
    1240             :          * We always have k <= max_matches, needn't check that.
    1241             :          */
    1242         122 :         if (k < min_matches)
    1243           0 :             goto backtrack;
    1244             : 
    1245             :         MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
    1246             : 
    1247         200 :         for (i = nverified + 1; i <= k; i++)
    1248             :         {
    1249             :             /* zap any match data from a non-last iteration */
    1250         158 :             zaptreesubs(v, t->child);
    1251         158 :             er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
    1252         158 :             if (er == REG_OKAY)
    1253             :             {
    1254          78 :                 nverified = i;
    1255          78 :                 continue;
    1256             :             }
    1257          80 :             if (er == REG_NOMATCH)
    1258          80 :                 break;
    1259             :             /* oops, something failed */
    1260           0 :             FREE(endpts);
    1261           0 :             return er;
    1262             :         }
    1263             : 
    1264         122 :         if (i > k)
    1265             :         {
    1266             :             /* satisfaction */
    1267             :             MDEBUG(("%d: successful\n", t->id));
    1268          42 :             FREE(endpts);
    1269          42 :             return REG_OKAY;
    1270             :         }
    1271             : 
    1272             :         /* i'th match failed to verify, so backtrack it */
    1273          80 :         k = i;
    1274             : 
    1275         504 : backtrack:
    1276             : 
    1277             :         /*
    1278             :          * Must consider shorter versions of the k'th sub-match.  However,
    1279             :          * we'll only ask for a zero-length match if necessary.
    1280             :          */
    1281         504 :         while (k > 0)
    1282             :         {
    1283         312 :             chr        *prev_end = endpts[k - 1];
    1284             : 
    1285         312 :             if (endpts[k] > prev_end)
    1286             :             {
    1287         312 :                 limit = endpts[k] - 1;
    1288         312 :                 if (limit > prev_end ||
    1289           0 :                     (k < min_matches && min_matches - k >= end - prev_end))
    1290             :                 {
    1291             :                     /* break out of backtrack loop, continue the outer one */
    1292             :                     break;
    1293             :                 }
    1294             :             }
    1295             :             /* can't shorten k'th sub-match any more, consider previous one */
    1296           0 :             k--;
    1297             :         }
    1298             :     }
    1299             : 
    1300             :     /* all possibilities exhausted */
    1301         192 :     FREE(endpts);
    1302             : 
    1303             :     /*
    1304             :      * Now consider the possibility that we can match to a zero-length string
    1305             :      * by using zero repetitions.
    1306             :      */
    1307         192 :     if (t->min == 0 && begin == end)
    1308             :     {
    1309             :         MDEBUG(("%d: allowing zero matches\n", t->id));
    1310         136 :         return REG_OKAY;
    1311             :     }
    1312             : 
    1313             :     MDEBUG(("%d: failed\n", t->id));
    1314          56 :     return REG_NOMATCH;
    1315             : }
    1316             : 
    1317             : /*
    1318             :  * creviterdissect - dissect match for iteration node, shortest-first
    1319             :  */
    1320             : static int                      /* regexec return code */
    1321          14 : creviterdissect(struct vars *v,
    1322             :                 struct subre *t,
    1323             :                 chr *begin,     /* beginning of relevant substring */
    1324             :                 chr *end)       /* end of same */
    1325             : {
    1326             :     struct dfa *d;
    1327             :     chr       **endpts;
    1328             :     chr        *limit;
    1329             :     int         min_matches;
    1330             :     size_t      max_matches;
    1331             :     int         nverified;
    1332             :     int         k;
    1333             :     int         i;
    1334             :     int         er;
    1335             : 
    1336             :     assert(t->op == '*');
    1337             :     assert(t->child != NULL && t->child->cnfa.nstates > 0);
    1338             :     assert(t->child->flags & SHORTER);
    1339             :     assert(begin <= end);
    1340             : 
    1341             :     MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
    1342             : 
    1343             :     /*
    1344             :      * If zero matches are allowed, and target string is empty, just declare
    1345             :      * victory.  OTOH, if target string isn't empty, zero matches can't work
    1346             :      * so we pretend the min is 1.
    1347             :      */
    1348          14 :     min_matches = t->min;
    1349          14 :     if (min_matches <= 0)
    1350             :     {
    1351          14 :         if (begin == end)
    1352             :         {
    1353             :             MDEBUG(("%d: allowing zero matches\n", t->id));
    1354           2 :             return REG_OKAY;
    1355             :         }
    1356          12 :         min_matches = 1;
    1357             :     }
    1358             : 
    1359             :     /*
    1360             :      * We need workspace to track the endpoints of each sub-match.  Normally
    1361             :      * we consider only nonzero-length sub-matches, so there can be at most
    1362             :      * end-begin of them.  However, if min is larger than that, we will also
    1363             :      * consider zero-length sub-matches in order to find enough matches.
    1364             :      *
    1365             :      * For convenience, endpts[0] contains the "begin" pointer and we store
    1366             :      * sub-match endpoints in endpts[1..max_matches].
    1367             :      */
    1368          12 :     max_matches = end - begin;
    1369          12 :     if (max_matches > t->max && t->max != DUPINF)
    1370          12 :         max_matches = t->max;
    1371          12 :     if (max_matches < min_matches)
    1372           0 :         max_matches = min_matches;
    1373          12 :     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
    1374          12 :     if (endpts == NULL)
    1375           0 :         return REG_ESPACE;
    1376          12 :     endpts[0] = begin;
    1377             : 
    1378          12 :     d = getsubdfa(v, t->child);
    1379          12 :     if (ISERR())
    1380             :     {
    1381           0 :         FREE(endpts);
    1382           0 :         return v->err;
    1383             :     }
    1384             : 
    1385             :     /*
    1386             :      * Our strategy is to first find a set of sub-match endpoints that are
    1387             :      * valid according to the child node's DFA, and then recursively dissect
    1388             :      * each sub-match to confirm validity.  If any validity check fails,
    1389             :      * backtrack that sub-match and try again.  And, when we next try for a
    1390             :      * validity check, we need not recheck any successfully verified
    1391             :      * sub-matches that we didn't move the endpoints of.  nverified remembers
    1392             :      * how many sub-matches are currently known okay.
    1393             :      */
    1394             : 
    1395             :     /* initialize to consider first sub-match */
    1396          12 :     nverified = 0;
    1397          12 :     k = 1;
    1398          12 :     limit = begin;
    1399             : 
    1400             :     /* iterate until satisfaction or failure */
    1401          16 :     while (k > 0)
    1402             :     {
    1403             :         /* disallow zero-length match unless necessary to achieve min */
    1404          12 :         if (limit == endpts[k - 1] &&
    1405          12 :             limit != end &&
    1406           0 :             (k >= min_matches || min_matches - k < end - limit))
    1407          12 :             limit++;
    1408             : 
    1409             :         /* if this is the last allowed sub-match, it must reach to the end */
    1410          12 :         if (k >= max_matches)
    1411          12 :             limit = end;
    1412             : 
    1413             :         /* try to find an endpoint for the k'th sub-match */
    1414          12 :         endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
    1415             :                              (chr **) NULL, (int *) NULL);
    1416          12 :         if (ISERR())
    1417             :         {
    1418           0 :             FREE(endpts);
    1419           0 :             return v->err;
    1420             :         }
    1421          12 :         if (endpts[k] == NULL)
    1422             :         {
    1423             :             /* no match possible, so see if we can lengthen previous one */
    1424           0 :             k--;
    1425           0 :             goto backtrack;
    1426             :         }
    1427             :         MDEBUG(("%d: working endpoint %d: %ld\n",
    1428             :                 t->id, k, LOFF(endpts[k])));
    1429             : 
    1430             :         /* k'th sub-match can no longer be considered verified */
    1431          12 :         if (nverified >= k)
    1432           0 :             nverified = k - 1;
    1433             : 
    1434          12 :         if (endpts[k] != end)
    1435             :         {
    1436             :             /* haven't reached end yet, try another iteration if allowed */
    1437           0 :             if (k >= max_matches)
    1438             :             {
    1439             :                 /* must try to lengthen some previous match */
    1440           0 :                 k--;
    1441           0 :                 goto backtrack;
    1442             :             }
    1443             : 
    1444           0 :             k++;
    1445           0 :             limit = endpts[k - 1];
    1446           0 :             continue;
    1447             :         }
    1448             : 
    1449             :         /*
    1450             :          * We've identified a way to divide the string into k sub-matches that
    1451             :          * works so far as the child DFA can tell.  If k is an allowed number
    1452             :          * of matches, start the slow part: recurse to verify each sub-match.
    1453             :          * We always have k <= max_matches, needn't check that.
    1454             :          */
    1455          12 :         if (k < min_matches)
    1456           0 :             goto backtrack;
    1457             : 
    1458             :         MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
    1459             : 
    1460          20 :         for (i = nverified + 1; i <= k; i++)
    1461             :         {
    1462             :             /* zap any match data from a non-last iteration */
    1463          12 :             zaptreesubs(v, t->child);
    1464          12 :             er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
    1465          12 :             if (er == REG_OKAY)
    1466             :             {
    1467           8 :                 nverified = i;
    1468           8 :                 continue;
    1469             :             }
    1470           4 :             if (er == REG_NOMATCH)
    1471           4 :                 break;
    1472             :             /* oops, something failed */
    1473           0 :             FREE(endpts);
    1474           0 :             return er;
    1475             :         }
    1476             : 
    1477          12 :         if (i > k)
    1478             :         {
    1479             :             /* satisfaction */
    1480             :             MDEBUG(("%d: successful\n", t->id));
    1481           8 :             FREE(endpts);
    1482           8 :             return REG_OKAY;
    1483             :         }
    1484             : 
    1485             :         /* i'th match failed to verify, so backtrack it */
    1486           4 :         k = i;
    1487             : 
    1488           4 : backtrack:
    1489             : 
    1490             :         /*
    1491             :          * Must consider longer versions of the k'th sub-match.
    1492             :          */
    1493           8 :         while (k > 0)
    1494             :         {
    1495           4 :             if (endpts[k] < end)
    1496             :             {
    1497           0 :                 limit = endpts[k] + 1;
    1498             :                 /* break out of backtrack loop, continue the outer one */
    1499           0 :                 break;
    1500             :             }
    1501             :             /* can't lengthen k'th sub-match any more, consider previous one */
    1502           4 :             k--;
    1503             :         }
    1504             :     }
    1505             : 
    1506             :     /* all possibilities exhausted */
    1507             :     MDEBUG(("%d: failed\n", t->id));
    1508           4 :     FREE(endpts);
    1509           4 :     return REG_NOMATCH;
    1510             : }
    1511             : 
    1512             : 
    1513             : 
    1514             : #include "rege_dfa.c"

Generated by: LCOV version 1.14