LCOV - code coverage report
Current view: top level - src/backend/regex - regc_nfa.c (source / functions) Hit Total Coverage
Test: PostgreSQL 15devel Lines: 1119 1237 90.5 %
Date: 2021-12-09 03:08:47 Functions: 60 61 98.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * NFA utilities.
       3             :  * This file is #included by regcomp.c.
       4             :  *
       5             :  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
       6             :  *
       7             :  * Development of this software was funded, in part, by Cray Research Inc.,
       8             :  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
       9             :  * Corporation, none of whom are responsible for the results.  The author
      10             :  * thanks all of them.
      11             :  *
      12             :  * Redistribution and use in source and binary forms -- with or without
      13             :  * modification -- are permitted for any purpose, provided that
      14             :  * redistributions in source form retain this entire copyright notice and
      15             :  * indicate the origin and nature of any modifications.
      16             :  *
      17             :  * I'd appreciate being given credit for this package in the documentation
      18             :  * of software which uses it, but that is not a requirement.
      19             :  *
      20             :  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
      21             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      22             :  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
      23             :  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      24             :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      25             :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
      26             :  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
      27             :  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
      28             :  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
      29             :  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      30             :  *
      31             :  * src/backend/regex/regc_nfa.c
      32             :  *
      33             :  *
      34             :  * One or two things that technically ought to be in here
      35             :  * are actually in color.c, thanks to some incestuous relationships in
      36             :  * the color chains.
      37             :  */
      38             : 
      39             : #define NISERR()    VISERR(nfa->v)
      40             : #define NERR(e)     VERR(nfa->v, (e))
      41             : 
      42             : 
      43             : /*
      44             :  * newnfa - set up an NFA
      45             :  */
      46             : static struct nfa *             /* the NFA, or NULL */
      47       13644 : newnfa(struct vars *v,
      48             :        struct colormap *cm,
      49             :        struct nfa *parent)      /* NULL if primary NFA */
      50             : {
      51             :     struct nfa *nfa;
      52             : 
      53       13644 :     nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
      54       13644 :     if (nfa == NULL)
      55             :     {
      56           0 :         ERR(REG_ESPACE);
      57           0 :         return NULL;
      58             :     }
      59             : 
      60             :     /* Make the NFA minimally valid, so freenfa() will behave sanely */
      61       13644 :     nfa->states = NULL;
      62       13644 :     nfa->slast = NULL;
      63       13644 :     nfa->freestates = NULL;
      64       13644 :     nfa->freearcs = NULL;
      65       13644 :     nfa->lastsb = NULL;
      66       13644 :     nfa->lastab = NULL;
      67       13644 :     nfa->lastsbused = 0;
      68       13644 :     nfa->lastabused = 0;
      69       13644 :     nfa->nstates = 0;
      70       13644 :     nfa->cm = cm;
      71       13644 :     nfa->v = v;
      72       13644 :     nfa->bos[0] = nfa->bos[1] = COLORLESS;
      73       13644 :     nfa->eos[0] = nfa->eos[1] = COLORLESS;
      74       13644 :     nfa->flags = 0;
      75       13644 :     nfa->minmatchall = nfa->maxmatchall = -1;
      76       13644 :     nfa->parent = parent;        /* Precedes newfstate so parent is valid. */
      77             : 
      78             :     /* Create required infrastructure */
      79       13644 :     nfa->post = newfstate(nfa, '@'); /* number 0 */
      80       13644 :     nfa->pre = newfstate(nfa, '>'); /* number 1 */
      81       13644 :     nfa->init = newstate(nfa);   /* may become invalid later */
      82       13644 :     nfa->final = newstate(nfa);
      83       13644 :     if (ISERR())
      84             :     {
      85           0 :         freenfa(nfa);
      86           0 :         return NULL;
      87             :     }
      88       13644 :     rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
      89       13644 :     newarc(nfa, '^', 1, nfa->pre, nfa->init);
      90       13644 :     newarc(nfa, '^', 0, nfa->pre, nfa->init);
      91       13644 :     rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
      92       13644 :     newarc(nfa, '$', 1, nfa->final, nfa->post);
      93       13644 :     newarc(nfa, '$', 0, nfa->final, nfa->post);
      94             : 
      95       13644 :     if (ISERR())
      96             :     {
      97           0 :         freenfa(nfa);
      98           0 :         return NULL;
      99             :     }
     100       13644 :     return nfa;
     101             : }
     102             : 
     103             : /*
     104             :  * freenfa - free an entire NFA
     105             :  */
     106             : static void
     107       13644 : freenfa(struct nfa *nfa)
     108             : {
     109             :     struct statebatch *sb;
     110             :     struct statebatch *sbnext;
     111             :     struct arcbatch *ab;
     112             :     struct arcbatch *abnext;
     113             : 
     114       28522 :     for (sb = nfa->lastsb; sb != NULL; sb = sbnext)
     115             :     {
     116       14878 :         sbnext = sb->next;
     117       14878 :         nfa->v->spaceused -= STATEBATCHSIZE(sb->nstates);
     118       14878 :         FREE(sb);
     119             :     }
     120       13644 :     nfa->lastsb = NULL;
     121       38674 :     for (ab = nfa->lastab; ab != NULL; ab = abnext)
     122             :     {
     123       25030 :         abnext = ab->next;
     124       25030 :         nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
     125       25030 :         FREE(ab);
     126             :     }
     127       13644 :     nfa->lastab = NULL;
     128             : 
     129       13644 :     nfa->nstates = -1;
     130       13644 :     FREE(nfa);
     131       13644 : }
     132             : 
     133             : /*
     134             :  * newstate - allocate an NFA state, with zero flag value
     135             :  */
     136             : static struct state *           /* NULL on error */
     137      367526 : newstate(struct nfa *nfa)
     138             : {
     139             :     struct state *s;
     140             : 
     141             :     /*
     142             :      * This is a handy place to check for operation cancel during regex
     143             :      * compilation, since no code path will go very long without making a new
     144             :      * state or arc.
     145             :      */
     146      367526 :     if (CANCEL_REQUESTED(nfa->v->re))
     147             :     {
     148           0 :         NERR(REG_CANCEL);
     149           0 :         return NULL;
     150             :     }
     151             : 
     152             :     /* first, recycle anything that's on the freelist */
     153      367526 :     if (nfa->freestates != NULL)
     154             :     {
     155       29514 :         s = nfa->freestates;
     156       29514 :         nfa->freestates = s->next;
     157             :     }
     158             :     /* otherwise, is there anything left in the last statebatch? */
     159      338012 :     else if (nfa->lastsb != NULL && nfa->lastsbused < nfa->lastsb->nstates)
     160             :     {
     161      323134 :         s = &nfa->lastsb->s[nfa->lastsbused++];
     162             :     }
     163             :     /* otherwise, need to allocate a new statebatch */
     164             :     else
     165             :     {
     166             :         struct statebatch *newSb;
     167             :         size_t      nstates;
     168             : 
     169       14878 :         if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
     170             :         {
     171           0 :             NERR(REG_ETOOBIG);
     172           0 :             return NULL;
     173             :         }
     174       14878 :         nstates = (nfa->lastsb != NULL) ? nfa->lastsb->nstates * 2 : FIRSTSBSIZE;
     175       14878 :         if (nstates > MAXSBSIZE)
     176          34 :             nstates = MAXSBSIZE;
     177       14878 :         newSb = (struct statebatch *) MALLOC(STATEBATCHSIZE(nstates));
     178       14878 :         if (newSb == NULL)
     179             :         {
     180           0 :             NERR(REG_ESPACE);
     181           0 :             return NULL;
     182             :         }
     183       14878 :         nfa->v->spaceused += STATEBATCHSIZE(nstates);
     184       14878 :         newSb->nstates = nstates;
     185       14878 :         newSb->next = nfa->lastsb;
     186       14878 :         nfa->lastsb = newSb;
     187       14878 :         nfa->lastsbused = 1;
     188       14878 :         s = &newSb->s[0];
     189             :     }
     190             : 
     191             :     assert(nfa->nstates >= 0);
     192      367526 :     s->no = nfa->nstates++;
     193      367526 :     s->flag = 0;
     194      367526 :     if (nfa->states == NULL)
     195       13644 :         nfa->states = s;
     196      367526 :     s->nins = 0;
     197      367526 :     s->ins = NULL;
     198      367526 :     s->nouts = 0;
     199      367526 :     s->outs = NULL;
     200      367526 :     s->tmp = NULL;
     201      367526 :     s->next = NULL;
     202      367526 :     if (nfa->slast != NULL)
     203             :     {
     204             :         assert(nfa->slast->next == NULL);
     205      353882 :         nfa->slast->next = s;
     206             :     }
     207      367526 :     s->prev = nfa->slast;
     208      367526 :     nfa->slast = s;
     209      367526 :     return s;
     210             : }
     211             : 
     212             : /*
     213             :  * newfstate - allocate an NFA state with a specified flag value
     214             :  */
     215             : static struct state *           /* NULL on error */
     216       27288 : newfstate(struct nfa *nfa, int flag)
     217             : {
     218             :     struct state *s;
     219             : 
     220       27288 :     s = newstate(nfa);
     221       27288 :     if (s != NULL)
     222       27288 :         s->flag = (char) flag;
     223       27288 :     return s;
     224             : }
     225             : 
     226             : /*
     227             :  * dropstate - delete a state's inarcs and outarcs and free it
     228             :  */
     229             : static void
     230      170502 : dropstate(struct nfa *nfa,
     231             :           struct state *s)
     232             : {
     233             :     struct arc *a;
     234             : 
     235      203774 :     while ((a = s->ins) != NULL)
     236       33272 :         freearc(nfa, a);
     237      278962 :     while ((a = s->outs) != NULL)
     238      108460 :         freearc(nfa, a);
     239      170502 :     freestate(nfa, s);
     240      170502 : }
     241             : 
     242             : /*
     243             :  * freestate - free a state, which has no in-arcs or out-arcs
     244             :  */
     245             : static void
     246      176482 : freestate(struct nfa *nfa,
     247             :           struct state *s)
     248             : {
     249             :     assert(s != NULL);
     250             :     assert(s->nins == 0 && s->nouts == 0);
     251             : 
     252      176482 :     s->no = FREESTATE;
     253      176482 :     s->flag = 0;
     254      176482 :     if (s->next != NULL)
     255      165614 :         s->next->prev = s->prev;
     256             :     else
     257             :     {
     258             :         assert(s == nfa->slast);
     259       10868 :         nfa->slast = s->prev;
     260             :     }
     261      176482 :     if (s->prev != NULL)
     262      176482 :         s->prev->next = s->next;
     263             :     else
     264             :     {
     265             :         assert(s == nfa->states);
     266           0 :         nfa->states = s->next;
     267             :     }
     268      176482 :     s->prev = NULL;
     269      176482 :     s->next = nfa->freestates;    /* don't delete it, put it on the free list */
     270      176482 :     nfa->freestates = s;
     271      176482 : }
     272             : 
     273             : /*
     274             :  * newarc - set up a new arc within an NFA
     275             :  *
     276             :  * This function checks to make sure that no duplicate arcs are created.
     277             :  * In general we never want duplicates.
     278             :  *
     279             :  * However: in principle, a RAINBOW arc is redundant with any plain arc
     280             :  * (unless that arc is for a pseudocolor).  But we don't try to recognize
     281             :  * that redundancy, either here or in allied operations such as moveins().
     282             :  * The pseudocolor consideration makes that more costly than it seems worth.
     283             :  */
     284             : static void
     285     1611894 : newarc(struct nfa *nfa,
     286             :        int t,
     287             :        color co,
     288             :        struct state *from,
     289             :        struct state *to)
     290             : {
     291             :     struct arc *a;
     292             : 
     293             :     assert(from != NULL && to != NULL);
     294             : 
     295             :     /*
     296             :      * This is a handy place to check for operation cancel during regex
     297             :      * compilation, since no code path will go very long without making a new
     298             :      * state or arc.
     299             :      */
     300     1611894 :     if (CANCEL_REQUESTED(nfa->v->re))
     301             :     {
     302           0 :         NERR(REG_CANCEL);
     303           0 :         return;
     304             :     }
     305             : 
     306             :     /* check for duplicate arc, using whichever chain is shorter */
     307     1611894 :     if (from->nouts <= to->nins)
     308             :     {
     309     5257826 :         for (a = from->outs; a != NULL; a = a->outchain)
     310     4595058 :             if (a->to == to && a->co == co && a->type == t)
     311      165956 :                 return;
     312             :     }
     313             :     else
     314             :     {
     315    76143636 :         for (a = to->ins; a != NULL; a = a->inchain)
     316    75527060 :             if (a->from == from && a->co == co && a->type == t)
     317      166594 :                 return;
     318             :     }
     319             : 
     320             :     /* no dup, so create the arc */
     321     1279344 :     createarc(nfa, t, co, from, to);
     322             : }
     323             : 
     324             : /*
     325             :  * createarc - create a new arc within an NFA
     326             :  *
     327             :  * This function must *only* be used after verifying that there is no existing
     328             :  * identical arc (same type/color/from/to).
     329             :  */
     330             : static void
     331    12328810 : createarc(struct nfa *nfa,
     332             :           int t,
     333             :           color co,
     334             :           struct state *from,
     335             :           struct state *to)
     336             : {
     337             :     struct arc *a;
     338             : 
     339    12328810 :     a = allocarc(nfa);
     340    12328810 :     if (NISERR())
     341        7612 :         return;
     342             :     assert(a != NULL);
     343             : 
     344    12321198 :     a->type = t;
     345    12321198 :     a->co = co;
     346    12321198 :     a->to = to;
     347    12321198 :     a->from = from;
     348             : 
     349             :     /*
     350             :      * Put the new arc on the beginning, not the end, of the chains; it's
     351             :      * simpler here, and freearc() is the same cost either way.  See also the
     352             :      * logic in moveins() and its cohorts, as well as fixempties().
     353             :      */
     354    12321198 :     a->inchain = to->ins;
     355    12321198 :     a->inchainRev = NULL;
     356    12321198 :     if (to->ins)
     357    11897746 :         to->ins->inchainRev = a;
     358    12321198 :     to->ins = a;
     359    12321198 :     a->outchain = from->outs;
     360    12321198 :     a->outchainRev = NULL;
     361    12321198 :     if (from->outs)
     362    11944438 :         from->outs->outchainRev = a;
     363    12321198 :     from->outs = a;
     364             : 
     365    12321198 :     from->nouts++;
     366    12321198 :     to->nins++;
     367             : 
     368    12321198 :     if (COLORED(a) && nfa->parent == NULL)
     369     1495402 :         colorchain(nfa->cm, a);
     370             : }
     371             : 
     372             : /*
     373             :  * allocarc - allocate a new arc within an NFA
     374             :  */
     375             : static struct arc *             /* NULL for failure */
     376    12328810 : allocarc(struct nfa *nfa)
     377             : {
     378             :     struct arc *a;
     379             : 
     380             :     /* first, recycle anything that's on the freelist */
     381    12328810 :     if (nfa->freearcs != NULL)
     382             :     {
     383     1208624 :         a = nfa->freearcs;
     384     1208624 :         nfa->freearcs = a->freechain;
     385             :     }
     386             :     /* otherwise, is there anything left in the last arcbatch? */
     387    11120186 :     else if (nfa->lastab != NULL && nfa->lastabused < nfa->lastab->narcs)
     388             :     {
     389    11087544 :         a = &nfa->lastab->a[nfa->lastabused++];
     390             :     }
     391             :     /* otherwise, need to allocate a new arcbatch */
     392             :     else
     393             :     {
     394             :         struct arcbatch *newAb;
     395             :         size_t      narcs;
     396             : 
     397       32642 :         if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
     398             :         {
     399        7612 :             NERR(REG_ETOOBIG);
     400        7612 :             return NULL;
     401             :         }
     402       25030 :         narcs = (nfa->lastab != NULL) ? nfa->lastab->narcs * 2 : FIRSTABSIZE;
     403       25030 :         if (narcs > MAXABSIZE)
     404       10394 :             narcs = MAXABSIZE;
     405       25030 :         newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs));
     406       25030 :         if (newAb == NULL)
     407             :         {
     408           0 :             NERR(REG_ESPACE);
     409           0 :             return NULL;
     410             :         }
     411       25030 :         nfa->v->spaceused += ARCBATCHSIZE(narcs);
     412       25030 :         newAb->narcs = narcs;
     413       25030 :         newAb->next = nfa->lastab;
     414       25030 :         nfa->lastab = newAb;
     415       25030 :         nfa->lastabused = 1;
     416       25030 :         a = &newAb->a[0];
     417             :     }
     418             : 
     419    12321198 :     return a;
     420             : }
     421             : 
     422             : /*
     423             :  * freearc - free an arc
     424             :  */
     425             : static void
     426     1378804 : freearc(struct nfa *nfa,
     427             :         struct arc *victim)
     428             : {
     429     1378804 :     struct state *from = victim->from;
     430     1378804 :     struct state *to = victim->to;
     431             :     struct arc *predecessor;
     432             : 
     433             :     assert(victim->type != 0);
     434             : 
     435             :     /* take it off color chain if necessary */
     436     1378804 :     if (COLORED(victim) && nfa->parent == NULL)
     437      682028 :         uncolorchain(nfa->cm, victim);
     438             : 
     439             :     /* take it off source's out-chain */
     440             :     assert(from != NULL);
     441     1378804 :     predecessor = victim->outchainRev;
     442     1378804 :     if (predecessor == NULL)
     443             :     {
     444             :         assert(from->outs == victim);
     445      379688 :         from->outs = victim->outchain;
     446             :     }
     447             :     else
     448             :     {
     449             :         assert(predecessor->outchain == victim);
     450      999116 :         predecessor->outchain = victim->outchain;
     451             :     }
     452     1378804 :     if (victim->outchain != NULL)
     453             :     {
     454             :         assert(victim->outchain->outchainRev == victim);
     455      953864 :         victim->outchain->outchainRev = predecessor;
     456             :     }
     457     1378804 :     from->nouts--;
     458             : 
     459             :     /* take it off target's in-chain */
     460             :     assert(to != NULL);
     461     1378804 :     predecessor = victim->inchainRev;
     462     1378804 :     if (predecessor == NULL)
     463             :     {
     464             :         assert(to->ins == victim);
     465      636144 :         to->ins = victim->inchain;
     466             :     }
     467             :     else
     468             :     {
     469             :         assert(predecessor->inchain == victim);
     470      742660 :         predecessor->inchain = victim->inchain;
     471             :     }
     472     1378804 :     if (victim->inchain != NULL)
     473             :     {
     474             :         assert(victim->inchain->inchainRev == victim);
     475      976248 :         victim->inchain->inchainRev = predecessor;
     476             :     }
     477     1378804 :     to->nins--;
     478             : 
     479             :     /* clean up and place on NFA's free list */
     480     1378804 :     victim->type = 0;
     481     1378804 :     victim->from = NULL;     /* precautions... */
     482     1378804 :     victim->to = NULL;
     483     1378804 :     victim->inchain = NULL;
     484     1378804 :     victim->inchainRev = NULL;
     485     1378804 :     victim->outchain = NULL;
     486     1378804 :     victim->outchainRev = NULL;
     487     1378804 :     victim->freechain = nfa->freearcs;
     488     1378804 :     nfa->freearcs = victim;
     489     1378804 : }
     490             : 
     491             : /*
     492             :  * changearcsource - flip an arc to have a different from state
     493             :  *
     494             :  * Caller must have verified that there is no pre-existing duplicate arc.
     495             :  */
     496             : static void
     497         588 : changearcsource(struct arc *a, struct state *newfrom)
     498             : {
     499         588 :     struct state *oldfrom = a->from;
     500             :     struct arc *predecessor;
     501             : 
     502             :     assert(oldfrom != newfrom);
     503             : 
     504             :     /* take it off old source's out-chain */
     505             :     assert(oldfrom != NULL);
     506         588 :     predecessor = a->outchainRev;
     507         588 :     if (predecessor == NULL)
     508             :     {
     509             :         assert(oldfrom->outs == a);
     510         588 :         oldfrom->outs = a->outchain;
     511             :     }
     512             :     else
     513             :     {
     514             :         assert(predecessor->outchain == a);
     515           0 :         predecessor->outchain = a->outchain;
     516             :     }
     517         588 :     if (a->outchain != NULL)
     518             :     {
     519             :         assert(a->outchain->outchainRev == a);
     520         574 :         a->outchain->outchainRev = predecessor;
     521             :     }
     522         588 :     oldfrom->nouts--;
     523             : 
     524         588 :     a->from = newfrom;
     525             : 
     526             :     /* prepend it to new source's out-chain */
     527         588 :     a->outchain = newfrom->outs;
     528         588 :     a->outchainRev = NULL;
     529         588 :     if (newfrom->outs)
     530         588 :         newfrom->outs->outchainRev = a;
     531         588 :     newfrom->outs = a;
     532         588 :     newfrom->nouts++;
     533         588 : }
     534             : 
     535             : /*
     536             :  * changearctarget - flip an arc to have a different to state
     537             :  *
     538             :  * Caller must have verified that there is no pre-existing duplicate arc.
     539             :  */
     540             : static void
     541         320 : changearctarget(struct arc *a, struct state *newto)
     542             : {
     543         320 :     struct state *oldto = a->to;
     544             :     struct arc *predecessor;
     545             : 
     546             :     assert(oldto != newto);
     547             : 
     548             :     /* take it off old target's in-chain */
     549             :     assert(oldto != NULL);
     550         320 :     predecessor = a->inchainRev;
     551         320 :     if (predecessor == NULL)
     552             :     {
     553             :         assert(oldto->ins == a);
     554         320 :         oldto->ins = a->inchain;
     555             :     }
     556             :     else
     557             :     {
     558             :         assert(predecessor->inchain == a);
     559           0 :         predecessor->inchain = a->inchain;
     560             :     }
     561         320 :     if (a->inchain != NULL)
     562             :     {
     563             :         assert(a->inchain->inchainRev == a);
     564         312 :         a->inchain->inchainRev = predecessor;
     565             :     }
     566         320 :     oldto->nins--;
     567             : 
     568         320 :     a->to = newto;
     569             : 
     570             :     /* prepend it to new target's in-chain */
     571         320 :     a->inchain = newto->ins;
     572         320 :     a->inchainRev = NULL;
     573         320 :     if (newto->ins)
     574         320 :         newto->ins->inchainRev = a;
     575         320 :     newto->ins = a;
     576         320 :     newto->nins++;
     577         320 : }
     578             : 
     579             : /*
     580             :  * hasnonemptyout - Does state have a non-EMPTY out arc?
     581             :  */
     582             : static int
     583      152712 : hasnonemptyout(struct state *s)
     584             : {
     585             :     struct arc *a;
     586             : 
     587      174444 :     for (a = s->outs; a != NULL; a = a->outchain)
     588             :     {
     589      171220 :         if (a->type != EMPTY)
     590      149488 :             return 1;
     591             :     }
     592        3224 :     return 0;
     593             : }
     594             : 
     595             : /*
     596             :  * findarc - find arc, if any, from given source with given type and color
     597             :  * If there is more than one such arc, the result is random.
     598             :  */
     599             : static struct arc *
     600         912 : findarc(struct state *s,
     601             :         int type,
     602             :         color co)
     603             : {
     604             :     struct arc *a;
     605             : 
     606        2320 :     for (a = s->outs; a != NULL; a = a->outchain)
     607        1410 :         if (a->type == type && a->co == co)
     608           2 :             return a;
     609         910 :     return NULL;
     610             : }
     611             : 
     612             : /*
     613             :  * cparc - allocate a new arc within an NFA, copying details from old one
     614             :  */
     615             : static void
     616     1335774 : cparc(struct nfa *nfa,
     617             :       struct arc *oa,
     618             :       struct state *from,
     619             :       struct state *to)
     620             : {
     621     1335774 :     newarc(nfa, oa->type, oa->co, from, to);
     622     1335774 : }
     623             : 
     624             : /*
     625             :  * sortins - sort the in arcs of a state by from/color/type
     626             :  */
     627             : static void
     628       24288 : sortins(struct nfa *nfa,
     629             :         struct state *s)
     630             : {
     631             :     struct arc **sortarray;
     632             :     struct arc *a;
     633       24288 :     int         n = s->nins;
     634             :     int         i;
     635             : 
     636       24288 :     if (n <= 1)
     637           4 :         return;                 /* nothing to do */
     638             :     /* make an array of arc pointers ... */
     639       24284 :     sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
     640       24284 :     if (sortarray == NULL)
     641             :     {
     642           0 :         NERR(REG_ESPACE);
     643           0 :         return;
     644             :     }
     645       24284 :     i = 0;
     646      115598 :     for (a = s->ins; a != NULL; a = a->inchain)
     647       91314 :         sortarray[i++] = a;
     648             :     assert(i == n);
     649             :     /* ... sort the array */
     650       24284 :     qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
     651             :     /* ... and rebuild arc list in order */
     652             :     /* it seems worth special-casing first and last items to simplify loop */
     653       24284 :     a = sortarray[0];
     654       24284 :     s->ins = a;
     655       24284 :     a->inchain = sortarray[1];
     656       24284 :     a->inchainRev = NULL;
     657       67030 :     for (i = 1; i < n - 1; i++)
     658             :     {
     659       42746 :         a = sortarray[i];
     660       42746 :         a->inchain = sortarray[i + 1];
     661       42746 :         a->inchainRev = sortarray[i - 1];
     662             :     }
     663       24284 :     a = sortarray[i];
     664       24284 :     a->inchain = NULL;
     665       24284 :     a->inchainRev = sortarray[i - 1];
     666       24284 :     FREE(sortarray);
     667             : }
     668             : 
     669             : static int
     670    51826560 : sortins_cmp(const void *a, const void *b)
     671             : {
     672    51826560 :     const struct arc *aa = *((const struct arc *const *) a);
     673    51826560 :     const struct arc *bb = *((const struct arc *const *) b);
     674             : 
     675             :     /* we check the fields in the order they are most likely to be different */
     676    51826560 :     if (aa->from->no < bb->from->no)
     677    41178004 :         return -1;
     678    10648556 :     if (aa->from->no > bb->from->no)
     679    10289502 :         return 1;
     680      359054 :     if (aa->co < bb->co)
     681      188552 :         return -1;
     682      170502 :     if (aa->co > bb->co)
     683      167888 :         return 1;
     684        2614 :     if (aa->type < bb->type)
     685          86 :         return -1;
     686        2528 :     if (aa->type > bb->type)
     687          48 :         return 1;
     688        2480 :     return 0;
     689             : }
     690             : 
     691             : /*
     692             :  * sortouts - sort the out arcs of a state by to/color/type
     693             :  */
     694             : static void
     695          32 : sortouts(struct nfa *nfa,
     696             :          struct state *s)
     697             : {
     698             :     struct arc **sortarray;
     699             :     struct arc *a;
     700          32 :     int         n = s->nouts;
     701             :     int         i;
     702             : 
     703          32 :     if (n <= 1)
     704           0 :         return;                 /* nothing to do */
     705             :     /* make an array of arc pointers ... */
     706          32 :     sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
     707          32 :     if (sortarray == NULL)
     708             :     {
     709           0 :         NERR(REG_ESPACE);
     710           0 :         return;
     711             :     }
     712          32 :     i = 0;
     713         968 :     for (a = s->outs; a != NULL; a = a->outchain)
     714         936 :         sortarray[i++] = a;
     715             :     assert(i == n);
     716             :     /* ... sort the array */
     717          32 :     qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
     718             :     /* ... and rebuild arc list in order */
     719             :     /* it seems worth special-casing first and last items to simplify loop */
     720          32 :     a = sortarray[0];
     721          32 :     s->outs = a;
     722          32 :     a->outchain = sortarray[1];
     723          32 :     a->outchainRev = NULL;
     724         904 :     for (i = 1; i < n - 1; i++)
     725             :     {
     726         872 :         a = sortarray[i];
     727         872 :         a->outchain = sortarray[i + 1];
     728         872 :         a->outchainRev = sortarray[i - 1];
     729             :     }
     730          32 :     a = sortarray[i];
     731          32 :     a->outchain = NULL;
     732          32 :     a->outchainRev = sortarray[i - 1];
     733          32 :     FREE(sortarray);
     734             : }
     735             : 
     736             : static int
     737        5492 : sortouts_cmp(const void *a, const void *b)
     738             : {
     739        5492 :     const struct arc *aa = *((const struct arc *const *) a);
     740        5492 :     const struct arc *bb = *((const struct arc *const *) b);
     741             : 
     742             :     /* we check the fields in the order they are most likely to be different */
     743        5492 :     if (aa->to->no < bb->to->no)
     744         610 :         return -1;
     745        4882 :     if (aa->to->no > bb->to->no)
     746         160 :         return 1;
     747        4722 :     if (aa->co < bb->co)
     748        2562 :         return -1;
     749        2160 :     if (aa->co > bb->co)
     750        2116 :         return 1;
     751          44 :     if (aa->type < bb->type)
     752           2 :         return -1;
     753          42 :     if (aa->type > bb->type)
     754          14 :         return 1;
     755          28 :     return 0;
     756             : }
     757             : 
     758             : /*
     759             :  * Common decision logic about whether to use arc-by-arc operations or
     760             :  * sort/merge.  If there's just a few source arcs we cannot recoup the
     761             :  * cost of sorting the destination arc list, no matter how large it is.
     762             :  * Otherwise, limit the number of arc-by-arc comparisons to about 1000
     763             :  * (a somewhat arbitrary choice, but the breakeven point would probably
     764             :  * be machine dependent anyway).
     765             :  */
     766             : #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
     767             :     ((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
     768             : 
     769             : /*
     770             :  * moveins - move all in arcs of a state to another state
     771             :  *
     772             :  * You might think this could be done better by just updating the
     773             :  * existing arcs, and you would be right if it weren't for the need
     774             :  * for duplicate suppression, which makes it easier to just make new
     775             :  * ones to exploit the suppression built into newarc.
     776             :  *
     777             :  * However, if we have a whole lot of arcs to deal with, retail duplicate
     778             :  * checks become too slow.  In that case we proceed by sorting and merging
     779             :  * the arc lists, and then we can indeed just update the arcs in-place.
     780             :  *
     781             :  * On the other hand, it's also true that this is frequently called with
     782             :  * a brand-new newState that has no existing in-arcs.  In that case,
     783             :  * de-duplication is unnecessary, so we can just blindly move all the arcs.
     784             :  */
     785             : static void
     786      182656 : moveins(struct nfa *nfa,
     787             :         struct state *oldState,
     788             :         struct state *newState)
     789             : {
     790             :     assert(oldState != newState);
     791             : 
     792      182656 :     if (newState->nins == 0)
     793             :     {
     794             :         /* No need for de-duplication */
     795             :         struct arc *a;
     796             : 
     797      131750 :         while ((a = oldState->ins) != NULL)
     798             :         {
     799       69412 :             createarc(nfa, a->type, a->co, a->from, newState);
     800       69412 :             freearc(nfa, a);
     801             :         }
     802             :     }
     803      120318 :     else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
     804      120242 :     {
     805             :         /* With not too many arcs, just do them one at a time */
     806             :         struct arc *a;
     807             : 
     808      304186 :         while ((a = oldState->ins) != NULL)
     809             :         {
     810      183944 :             cparc(nfa, a, a->from, newState);
     811      183944 :             freearc(nfa, a);
     812             :         }
     813             :     }
     814             :     else
     815             :     {
     816             :         /*
     817             :          * With many arcs, use a sort-merge approach.  Note changearctarget()
     818             :          * will put the arc onto the front of newState's chain, so it does not
     819             :          * break our walk through the sorted part of the chain.
     820             :          */
     821             :         struct arc *oa;
     822             :         struct arc *na;
     823             : 
     824             :         /*
     825             :          * Because we bypass newarc() in this code path, we'd better include a
     826             :          * cancel check.
     827             :          */
     828          76 :         if (CANCEL_REQUESTED(nfa->v->re))
     829             :         {
     830           0 :             NERR(REG_CANCEL);
     831           0 :             return;
     832             :         }
     833             : 
     834          76 :         sortins(nfa, oldState);
     835          76 :         sortins(nfa, newState);
     836          76 :         if (NISERR())
     837           0 :             return;             /* might have failed to sort */
     838          76 :         oa = oldState->ins;
     839          76 :         na = newState->ins;
     840        2718 :         while (oa != NULL && na != NULL)
     841             :         {
     842        2638 :             struct arc *a = oa;
     843             : 
     844        2638 :             switch (sortins_cmp(&oa, &na))
     845             :             {
     846         166 :                 case -1:
     847             :                     /* newState does not have anything matching oa */
     848         166 :                     oa = oa->inchain;
     849             : 
     850             :                     /*
     851             :                      * Rather than doing createarc+freearc, we can just unlink
     852             :                      * and relink the existing arc struct.
     853             :                      */
     854         166 :                     changearctarget(a, newState);
     855         166 :                     break;
     856         384 :                 case 0:
     857             :                     /* match, advance in both lists */
     858         384 :                     oa = oa->inchain;
     859         384 :                     na = na->inchain;
     860             :                     /* ... and drop duplicate arc from oldState */
     861         384 :                     freearc(nfa, a);
     862         384 :                     break;
     863        2088 :                 case +1:
     864             :                     /* advance only na; oa might have a match later */
     865        2088 :                     na = na->inchain;
     866        2088 :                     break;
     867        2714 :                 default:
     868             :                     assert(NOTREACHED);
     869             :             }
     870             :         }
     871         230 :         while (oa != NULL)
     872             :         {
     873             :             /* newState does not have anything matching oa */
     874         154 :             struct arc *a = oa;
     875             : 
     876         154 :             oa = oa->inchain;
     877         154 :             changearctarget(a, newState);
     878             :         }
     879             :     }
     880             : 
     881             :     assert(oldState->nins == 0);
     882             :     assert(oldState->ins == NULL);
     883             : }
     884             : 
     885             : /*
     886             :  * copyins - copy in arcs of a state to another state
     887             :  *
     888             :  * The comments for moveins() apply here as well.  However, in current
     889             :  * usage, this is *only* called with brand-new target states, so that
     890             :  * only the "no need for de-duplication" code path is ever reached.
     891             :  * We keep the rest #ifdef'd out in case it's needed in the future.
     892             :  */
     893             : static void
     894       17156 : copyins(struct nfa *nfa,
     895             :         struct state *oldState,
     896             :         struct state *newState)
     897             : {
     898             :     assert(oldState != newState);
     899             :     assert(newState->nins == 0); /* see comment above */
     900             : 
     901       17156 :     if (newState->nins == 0)
     902             :     {
     903             :         /* No need for de-duplication */
     904             :         struct arc *a;
     905             : 
     906      244362 :         for (a = oldState->ins; a != NULL; a = a->inchain)
     907      227206 :             createarc(nfa, a->type, a->co, a->from, newState);
     908             :     }
     909             : #ifdef NOT_USED                 /* see comment above */
     910             :     else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
     911             :     {
     912             :         /* With not too many arcs, just do them one at a time */
     913             :         struct arc *a;
     914             : 
     915             :         for (a = oldState->ins; a != NULL; a = a->inchain)
     916             :             cparc(nfa, a, a->from, newState);
     917             :     }
     918             :     else
     919             :     {
     920             :         /*
     921             :          * With many arcs, use a sort-merge approach.  Note that createarc()
     922             :          * will put new arcs onto the front of newState's chain, so it does
     923             :          * not break our walk through the sorted part of the chain.
     924             :          */
     925             :         struct arc *oa;
     926             :         struct arc *na;
     927             : 
     928             :         /*
     929             :          * Because we bypass newarc() in this code path, we'd better include a
     930             :          * cancel check.
     931             :          */
     932             :         if (CANCEL_REQUESTED(nfa->v->re))
     933             :         {
     934             :             NERR(REG_CANCEL);
     935             :             return;
     936             :         }
     937             : 
     938             :         sortins(nfa, oldState);
     939             :         sortins(nfa, newState);
     940             :         if (NISERR())
     941             :             return;             /* might have failed to sort */
     942             :         oa = oldState->ins;
     943             :         na = newState->ins;
     944             :         while (oa != NULL && na != NULL)
     945             :         {
     946             :             struct arc *a = oa;
     947             : 
     948             :             switch (sortins_cmp(&oa, &na))
     949             :             {
     950             :                 case -1:
     951             :                     /* newState does not have anything matching oa */
     952             :                     oa = oa->inchain;
     953             :                     createarc(nfa, a->type, a->co, a->from, newState);
     954             :                     break;
     955             :                 case 0:
     956             :                     /* match, advance in both lists */
     957             :                     oa = oa->inchain;
     958             :                     na = na->inchain;
     959             :                     break;
     960             :                 case +1:
     961             :                     /* advance only na; oa might have a match later */
     962             :                     na = na->inchain;
     963             :                     break;
     964             :                 default:
     965             :                     assert(NOTREACHED);
     966             :             }
     967             :         }
     968             :         while (oa != NULL)
     969             :         {
     970             :             /* newState does not have anything matching oa */
     971             :             struct arc *a = oa;
     972             : 
     973             :             oa = oa->inchain;
     974             :             createarc(nfa, a->type, a->co, a->from, newState);
     975             :         }
     976             :     }
     977             : #endif                          /* NOT_USED */
     978       17156 : }
     979             : 
     980             : /*
     981             :  * mergeins - merge a list of inarcs into a state
     982             :  *
     983             :  * This is much like copyins, but the source arcs are listed in an array,
     984             :  * and are not guaranteed unique.  It's okay to clobber the array contents.
     985             :  */
     986             : static void
     987      176316 : mergeins(struct nfa *nfa,
     988             :          struct state *s,
     989             :          struct arc **arcarray,
     990             :          int arccount)
     991             : {
     992             :     struct arc *na;
     993             :     int         i;
     994             :     int         j;
     995             : 
     996      176316 :     if (arccount <= 0)
     997      152180 :         return;
     998             : 
     999             :     /*
    1000             :      * Because we bypass newarc() in this code path, we'd better include a
    1001             :      * cancel check.
    1002             :      */
    1003       24136 :     if (CANCEL_REQUESTED(nfa->v->re))
    1004             :     {
    1005           0 :         NERR(REG_CANCEL);
    1006           0 :         return;
    1007             :     }
    1008             : 
    1009             :     /* Sort existing inarcs as well as proposed new ones */
    1010       24136 :     sortins(nfa, s);
    1011       24136 :     if (NISERR())
    1012           0 :         return;                 /* might have failed to sort */
    1013             : 
    1014       24136 :     qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
    1015             : 
    1016             :     /*
    1017             :      * arcarray very likely includes dups, so we must eliminate them.  (This
    1018             :      * could be folded into the next loop, but it's not worth the trouble.)
    1019             :      */
    1020       24136 :     j = 0;
    1021    10268730 :     for (i = 1; i < arccount; i++)
    1022             :     {
    1023    10244594 :         switch (sortins_cmp(&arcarray[j], &arcarray[i]))
    1024             :         {
    1025    10243598 :             case -1:
    1026             :                 /* non-dup */
    1027    10243598 :                 arcarray[++j] = arcarray[i];
    1028    10243598 :                 break;
    1029         996 :             case 0:
    1030             :                 /* dup */
    1031         996 :                 break;
    1032    10244594 :             default:
    1033             :                 /* trouble */
    1034             :                 assert(NOTREACHED);
    1035             :         }
    1036             :     }
    1037       24136 :     arccount = j + 1;
    1038             : 
    1039             :     /*
    1040             :      * Now merge into s' inchain.  Note that createarc() will put new arcs
    1041             :      * onto the front of s's chain, so it does not break our walk through the
    1042             :      * sorted part of the chain.
    1043             :      */
    1044       24136 :     i = 0;
    1045       24136 :     na = s->ins;
    1046    10298286 :     while (i < arccount && na != NULL)
    1047             :     {
    1048    10274150 :         struct arc *a = arcarray[i];
    1049             : 
    1050    10274150 :         switch (sortins_cmp(&a, &na))
    1051             :         {
    1052    10238258 :             case -1:
    1053             :                 /* s does not have anything matching a */
    1054    10238258 :                 createarc(nfa, a->type, a->co, a->from, s);
    1055    10238258 :                 i++;
    1056    10238258 :                 break;
    1057          16 :             case 0:
    1058             :                 /* match, advance in both lists */
    1059          16 :                 i++;
    1060          16 :                 na = na->inchain;
    1061          16 :                 break;
    1062       35876 :             case +1:
    1063             :                 /* advance only na; array might have a match later */
    1064       35876 :                 na = na->inchain;
    1065       35876 :                 break;
    1066    10274150 :             default:
    1067             :                 assert(NOTREACHED);
    1068             :         }
    1069             :     }
    1070       53596 :     while (i < arccount)
    1071             :     {
    1072             :         /* s does not have anything matching a */
    1073       29460 :         struct arc *a = arcarray[i];
    1074             : 
    1075       29460 :         createarc(nfa, a->type, a->co, a->from, s);
    1076       29460 :         i++;
    1077             :     }
    1078             : }
    1079             : 
    1080             : /*
    1081             :  * moveouts - move all out arcs of a state to another state
    1082             :  *
    1083             :  * See comments for moveins()
    1084             :  */
    1085             : static void
    1086       56672 : moveouts(struct nfa *nfa,
    1087             :          struct state *oldState,
    1088             :          struct state *newState)
    1089             : {
    1090             :     assert(oldState != newState);
    1091             : 
    1092       56672 :     if (newState->nouts == 0)
    1093             :     {
    1094             :         /* No need for de-duplication */
    1095             :         struct arc *a;
    1096             : 
    1097       43536 :         while ((a = oldState->outs) != NULL)
    1098             :         {
    1099       25020 :             createarc(nfa, a->type, a->co, newState, a->to);
    1100       25020 :             freearc(nfa, a);
    1101             :         }
    1102             :     }
    1103       38156 :     else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
    1104       38140 :     {
    1105             :         /* With not too many arcs, just do them one at a time */
    1106             :         struct arc *a;
    1107             : 
    1108       95626 :         while ((a = oldState->outs) != NULL)
    1109             :         {
    1110       57486 :             cparc(nfa, a, newState, a->to);
    1111       57486 :             freearc(nfa, a);
    1112             :         }
    1113             :     }
    1114             :     else
    1115             :     {
    1116             :         /*
    1117             :          * With many arcs, use a sort-merge approach.  Note changearcsource()
    1118             :          * will put the arc onto the front of newState's chain, so it does not
    1119             :          * break our walk through the sorted part of the chain.
    1120             :          */
    1121             :         struct arc *oa;
    1122             :         struct arc *na;
    1123             : 
    1124             :         /*
    1125             :          * Because we bypass newarc() in this code path, we'd better include a
    1126             :          * cancel check.
    1127             :          */
    1128          16 :         if (CANCEL_REQUESTED(nfa->v->re))
    1129             :         {
    1130           0 :             NERR(REG_CANCEL);
    1131           0 :             return;
    1132             :         }
    1133             : 
    1134          16 :         sortouts(nfa, oldState);
    1135          16 :         sortouts(nfa, newState);
    1136          16 :         if (NISERR())
    1137           0 :             return;             /* might have failed to sort */
    1138          16 :         oa = oldState->outs;
    1139          16 :         na = newState->outs;
    1140         578 :         while (oa != NULL && na != NULL)
    1141             :         {
    1142         560 :             struct arc *a = oa;
    1143             : 
    1144         560 :             switch (sortouts_cmp(&oa, &na))
    1145             :             {
    1146         516 :                 case -1:
    1147             :                     /* newState does not have anything matching oa */
    1148         516 :                     oa = oa->outchain;
    1149             : 
    1150             :                     /*
    1151             :                      * Rather than doing createarc+freearc, we can just unlink
    1152             :                      * and relink the existing arc struct.
    1153             :                      */
    1154         516 :                     changearcsource(a, newState);
    1155         516 :                     break;
    1156          28 :                 case 0:
    1157             :                     /* match, advance in both lists */
    1158          28 :                     oa = oa->outchain;
    1159          28 :                     na = na->outchain;
    1160             :                     /* ... and drop duplicate arc from oldState */
    1161          28 :                     freearc(nfa, a);
    1162          28 :                     break;
    1163          16 :                 case +1:
    1164             :                     /* advance only na; oa might have a match later */
    1165          16 :                     na = na->outchain;
    1166          16 :                     break;
    1167         576 :                 default:
    1168             :                     assert(NOTREACHED);
    1169             :             }
    1170             :         }
    1171          88 :         while (oa != NULL)
    1172             :         {
    1173             :             /* newState does not have anything matching oa */
    1174          72 :             struct arc *a = oa;
    1175             : 
    1176          72 :             oa = oa->outchain;
    1177          72 :             changearcsource(a, newState);
    1178             :         }
    1179             :     }
    1180             : 
    1181             :     assert(oldState->nouts == 0);
    1182             :     assert(oldState->outs == NULL);
    1183             : }
    1184             : 
    1185             : /*
    1186             :  * copyouts - copy out arcs of a state to another state
    1187             :  *
    1188             :  * See comments for copyins()
    1189             :  */
    1190             : static void
    1191       13216 : copyouts(struct nfa *nfa,
    1192             :          struct state *oldState,
    1193             :          struct state *newState)
    1194             : {
    1195             :     assert(oldState != newState);
    1196             :     assert(newState->nouts == 0);    /* see comment above */
    1197             : 
    1198       13216 :     if (newState->nouts == 0)
    1199             :     {
    1200             :         /* No need for de-duplication */
    1201             :         struct arc *a;
    1202             : 
    1203      473326 :         for (a = oldState->outs; a != NULL; a = a->outchain)
    1204      460110 :             createarc(nfa, a->type, a->co, newState, a->to);
    1205             :     }
    1206             : #ifdef NOT_USED                 /* see comment above */
    1207             :     else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
    1208             :     {
    1209             :         /* With not too many arcs, just do them one at a time */
    1210             :         struct arc *a;
    1211             : 
    1212             :         for (a = oldState->outs; a != NULL; a = a->outchain)
    1213             :             cparc(nfa, a, newState, a->to);
    1214             :     }
    1215             :     else
    1216             :     {
    1217             :         /*
    1218             :          * With many arcs, use a sort-merge approach.  Note that createarc()
    1219             :          * will put new arcs onto the front of newState's chain, so it does
    1220             :          * not break our walk through the sorted part of the chain.
    1221             :          */
    1222             :         struct arc *oa;
    1223             :         struct arc *na;
    1224             : 
    1225             :         /*
    1226             :          * Because we bypass newarc() in this code path, we'd better include a
    1227             :          * cancel check.
    1228             :          */
    1229             :         if (CANCEL_REQUESTED(nfa->v->re))
    1230             :         {
    1231             :             NERR(REG_CANCEL);
    1232             :             return;
    1233             :         }
    1234             : 
    1235             :         sortouts(nfa, oldState);
    1236             :         sortouts(nfa, newState);
    1237             :         if (NISERR())
    1238             :             return;             /* might have failed to sort */
    1239             :         oa = oldState->outs;
    1240             :         na = newState->outs;
    1241             :         while (oa != NULL && na != NULL)
    1242             :         {
    1243             :             struct arc *a = oa;
    1244             : 
    1245             :             switch (sortouts_cmp(&oa, &na))
    1246             :             {
    1247             :                 case -1:
    1248             :                     /* newState does not have anything matching oa */
    1249             :                     oa = oa->outchain;
    1250             :                     createarc(nfa, a->type, a->co, newState, a->to);
    1251             :                     break;
    1252             :                 case 0:
    1253             :                     /* match, advance in both lists */
    1254             :                     oa = oa->outchain;
    1255             :                     na = na->outchain;
    1256             :                     break;
    1257             :                 case +1:
    1258             :                     /* advance only na; oa might have a match later */
    1259             :                     na = na->outchain;
    1260             :                     break;
    1261             :                 default:
    1262             :                     assert(NOTREACHED);
    1263             :             }
    1264             :         }
    1265             :         while (oa != NULL)
    1266             :         {
    1267             :             /* newState does not have anything matching oa */
    1268             :             struct arc *a = oa;
    1269             : 
    1270             :             oa = oa->outchain;
    1271             :             createarc(nfa, a->type, a->co, newState, a->to);
    1272             :         }
    1273             :     }
    1274             : #endif                          /* NOT_USED */
    1275       13216 : }
    1276             : 
    1277             : /*
    1278             :  * cloneouts - copy out arcs of a state to another state pair, modifying type
    1279             :  *
    1280             :  * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
    1281             :  * the same interpretation of "co".  It wouldn't be sensible with LACONs.
    1282             :  */
    1283             : static void
    1284         296 : cloneouts(struct nfa *nfa,
    1285             :           struct state *old,
    1286             :           struct state *from,
    1287             :           struct state *to,
    1288             :           int type)
    1289             : {
    1290             :     struct arc *a;
    1291             : 
    1292             :     assert(old != from);
    1293             :     assert(type == AHEAD || type == BEHIND);
    1294             : 
    1295        1058 :     for (a = old->outs; a != NULL; a = a->outchain)
    1296             :     {
    1297             :         assert(a->type == PLAIN);
    1298         762 :         newarc(nfa, type, a->co, from, to);
    1299             :     }
    1300         296 : }
    1301             : 
    1302             : /*
    1303             :  * delsub - delete a sub-NFA, updating subre pointers if necessary
    1304             :  *
    1305             :  * This uses a recursive traversal of the sub-NFA, marking already-seen
    1306             :  * states using their tmp pointer.
    1307             :  */
    1308             : static void
    1309        5846 : delsub(struct nfa *nfa,
    1310             :        struct state *lp,        /* the sub-NFA goes from here... */
    1311             :        struct state *rp)        /* ...to here, *not* inclusive */
    1312             : {
    1313             :     assert(lp != rp);
    1314             : 
    1315        5846 :     rp->tmp = rp;                /* mark end */
    1316             : 
    1317        5846 :     deltraverse(nfa, lp, lp);
    1318        5846 :     if (NISERR())
    1319           0 :         return;                 /* asserts might not hold after failure */
    1320             :     assert(lp->nouts == 0 && rp->nins == 0);  /* did the job */
    1321             :     assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
    1322             : 
    1323        5846 :     rp->tmp = NULL;              /* unmark end */
    1324        5846 :     lp->tmp = NULL;              /* and begin, marked by deltraverse */
    1325             : }
    1326             : 
    1327             : /*
    1328             :  * deltraverse - the recursive heart of delsub
    1329             :  * This routine's basic job is to destroy all out-arcs of the state.
    1330             :  */
    1331             : static void
    1332       17000 : deltraverse(struct nfa *nfa,
    1333             :             struct state *leftend,
    1334             :             struct state *s)
    1335             : {
    1336             :     struct arc *a;
    1337             :     struct state *to;
    1338             : 
    1339             :     /* Since this is recursive, it could be driven to stack overflow */
    1340       17000 :     if (STACK_TOO_DEEP(nfa->v->re))
    1341             :     {
    1342           0 :         NERR(REG_ETOOBIG);
    1343           0 :         return;
    1344             :     }
    1345             : 
    1346       17000 :     if (s->nouts == 0)
    1347         166 :         return;                 /* nothing to do */
    1348       16834 :     if (s->tmp != NULL)
    1349        5700 :         return;                 /* already in progress */
    1350             : 
    1351       11134 :     s->tmp = s;                  /* mark as in progress */
    1352             : 
    1353       22288 :     while ((a = s->outs) != NULL)
    1354             :     {
    1355       11154 :         to = a->to;
    1356       11154 :         deltraverse(nfa, leftend, to);
    1357       11154 :         if (NISERR())
    1358           0 :             return;             /* asserts might not hold after failure */
    1359             :         assert(to->nouts == 0 || to->tmp != NULL);
    1360       11154 :         freearc(nfa, a);
    1361       11154 :         if (to->nins == 0 && to->tmp == NULL)
    1362             :         {
    1363             :             assert(to->nouts == 0);
    1364        5288 :             freestate(nfa, to);
    1365             :         }
    1366             :     }
    1367             : 
    1368             :     assert(s->no != FREESTATE); /* we're still here */
    1369             :     assert(s == leftend || s->nins != 0);    /* and still reachable */
    1370             :     assert(s->nouts == 0);       /* but have no outarcs */
    1371             : 
    1372       11134 :     s->tmp = NULL;               /* we're done here */
    1373             : }
    1374             : 
    1375             : /*
    1376             :  * dupnfa - duplicate sub-NFA
    1377             :  *
    1378             :  * Another recursive traversal, this time using tmp to point to duplicates
    1379             :  * as well as mark already-seen states.  (You knew there was a reason why
    1380             :  * it's a state pointer, didn't you? :-))
    1381             :  */
    1382             : static void
    1383       10892 : dupnfa(struct nfa *nfa,
    1384             :        struct state *start,     /* duplicate of subNFA starting here */
    1385             :        struct state *stop,      /* and stopping here */
    1386             :        struct state *from,      /* stringing duplicate from here */
    1387             :        struct state *to)        /* to here */
    1388             : {
    1389       10892 :     if (start == stop)
    1390             :     {
    1391           0 :         newarc(nfa, EMPTY, 0, from, to);
    1392           0 :         return;
    1393             :     }
    1394             : 
    1395       10892 :     stop->tmp = to;
    1396       10892 :     duptraverse(nfa, start, from);
    1397             :     /* done, except for clearing out the tmp pointers */
    1398             : 
    1399       10892 :     stop->tmp = NULL;
    1400       10892 :     cleartraverse(nfa, start);
    1401             : }
    1402             : 
    1403             : /*
    1404             :  * duptraverse - recursive heart of dupnfa
    1405             :  */
    1406             : static void
    1407      301128 : duptraverse(struct nfa *nfa,
    1408             :             struct state *s,
    1409             :             struct state *stmp) /* s's duplicate, or NULL */
    1410             : {
    1411             :     struct arc *a;
    1412             : 
    1413             :     /* Since this is recursive, it could be driven to stack overflow */
    1414      301128 :     if (STACK_TOO_DEEP(nfa->v->re))
    1415             :     {
    1416           0 :         NERR(REG_ETOOBIG);
    1417           0 :         return;
    1418             :     }
    1419             : 
    1420      301128 :     if (s->tmp != NULL)
    1421      105678 :         return;                 /* already done */
    1422             : 
    1423      195450 :     s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
    1424      195450 :     if (s->tmp == NULL)
    1425             :     {
    1426             :         assert(NISERR());
    1427           0 :         return;
    1428             :     }
    1429             : 
    1430      485686 :     for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
    1431             :     {
    1432      290236 :         duptraverse(nfa, a->to, (struct state *) NULL);
    1433      290236 :         if (NISERR())
    1434           0 :             break;
    1435             :         assert(a->to->tmp != NULL);
    1436      290236 :         cparc(nfa, a, s->tmp, a->to->tmp);
    1437             :     }
    1438             : }
    1439             : 
    1440             : /*
    1441             :  * removeconstraints - remove any constraints in an NFA
    1442             :  *
    1443             :  * Constraint arcs are replaced by empty arcs, essentially treating all
    1444             :  * constraints as automatically satisfied.
    1445             :  */
    1446             : static void
    1447         166 : removeconstraints(struct nfa *nfa,
    1448             :                   struct state *start,  /* process subNFA starting here */
    1449             :                   struct state *stop)   /* and stopping here */
    1450             : {
    1451         166 :     if (start == stop)
    1452           0 :         return;
    1453             : 
    1454         166 :     stop->tmp = stop;
    1455         166 :     removetraverse(nfa, start);
    1456             :     /* done, except for clearing out the tmp pointers */
    1457             : 
    1458         166 :     stop->tmp = NULL;
    1459         166 :     cleartraverse(nfa, start);
    1460             : }
    1461             : 
    1462             : /*
    1463             :  * removetraverse - recursive heart of removeconstraints
    1464             :  */
    1465             : static void
    1466         478 : removetraverse(struct nfa *nfa,
    1467             :                struct state *s)
    1468             : {
    1469             :     struct arc *a;
    1470             :     struct arc *oa;
    1471             : 
    1472             :     /* Since this is recursive, it could be driven to stack overflow */
    1473         478 :     if (STACK_TOO_DEEP(nfa->v->re))
    1474             :     {
    1475           0 :         NERR(REG_ETOOBIG);
    1476           0 :         return;
    1477             :     }
    1478             : 
    1479         478 :     if (s->tmp != NULL)
    1480         212 :         return;                 /* already done */
    1481             : 
    1482         266 :     s->tmp = s;
    1483         578 :     for (a = s->outs; a != NULL && !NISERR(); a = oa)
    1484             :     {
    1485         312 :         removetraverse(nfa, a->to);
    1486         312 :         if (NISERR())
    1487           0 :             break;
    1488         312 :         oa = a->outchain;
    1489         312 :         switch (a->type)
    1490             :         {
    1491         298 :             case PLAIN:
    1492             :             case EMPTY:
    1493             :                 /* nothing to do */
    1494         298 :                 break;
    1495          14 :             case AHEAD:
    1496             :             case BEHIND:
    1497             :             case '^':
    1498             :             case '$':
    1499             :             case LACON:
    1500             :                 /* replace it */
    1501          14 :                 newarc(nfa, EMPTY, 0, s, a->to);
    1502          14 :                 freearc(nfa, a);
    1503          14 :                 break;
    1504           0 :             default:
    1505           0 :                 NERR(REG_ASSERT);
    1506           0 :                 break;
    1507             :         }
    1508             :     }
    1509             : }
    1510             : 
    1511             : /*
    1512             :  * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
    1513             :  */
    1514             : static void
    1515     1804618 : cleartraverse(struct nfa *nfa,
    1516             :               struct state *s)
    1517             : {
    1518             :     struct arc *a;
    1519             : 
    1520             :     /* Since this is recursive, it could be driven to stack overflow */
    1521     1804618 :     if (STACK_TOO_DEEP(nfa->v->re))
    1522             :     {
    1523           0 :         NERR(REG_ETOOBIG);
    1524           0 :         return;
    1525             :     }
    1526             : 
    1527     1804618 :     if (s->tmp == NULL)
    1528     1159350 :         return;
    1529      645268 :     s->tmp = NULL;
    1530             : 
    1531     2412004 :     for (a = s->outs; a != NULL; a = a->outchain)
    1532     1766736 :         cleartraverse(nfa, a->to);
    1533             : }
    1534             : 
    1535             : /*
    1536             :  * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
    1537             :  *
    1538             :  * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
    1539             :  * of a set of colors), return a state whose outarc list contains only PLAIN
    1540             :  * arcs of those color(s).  Otherwise return NULL.
    1541             :  *
    1542             :  * This is used before optimizing the NFA, so there may be EMPTY arcs, which
    1543             :  * we should ignore; the possibility of an EMPTY is why the result state could
    1544             :  * be different from s1.
    1545             :  *
    1546             :  * It's worth troubling to handle multiple parallel PLAIN arcs here because a
    1547             :  * bracket construct such as [abc] might yield either one or several parallel
    1548             :  * PLAIN arcs depending on earlier atoms in the expression.  We'd rather that
    1549             :  * that implementation detail not create user-visible performance differences.
    1550             :  */
    1551             : static struct state *
    1552         204 : single_color_transition(struct state *s1, struct state *s2)
    1553             : {
    1554             :     struct arc *a;
    1555             : 
    1556             :     /* Ignore leading EMPTY arc, if any */
    1557         204 :     if (s1->nouts == 1 && s1->outs->type == EMPTY)
    1558         204 :         s1 = s1->outs->to;
    1559             :     /* Likewise for any trailing EMPTY arc */
    1560         204 :     if (s2->nins == 1 && s2->ins->type == EMPTY)
    1561         204 :         s2 = s2->ins->from;
    1562             :     /* Perhaps we could have a single-state loop in between, if so reject */
    1563         204 :     if (s1 == s2)
    1564           0 :         return NULL;
    1565             :     /* s1 must have at least one outarc... */
    1566         204 :     if (s1->outs == NULL)
    1567           0 :         return NULL;
    1568             :     /* ... and they must all be PLAIN arcs to s2 */
    1569         346 :     for (a = s1->outs; a != NULL; a = a->outchain)
    1570             :     {
    1571         214 :         if (a->type != PLAIN || a->to != s2)
    1572          72 :             return NULL;
    1573             :     }
    1574             :     /* OK, return s1 as the possessor of the relevant outarcs */
    1575         132 :     return s1;
    1576             : }
    1577             : 
    1578             : /*
    1579             :  * specialcolors - fill in special colors for an NFA
    1580             :  */
    1581             : static void
    1582       13418 : specialcolors(struct nfa *nfa)
    1583             : {
    1584             :     /* false colors for BOS, BOL, EOS, EOL */
    1585       13418 :     if (nfa->parent == NULL)
    1586             :     {
    1587        5178 :         nfa->bos[0] = pseudocolor(nfa->cm);
    1588        5178 :         nfa->bos[1] = pseudocolor(nfa->cm);
    1589        5178 :         nfa->eos[0] = pseudocolor(nfa->cm);
    1590        5178 :         nfa->eos[1] = pseudocolor(nfa->cm);
    1591             :     }
    1592             :     else
    1593             :     {
    1594             :         assert(nfa->parent->bos[0] != COLORLESS);
    1595        8240 :         nfa->bos[0] = nfa->parent->bos[0];
    1596             :         assert(nfa->parent->bos[1] != COLORLESS);
    1597        8240 :         nfa->bos[1] = nfa->parent->bos[1];
    1598             :         assert(nfa->parent->eos[0] != COLORLESS);
    1599        8240 :         nfa->eos[0] = nfa->parent->eos[0];
    1600             :         assert(nfa->parent->eos[1] != COLORLESS);
    1601        8240 :         nfa->eos[1] = nfa->parent->eos[1];
    1602             :     }
    1603       13418 : }
    1604             : 
    1605             : /*
    1606             :  * optimize - optimize an NFA
    1607             :  *
    1608             :  * The main goal of this function is not so much "optimization" (though it
    1609             :  * does try to get rid of useless NFA states) as reducing the NFA to a form
    1610             :  * the regex executor can handle.  The executor, and indeed the cNFA format
    1611             :  * that is its input, can only handle PLAIN and LACON arcs.  The output of
    1612             :  * the regex parser also includes EMPTY (do-nothing) arcs, as well as
    1613             :  * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
    1614             :  * We first get rid of EMPTY arcs and then deal with the constraint arcs.
    1615             :  * The hardest part of either job is to get rid of circular loops of the
    1616             :  * target arc type.  We would have to do that in any case, though, as such a
    1617             :  * loop would otherwise allow the executor to cycle through the loop endlessly
    1618             :  * without making any progress in the input string.
    1619             :  */
    1620             : static long                     /* re_info bits */
    1621       13414 : optimize(struct nfa *nfa,
    1622             :          FILE *f)               /* for debug output; NULL none */
    1623             : {
    1624             : #ifdef REG_DEBUG
    1625             :     int         verbose = (f != NULL) ? 1 : 0;
    1626             : 
    1627             :     if (verbose)
    1628             :         fprintf(f, "\ninitial cleanup:\n");
    1629             : #endif
    1630       13414 :     cleanup(nfa);               /* may simplify situation */
    1631             : #ifdef REG_DEBUG
    1632             :     if (verbose)
    1633             :         dumpnfa(nfa, f);
    1634             :     if (verbose)
    1635             :         fprintf(f, "\nempties:\n");
    1636             : #endif
    1637       13414 :     fixempties(nfa, f);         /* get rid of EMPTY arcs */
    1638             : #ifdef REG_DEBUG
    1639             :     if (verbose)
    1640             :         fprintf(f, "\nconstraints:\n");
    1641             : #endif
    1642       13414 :     fixconstraintloops(nfa, f); /* get rid of constraint loops */
    1643       13414 :     pullback(nfa, f);           /* pull back constraints backward */
    1644       13414 :     pushfwd(nfa, f);            /* push fwd constraints forward */
    1645             : #ifdef REG_DEBUG
    1646             :     if (verbose)
    1647             :         fprintf(f, "\nfinal cleanup:\n");
    1648             : #endif
    1649       13414 :     cleanup(nfa);               /* final tidying */
    1650             : #ifdef REG_DEBUG
    1651             :     if (verbose)
    1652             :         dumpnfa(nfa, f);
    1653             : #endif
    1654       13414 :     return analyze(nfa);        /* and analysis */
    1655             : }
    1656             : 
    1657             : /*
    1658             :  * pullback - pull back constraints backward to eliminate them
    1659             :  */
    1660             : static void
    1661       20660 : pullback(struct nfa *nfa,
    1662             :          FILE *f)               /* for debug output; NULL none */
    1663             : {
    1664             :     struct state *s;
    1665             :     struct state *nexts;
    1666             :     struct arc *a;
    1667             :     struct arc *nexta;
    1668             :     struct state *intermediates;
    1669             :     int         progress;
    1670             : 
    1671             :     /* find and pull until there are no more */
    1672             :     do
    1673             :     {
    1674       20660 :         progress = 0;
    1675      295528 :         for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    1676             :         {
    1677      274868 :             nexts = s->next;
    1678      274868 :             intermediates = NULL;
    1679     1531774 :             for (a = s->outs; a != NULL && !NISERR(); a = nexta)
    1680             :             {
    1681     1256906 :                 nexta = a->outchain;
    1682     1256906 :                 if (a->type == '^' || a->type == BEHIND)
    1683       75884 :                     if (pull(nfa, a, &intermediates))
    1684       31662 :                         progress = 1;
    1685             :             }
    1686             :             /* clear tmp fields of intermediate states created here */
    1687      276856 :             while (intermediates != NULL)
    1688             :             {
    1689        1988 :                 struct state *ns = intermediates->tmp;
    1690             : 
    1691        1988 :                 intermediates->tmp = NULL;
    1692        1988 :                 intermediates = ns;
    1693             :             }
    1694             :             /* if s is now useless, get rid of it */
    1695      274868 :             if ((s->nins == 0 || s->nouts == 0) && !s->flag)
    1696       26976 :                 dropstate(nfa, s);
    1697             :         }
    1698       20660 :         if (progress && f != NULL)
    1699           0 :             dumpnfa(nfa, f);
    1700       20660 :     } while (progress && !NISERR());
    1701       13414 :     if (NISERR())
    1702           4 :         return;
    1703             : 
    1704             :     /*
    1705             :      * Any ^ constraints we were able to pull to the start state can now be
    1706             :      * replaced by PLAIN arcs referencing the BOS or BOL colors.  There should
    1707             :      * be no other ^ or BEHIND arcs left in the NFA, though we do not check
    1708             :      * that here (compact() will fail if so).
    1709             :      */
    1710       55244 :     for (a = nfa->pre->outs; a != NULL; a = nexta)
    1711             :     {
    1712       41834 :         nexta = a->outchain;
    1713       41834 :         if (a->type == '^')
    1714             :         {
    1715             :             assert(a->co == 0 || a->co == 1);
    1716       29134 :             newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
    1717       29134 :             freearc(nfa, a);
    1718             :         }
    1719             :     }
    1720             : }
    1721             : 
    1722             : /*
    1723             :  * pull - pull a back constraint backward past its source state
    1724             :  *
    1725             :  * Returns 1 if successful (which it always is unless the source is the
    1726             :  * start state or we have an internal error), 0 if nothing happened.
    1727             :  *
    1728             :  * A significant property of this function is that it deletes no pre-existing
    1729             :  * states, and no outarcs of the constraint's from state other than the given
    1730             :  * constraint arc.  This makes the loops in pullback() safe, at the cost that
    1731             :  * we may leave useless states behind.  Therefore, we leave it to pullback()
    1732             :  * to delete such states.
    1733             :  *
    1734             :  * If the from state has multiple back-constraint outarcs, and/or multiple
    1735             :  * compatible constraint inarcs, we only need to create one new intermediate
    1736             :  * state per combination of predecessor and successor states.  *intermediates
    1737             :  * points to a list of such intermediate states for this from state (chained
    1738             :  * through their tmp fields).
    1739             :  */
    1740             : static int
    1741       75884 : pull(struct nfa *nfa,
    1742             :      struct arc *con,
    1743             :      struct state **intermediates)
    1744             : {
    1745       75884 :     struct state *from = con->from;
    1746       75884 :     struct state *to = con->to;
    1747             :     struct arc *a;
    1748             :     struct arc *nexta;
    1749             :     struct state *s;
    1750             : 
    1751             :     assert(from != to);         /* should have gotten rid of this earlier */
    1752       75884 :     if (from->flag)              /* can't pull back beyond start */
    1753       44222 :         return 0;
    1754       31662 :     if (from->nins == 0)
    1755             :     {                           /* unreachable */
    1756        6396 :         freearc(nfa, con);
    1757        6396 :         return 1;
    1758             :     }
    1759             : 
    1760             :     /*
    1761             :      * First, clone from state if necessary to avoid other outarcs.  This may
    1762             :      * seem wasteful, but it simplifies the logic, and we'll get rid of the
    1763             :      * clone state again at the bottom.
    1764             :      */
    1765       25266 :     if (from->nouts > 1)
    1766             :     {
    1767       17156 :         s = newstate(nfa);
    1768       17156 :         if (NISERR())
    1769           0 :             return 0;
    1770       17156 :         copyins(nfa, from, s);  /* duplicate inarcs */
    1771       17156 :         cparc(nfa, con, s, to); /* move constraint arc */
    1772       17156 :         freearc(nfa, con);
    1773       17156 :         if (NISERR())
    1774           0 :             return 0;
    1775       17156 :         from = s;
    1776       17156 :         con = from->outs;
    1777             :     }
    1778             :     assert(from->nouts == 1);
    1779             : 
    1780             :     /* propagate the constraint into the from state's inarcs */
    1781      283958 :     for (a = from->ins; a != NULL && !NISERR(); a = nexta)
    1782             :     {
    1783      258692 :         nexta = a->inchain;
    1784      258692 :         switch (combine(nfa, con, a))
    1785             :         {
    1786       69760 :             case INCOMPATIBLE:  /* destroy the arc */
    1787       69760 :                 freearc(nfa, a);
    1788       69760 :                 break;
    1789       11978 :             case SATISFIED:     /* no action needed */
    1790       11978 :                 break;
    1791      174866 :             case COMPATIBLE:    /* swap the two arcs, more or less */
    1792             :                 /* need an intermediate state, but might have one already */
    1793      197912 :                 for (s = *intermediates; s != NULL; s = s->tmp)
    1794             :                 {
    1795             :                     assert(s->nins > 0 && s->nouts > 0);
    1796      195924 :                     if (s->ins->from == a->from && s->outs->to == to)
    1797      172878 :                         break;
    1798             :                 }
    1799      174866 :                 if (s == NULL)
    1800             :                 {
    1801        1988 :                     s = newstate(nfa);
    1802        1988 :                     if (NISERR())
    1803           0 :                         return 0;
    1804        1988 :                     s->tmp = *intermediates;
    1805        1988 :                     *intermediates = s;
    1806             :                 }
    1807      174866 :                 cparc(nfa, con, a->from, s);
    1808      174866 :                 cparc(nfa, a, s, to);
    1809      174866 :                 freearc(nfa, a);
    1810      174866 :                 break;
    1811        2088 :             case REPLACEARC:    /* replace arc's color */
    1812        2088 :                 newarc(nfa, a->type, con->co, a->from, to);
    1813        2088 :                 freearc(nfa, a);
    1814        2088 :                 break;
    1815           0 :             default:
    1816             :                 assert(NOTREACHED);
    1817           0 :                 break;
    1818             :         }
    1819             :     }
    1820             : 
    1821             :     /* remaining inarcs, if any, incorporate the constraint */
    1822       25266 :     moveins(nfa, from, to);
    1823       25266 :     freearc(nfa, con);
    1824             :     /* from state is now useless, but we leave it to pullback() to clean up */
    1825       25266 :     return 1;
    1826             : }
    1827             : 
    1828             : /*
    1829             :  * pushfwd - push forward constraints forward to eliminate them
    1830             :  */
    1831             : static void
    1832       19334 : pushfwd(struct nfa *nfa,
    1833             :         FILE *f)                /* for debug output; NULL none */
    1834             : {
    1835             :     struct state *s;
    1836             :     struct state *nexts;
    1837             :     struct arc *a;
    1838             :     struct arc *nexta;
    1839             :     struct state *intermediates;
    1840             :     int         progress;
    1841             : 
    1842             :     /* find and push until there are no more */
    1843             :     do
    1844             :     {
    1845       19334 :         progress = 0;
    1846      253634 :         for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    1847             :         {
    1848      234300 :             nexts = s->next;
    1849      234300 :             intermediates = NULL;
    1850     1375486 :             for (a = s->ins; a != NULL && !NISERR(); a = nexta)
    1851             :             {
    1852     1141186 :                 nexta = a->inchain;
    1853     1141186 :                 if (a->type == '$' || a->type == AHEAD)
    1854       51208 :                     if (push(nfa, a, &intermediates))
    1855       17390 :                         progress = 1;
    1856             :             }
    1857             :             /* clear tmp fields of intermediate states created here */
    1858      234304 :             while (intermediates != NULL)
    1859             :             {
    1860           4 :                 struct state *ns = intermediates->tmp;
    1861             : 
    1862           4 :                 intermediates->tmp = NULL;
    1863           4 :                 intermediates = ns;
    1864             :             }
    1865             :             /* if s is now useless, get rid of it */
    1866      234300 :             if ((s->nins == 0 || s->nouts == 0) && !s->flag)
    1867       17824 :                 dropstate(nfa, s);
    1868             :         }
    1869       19334 :         if (progress && f != NULL)
    1870           0 :             dumpnfa(nfa, f);
    1871       19334 :     } while (progress && !NISERR());
    1872       13414 :     if (NISERR())
    1873           4 :         return;
    1874             : 
    1875             :     /*
    1876             :      * Any $ constraints we were able to push to the post state can now be
    1877             :      * replaced by PLAIN arcs referencing the EOS or EOL colors.  There should
    1878             :      * be no other $ or AHEAD arcs left in the NFA, though we do not check
    1879             :      * that here (compact() will fail if so).
    1880             :      */
    1881       44740 :     for (a = nfa->post->ins; a != NULL; a = nexta)
    1882             :     {
    1883       31330 :         nexta = a->inchain;
    1884       31330 :         if (a->type == '$')
    1885             :         {
    1886             :             assert(a->co == 0 || a->co == 1);
    1887       21370 :             newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
    1888       21370 :             freearc(nfa, a);
    1889             :         }
    1890             :     }
    1891             : }
    1892             : 
    1893             : /*
    1894             :  * push - push a forward constraint forward past its destination state
    1895             :  *
    1896             :  * Returns 1 if successful (which it always is unless the destination is the
    1897             :  * post state or we have an internal error), 0 if nothing happened.
    1898             :  *
    1899             :  * A significant property of this function is that it deletes no pre-existing
    1900             :  * states, and no inarcs of the constraint's to state other than the given
    1901             :  * constraint arc.  This makes the loops in pushfwd() safe, at the cost that
    1902             :  * we may leave useless states behind.  Therefore, we leave it to pushfwd()
    1903             :  * to delete such states.
    1904             :  *
    1905             :  * If the to state has multiple forward-constraint inarcs, and/or multiple
    1906             :  * compatible constraint outarcs, we only need to create one new intermediate
    1907             :  * state per combination of predecessor and successor states.  *intermediates
    1908             :  * points to a list of such intermediate states for this to state (chained
    1909             :  * through their tmp fields).
    1910             :  */
    1911             : static int
    1912       51208 : push(struct nfa *nfa,
    1913             :      struct arc *con,
    1914             :      struct state **intermediates)
    1915             : {
    1916       51208 :     struct state *from = con->from;
    1917       51208 :     struct state *to = con->to;
    1918             :     struct arc *a;
    1919             :     struct arc *nexta;
    1920             :     struct state *s;
    1921             : 
    1922             :     assert(to != from);         /* should have gotten rid of this earlier */
    1923       51208 :     if (to->flag)                /* can't push forward beyond end */
    1924       33818 :         return 0;
    1925       17390 :     if (to->nouts == 0)
    1926             :     {                           /* dead end */
    1927         710 :         freearc(nfa, con);
    1928         710 :         return 1;
    1929             :     }
    1930             : 
    1931             :     /*
    1932             :      * First, clone to state if necessary to avoid other inarcs.  This may
    1933             :      * seem wasteful, but it simplifies the logic, and we'll get rid of the
    1934             :      * clone state again at the bottom.
    1935             :      */
    1936       16680 :     if (to->nins > 1)
    1937             :     {
    1938       10378 :         s = newstate(nfa);
    1939       10378 :         if (NISERR())
    1940           0 :             return 0;
    1941       10378 :         copyouts(nfa, to, s);   /* duplicate outarcs */
    1942       10378 :         cparc(nfa, con, from, s);   /* move constraint arc */
    1943       10378 :         freearc(nfa, con);
    1944       10378 :         if (NISERR())
    1945           0 :             return 0;
    1946       10378 :         to = s;
    1947       10378 :         con = to->ins;
    1948             :     }
    1949             :     assert(to->nins == 1);
    1950             : 
    1951             :     /* propagate the constraint into the to state's outarcs */
    1952      110448 :     for (a = to->outs; a != NULL && !NISERR(); a = nexta)
    1953             :     {
    1954       93768 :         nexta = a->outchain;
    1955       93768 :         switch (combine(nfa, con, a))
    1956             :         {
    1957       80758 :             case INCOMPATIBLE:  /* destroy the arc */
    1958       80758 :                 freearc(nfa, a);
    1959       80758 :                 break;
    1960       10110 :             case SATISFIED:     /* no action needed */
    1961       10110 :                 break;
    1962          16 :             case COMPATIBLE:    /* swap the two arcs, more or less */
    1963             :                 /* need an intermediate state, but might have one already */
    1964          16 :                 for (s = *intermediates; s != NULL; s = s->tmp)
    1965             :                 {
    1966             :                     assert(s->nins > 0 && s->nouts > 0);
    1967          12 :                     if (s->ins->from == from && s->outs->to == a->to)
    1968          12 :                         break;
    1969             :                 }
    1970          16 :                 if (s == NULL)
    1971             :                 {
    1972           4 :                     s = newstate(nfa);
    1973           4 :                     if (NISERR())
    1974           0 :                         return 0;
    1975           4 :                     s->tmp = *intermediates;
    1976           4 :                     *intermediates = s;
    1977             :                 }
    1978          16 :                 cparc(nfa, con, s, a->to);
    1979          16 :                 cparc(nfa, a, from, s);
    1980          16 :                 freearc(nfa, a);
    1981          16 :                 break;
    1982        2884 :             case REPLACEARC:    /* replace arc's color */
    1983        2884 :                 newarc(nfa, a->type, con->co, from, a->to);
    1984        2884 :                 freearc(nfa, a);
    1985        2884 :                 break;
    1986           0 :             default:
    1987             :                 assert(NOTREACHED);
    1988           0 :                 break;
    1989             :         }
    1990             :     }
    1991             : 
    1992             :     /* remaining outarcs, if any, incorporate the constraint */
    1993       16680 :     moveouts(nfa, to, from);
    1994       16680 :     freearc(nfa, con);
    1995             :     /* to state is now useless, but we leave it to pushfwd() to clean up */
    1996       16680 :     return 1;
    1997             : }
    1998             : 
    1999             : /*
    2000             :  * combine - constraint lands on an arc, what happens?
    2001             :  *
    2002             :  * #def INCOMPATIBLE    1   // destroys arc
    2003             :  * #def SATISFIED       2   // constraint satisfied
    2004             :  * #def COMPATIBLE      3   // compatible but not satisfied yet
    2005             :  * #def REPLACEARC      4   // replace arc's color with constraint color
    2006             :  */
    2007             : static int
    2008      352460 : combine(struct nfa *nfa,
    2009             :         struct arc *con,
    2010             :         struct arc *a)
    2011             : {
    2012             : #define  CA(ct,at)   (((ct)<<CHAR_BIT) | (at))
    2013             : 
    2014      352460 :     switch (CA(con->type, a->type))
    2015             :     {
    2016       69606 :         case CA('^', PLAIN):    /* newlines are handled separately */
    2017             :         case CA('$', PLAIN):
    2018       69606 :             return INCOMPATIBLE;
    2019             :             break;
    2020       35892 :         case CA(AHEAD, PLAIN):  /* color constraints meet colors */
    2021             :         case CA(BEHIND, PLAIN):
    2022       35892 :             if (con->co == a->co)
    2023        1428 :                 return SATISFIED;
    2024       34464 :             if (con->co == RAINBOW)
    2025             :             {
    2026             :                 /* con is satisfied unless arc's color is a pseudocolor */
    2027           4 :                 if (!(nfa->cm->cd[a->co].flags & PSEUDO))
    2028           4 :                     return SATISFIED;
    2029             :             }
    2030       34460 :             else if (a->co == RAINBOW)
    2031             :             {
    2032             :                 /* con is incompatible if it's for a pseudocolor */
    2033             :                 /* (this is hypothetical; we make no such constraints today) */
    2034        4964 :                 if (nfa->cm->cd[con->co].flags & PSEUDO)
    2035           0 :                     return INCOMPATIBLE;
    2036             :                 /* otherwise, constraint constrains arc to be only its color */
    2037        4964 :                 return REPLACEARC;
    2038             :             }
    2039       29496 :             return INCOMPATIBLE;
    2040             :             break;
    2041       33986 :         case CA('^', '^'):      /* collision, similar constraints */
    2042             :         case CA('$', '$'):
    2043       33986 :             if (con->co == a->co) /* true duplication */
    2044       19812 :                 return SATISFIED;
    2045       14174 :             return INCOMPATIBLE;
    2046             :             break;
    2047       24226 :         case CA(AHEAD, AHEAD):  /* collision, similar constraints */
    2048             :         case CA(BEHIND, BEHIND):
    2049       24226 :             if (con->co == a->co) /* true duplication */
    2050         840 :                 return SATISFIED;
    2051       23386 :             if (con->co == RAINBOW)
    2052             :             {
    2053             :                 /* con is satisfied unless arc's color is a pseudocolor */
    2054           4 :                 if (!(nfa->cm->cd[a->co].flags & PSEUDO))
    2055           4 :                     return SATISFIED;
    2056             :             }
    2057       23382 :             else if (a->co == RAINBOW)
    2058             :             {
    2059             :                 /* con is incompatible if it's for a pseudocolor */
    2060             :                 /* (this is hypothetical; we make no such constraints today) */
    2061           8 :                 if (nfa->cm->cd[con->co].flags & PSEUDO)
    2062           0 :                     return INCOMPATIBLE;
    2063             :                 /* otherwise, constraint constrains arc to be only its color */
    2064           8 :                 return REPLACEARC;
    2065             :             }
    2066       23374 :             return INCOMPATIBLE;
    2067             :             break;
    2068       13868 :         case CA('^', BEHIND):   /* collision, dissimilar constraints */
    2069             :         case CA(BEHIND, '^'):
    2070             :         case CA('$', AHEAD):
    2071             :         case CA(AHEAD, '$'):
    2072       13868 :             return INCOMPATIBLE;
    2073             :             break;
    2074      174882 :         case CA('^', '$'):      /* constraints passing each other */
    2075             :         case CA('^', AHEAD):
    2076             :         case CA(BEHIND, '$'):
    2077             :         case CA(BEHIND, AHEAD):
    2078             :         case CA('$', '^'):
    2079             :         case CA('$', BEHIND):
    2080             :         case CA(AHEAD, '^'):
    2081             :         case CA(AHEAD, BEHIND):
    2082             :         case CA('^', LACON):
    2083             :         case CA(BEHIND, LACON):
    2084             :         case CA('$', LACON):
    2085             :         case CA(AHEAD, LACON):
    2086      174882 :             return COMPATIBLE;
    2087             :             break;
    2088             :     }
    2089             :     assert(NOTREACHED);
    2090           0 :     return INCOMPATIBLE;        /* for benefit of blind compilers */
    2091             : }
    2092             : 
    2093             : /*
    2094             :  * fixempties - get rid of EMPTY arcs
    2095             :  */
    2096             : static void
    2097       13414 : fixempties(struct nfa *nfa,
    2098             :            FILE *f)             /* for debug output; NULL none */
    2099             : {
    2100             :     struct state *s;
    2101             :     struct state *s2;
    2102             :     struct state *nexts;
    2103             :     struct arc *a;
    2104             :     struct arc *nexta;
    2105             :     int         totalinarcs;
    2106             :     struct arc **inarcsorig;
    2107             :     struct arc **arcarray;
    2108             :     int         arccount;
    2109             :     int         prevnins;
    2110             :     int         nskip;
    2111             : 
    2112             :     /*
    2113             :      * First, get rid of any states whose sole out-arc is an EMPTY, since
    2114             :      * they're basically just aliases for their successor.  The parsing
    2115             :      * algorithm creates enough of these that it's worth special-casing this.
    2116             :      */
    2117      312790 :     for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    2118             :     {
    2119      299376 :         nexts = s->next;
    2120      299376 :         if (s->flag || s->nouts != 1)
    2121       84880 :             continue;
    2122      214496 :         a = s->outs;
    2123             :         assert(a != NULL && a->outchain == NULL);
    2124      214496 :         if (a->type != EMPTY)
    2125      119444 :             continue;
    2126       95052 :         if (s != a->to)
    2127       95052 :             moveins(nfa, s, a->to);
    2128       95052 :         dropstate(nfa, s);
    2129             :     }
    2130             : 
    2131             :     /*
    2132             :      * Similarly, get rid of any state with a single EMPTY in-arc, by folding
    2133             :      * it into its predecessor.
    2134             :      */
    2135      217738 :     for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    2136             :     {
    2137      204324 :         nexts = s->next;
    2138             :         /* while we're at it, ensure tmp fields are clear for next step */
    2139             :         assert(s->tmp == NULL);
    2140      204324 :         if (s->flag || s->nins != 1)
    2141       80110 :             continue;
    2142      124214 :         a = s->ins;
    2143             :         assert(a != NULL && a->inchain == NULL);
    2144      124214 :         if (a->type != EMPTY)
    2145      102738 :             continue;
    2146       21476 :         if (s != a->from)
    2147       21476 :             moveouts(nfa, s, a->from);
    2148       21476 :         dropstate(nfa, s);
    2149             :     }
    2150             : 
    2151       13414 :     if (NISERR())
    2152           0 :         return;
    2153             : 
    2154             :     /*
    2155             :      * For each remaining NFA state, find all other states from which it is
    2156             :      * reachable by a chain of one or more EMPTY arcs.  Then generate new arcs
    2157             :      * that eliminate the need for each such chain.
    2158             :      *
    2159             :      * We could replace a chain of EMPTY arcs that leads from a "from" state
    2160             :      * to a "to" state either by pushing non-EMPTY arcs forward (linking
    2161             :      * directly from "from"'s predecessors to "to") or by pulling them back
    2162             :      * (linking directly from "from" to "to"'s successors).  We choose to
    2163             :      * always do the former; this choice is somewhat arbitrary, but the
    2164             :      * approach below requires that we uniformly do one or the other.
    2165             :      *
    2166             :      * Suppose we have a chain of N successive EMPTY arcs (where N can easily
    2167             :      * approach the size of the NFA).  All of the intermediate states must
    2168             :      * have additional inarcs and outarcs, else they'd have been removed by
    2169             :      * the steps above.  Assuming their inarcs are mostly not empties, we will
    2170             :      * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
    2171             :      * state in the chain must be duplicated to lead to all its successor
    2172             :      * states as well.  So there is no hope of doing less than O(N^2) work;
    2173             :      * however, we should endeavor to keep the big-O cost from being even
    2174             :      * worse than that, which it can easily become without care.  In
    2175             :      * particular, suppose we were to copy all S1's inarcs forward to S2, and
    2176             :      * then also to S3, and then later we consider pushing S2's inarcs forward
    2177             :      * to S3.  If we include the arcs already copied from S1 in that, we'd be
    2178             :      * doing O(N^3) work.  (The duplicate-arc elimination built into newarc()
    2179             :      * and its cohorts would get rid of the extra arcs, but not without cost.)
    2180             :      *
    2181             :      * We can avoid this cost by treating only arcs that existed at the start
    2182             :      * of this phase as candidates to be pushed forward.  To identify those,
    2183             :      * we remember the first inarc each state had to start with.  We rely on
    2184             :      * the fact that newarc() and friends put new arcs on the front of their
    2185             :      * to-states' inchains, and that this phase never deletes arcs, so that
    2186             :      * the original arcs must be the last arcs in their to-states' inchains.
    2187             :      *
    2188             :      * So the process here is that, for each state in the NFA, we gather up
    2189             :      * all non-EMPTY inarcs of states that can reach the target state via
    2190             :      * EMPTY arcs.  We then sort, de-duplicate, and merge these arcs into the
    2191             :      * target state's inchain.  (We can safely use sort-merge for this as long
    2192             :      * as we update each state's original-arcs pointer after we add arcs to
    2193             :      * it; the sort step of mergeins probably changed the order of the old
    2194             :      * arcs.)
    2195             :      *
    2196             :      * Another refinement worth making is that, because we only add non-EMPTY
    2197             :      * arcs during this phase, and all added arcs have the same from-state as
    2198             :      * the non-EMPTY arc they were cloned from, we know ahead of time that any
    2199             :      * states having only EMPTY outarcs will be useless for lack of outarcs
    2200             :      * after we drop the EMPTY arcs.  (They cannot gain non-EMPTY outarcs if
    2201             :      * they had none to start with.)  So we need not bother to update the
    2202             :      * inchains of such states at all.
    2203             :      */
    2204             : 
    2205             :     /* Remember the states' first original inarcs */
    2206             :     /* ... and while at it, count how many old inarcs there are altogether */
    2207       13414 :     inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
    2208       13414 :     if (inarcsorig == NULL)
    2209             :     {
    2210           0 :         NERR(REG_ESPACE);
    2211           0 :         return;
    2212             :     }
    2213       13414 :     totalinarcs = 0;
    2214      196262 :     for (s = nfa->states; s != NULL; s = s->next)
    2215             :     {
    2216      182848 :         inarcsorig[s->no] = s->ins;
    2217      182848 :         totalinarcs += s->nins;
    2218             :     }
    2219             : 
    2220             :     /*
    2221             :      * Create a workspace for accumulating the inarcs to be added to the
    2222             :      * current target state.  totalinarcs is probably a considerable
    2223             :      * overestimate of the space needed, but the NFA is unlikely to be large
    2224             :      * enough at this point to make it worth being smarter.
    2225             :      */
    2226       13414 :     arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
    2227       13414 :     if (arcarray == NULL)
    2228             :     {
    2229           0 :         NERR(REG_ESPACE);
    2230           0 :         FREE(inarcsorig);
    2231           0 :         return;
    2232             :     }
    2233             : 
    2234             :     /* And iterate over the target states */
    2235      192954 :     for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
    2236             :     {
    2237             :         /* Ignore target states without non-EMPTY outarcs, per note above */
    2238      179540 :         if (!s->flag && !hasnonemptyout(s))
    2239        3224 :             continue;
    2240             : 
    2241             :         /* Find predecessor states and accumulate their original inarcs */
    2242      176316 :         arccount = 0;
    2243    10413410 :         for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
    2244             :         {
    2245             :             /* Add s2's original inarcs to arcarray[], but ignore empties */
    2246    30739980 :             for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
    2247             :             {
    2248    20502886 :                 if (a->type != EMPTY)
    2249    10268730 :                     arcarray[arccount++] = a;
    2250             :             }
    2251             : 
    2252             :             /* Reset the tmp fields as we walk back */
    2253    10237094 :             nexts = s2->tmp;
    2254    10237094 :             s2->tmp = NULL;
    2255             :         }
    2256      176316 :         s->tmp = NULL;
    2257             :         assert(arccount <= totalinarcs);
    2258             : 
    2259             :         /* Remember how many original inarcs this state has */
    2260      176316 :         prevnins = s->nins;
    2261             : 
    2262             :         /* Add non-duplicate inarcs to target state */
    2263      176316 :         mergeins(nfa, s, arcarray, arccount);
    2264             : 
    2265             :         /* Now we must update the state's inarcsorig pointer */
    2266      176316 :         nskip = s->nins - prevnins;
    2267      176316 :         a = s->ins;
    2268    10436422 :         while (nskip-- > 0)
    2269    10260106 :             a = a->inchain;
    2270      176316 :         inarcsorig[s->no] = a;
    2271             :     }
    2272             : 
    2273       13414 :     FREE(arcarray);
    2274       13414 :     FREE(inarcsorig);
    2275             : 
    2276       13414 :     if (NISERR())
    2277           4 :         return;
    2278             : 
    2279             :     /*
    2280             :      * Now remove all the EMPTY arcs, since we don't need them anymore.
    2281             :      */
    2282      184250 :     for (s = nfa->states; s != NULL; s = s->next)
    2283             :     {
    2284     1281352 :         for (a = s->outs; a != NULL; a = nexta)
    2285             :         {
    2286     1110512 :             nexta = a->outchain;
    2287     1110512 :             if (a->type == EMPTY)
    2288       21798 :                 freearc(nfa, a);
    2289             :         }
    2290             :     }
    2291             : 
    2292             :     /*
    2293             :      * And remove any states that have become useless.  (This cleanup is not
    2294             :      * very thorough, and would be even less so if we tried to combine it with
    2295             :      * the previous step; but cleanup() will take care of anything we miss.)
    2296             :      */
    2297      184250 :     for (s = nfa->states; s != NULL; s = nexts)
    2298             :     {
    2299      170840 :         nexts = s->next;
    2300      170840 :         if ((s->nins == 0 || s->nouts == 0) && !s->flag)
    2301        3224 :             dropstate(nfa, s);
    2302             :     }
    2303             : 
    2304       13410 :     if (f != NULL)
    2305           0 :         dumpnfa(nfa, f);
    2306             : }
    2307             : 
    2308             : /*
    2309             :  * emptyreachable - recursively find all states that can reach s by EMPTY arcs
    2310             :  *
    2311             :  * The return value is the last such state found.  Its tmp field links back
    2312             :  * to the next-to-last such state, and so on back to s, so that all these
    2313             :  * states can be located without searching the whole NFA.
    2314             :  *
    2315             :  * Since this is only used in fixempties(), we pass in the inarcsorig[] array
    2316             :  * maintained by that function.  This lets us skip over all new inarcs, which
    2317             :  * are certainly not EMPTY arcs.
    2318             :  *
    2319             :  * The maximum recursion depth here is equal to the length of the longest
    2320             :  * loop-free chain of EMPTY arcs, which is surely no more than the size of
    2321             :  * the NFA ... but that could still be enough to cause trouble.
    2322             :  */
    2323             : static struct state *
    2324    10413410 : emptyreachable(struct nfa *nfa,
    2325             :                struct state *s,
    2326             :                struct state *lastfound,
    2327             :                struct arc **inarcsorig)
    2328             : {
    2329             :     struct arc *a;
    2330             : 
    2331             :     /* Since this is recursive, it could be driven to stack overflow */
    2332    10413410 :     if (STACK_TOO_DEEP(nfa->v->re))
    2333             :     {
    2334           0 :         NERR(REG_ETOOBIG);
    2335           0 :         return lastfound;
    2336             :     }
    2337             : 
    2338    10413410 :     s->tmp = lastfound;
    2339    10413410 :     lastfound = s;
    2340    31235618 :     for (a = inarcsorig[s->no]; a != NULL; a = a->inchain)
    2341             :     {
    2342    20822208 :         if (a->type == EMPTY && a->from->tmp == NULL)
    2343    10237094 :             lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
    2344             :     }
    2345    10413410 :     return lastfound;
    2346             : }
    2347             : 
    2348             : /*
    2349             :  * isconstraintarc - detect whether an arc is of a constraint type
    2350             :  */
    2351             : static inline int
    2352     2288380 : isconstraintarc(struct arc *a)
    2353             : {
    2354     2288380 :     switch (a->type)
    2355             :     {
    2356      249096 :         case '^':
    2357             :         case '$':
    2358             :         case BEHIND:
    2359             :         case AHEAD:
    2360             :         case LACON:
    2361      249096 :             return 1;
    2362             :     }
    2363     2039284 :     return 0;
    2364             : }
    2365             : 
    2366             : /*
    2367             :  * hasconstraintout - does state have a constraint out arc?
    2368             :  */
    2369             : static int
    2370       21558 : hasconstraintout(struct state *s)
    2371             : {
    2372             :     struct arc *a;
    2373             : 
    2374       40156 :     for (a = s->outs; a != NULL; a = a->outchain)
    2375             :     {
    2376       32428 :         if (isconstraintarc(a))
    2377       13830 :             return 1;
    2378             :     }
    2379        7728 :     return 0;
    2380             : }
    2381             : 
    2382             : /*
    2383             :  * fixconstraintloops - get rid of loops containing only constraint arcs
    2384             :  *
    2385             :  * A loop of states that contains only constraint arcs is useless, since
    2386             :  * passing around the loop represents no forward progress.  Moreover, it
    2387             :  * would cause infinite looping in pullback/pushfwd, so we need to get rid
    2388             :  * of such loops before doing that.
    2389             :  */
    2390             : static void
    2391       13414 : fixconstraintloops(struct nfa *nfa,
    2392             :                    FILE *f)     /* for debug output; NULL none */
    2393             : {
    2394             :     struct state *s;
    2395             :     struct state *nexts;
    2396             :     struct arc *a;
    2397             :     struct arc *nexta;
    2398             :     int         hasconstraints;
    2399             : 
    2400             :     /*
    2401             :      * In the trivial case of a state that loops to itself, we can just drop
    2402             :      * the constraint arc altogether.  This is worth special-casing because
    2403             :      * such loops are far more common than loops containing multiple states.
    2404             :      * While we're at it, note whether any constraint arcs survive.
    2405             :      */
    2406       13414 :     hasconstraints = 0;
    2407      181030 :     for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    2408             :     {
    2409      167616 :         nexts = s->next;
    2410             :         /* while we're at it, ensure tmp fields are clear for next step */
    2411             :         assert(s->tmp == NULL);
    2412     1253112 :         for (a = s->outs; a != NULL && !NISERR(); a = nexta)
    2413             :         {
    2414     1085496 :             nexta = a->outchain;
    2415     1085496 :             if (isconstraintarc(a))
    2416             :             {
    2417       88114 :                 if (a->to == s)
    2418         296 :                     freearc(nfa, a);
    2419             :                 else
    2420       87818 :                     hasconstraints = 1;
    2421             :             }
    2422             :         }
    2423             :         /* If we removed all the outarcs, the state is useless. */
    2424      167616 :         if (s->nouts == 0 && !s->flag)
    2425           0 :             dropstate(nfa, s);
    2426             :     }
    2427             : 
    2428             :     /* Nothing to do if no remaining constraint arcs */
    2429       13414 :     if (NISERR() || !hasconstraints)
    2430           4 :         return;
    2431             : 
    2432             :     /*
    2433             :      * Starting from each remaining NFA state, search outwards for a
    2434             :      * constraint loop.  If we find a loop, break the loop, then start the
    2435             :      * search over.  (We could possibly retain some state from the first scan,
    2436             :      * but it would complicate things greatly, and multi-state constraint
    2437             :      * loops are rare enough that it's not worth optimizing the case.)
    2438             :      */
    2439       13410 : restart:
    2440      192550 :     for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
    2441             :     {
    2442      179140 :         if (findconstraintloop(nfa, s))
    2443         342 :             goto restart;
    2444             :     }
    2445             : 
    2446       13410 :     if (NISERR())
    2447           0 :         return;
    2448             : 
    2449             :     /*
    2450             :      * Now remove any states that have become useless.  (This cleanup is not
    2451             :      * very thorough, and would be even less so if we tried to combine it with
    2452             :      * the previous step; but cleanup() will take care of anything we miss.)
    2453             :      *
    2454             :      * Because findconstraintloop intentionally doesn't reset all tmp fields,
    2455             :      * we have to clear them after it's done.  This is a convenient place to
    2456             :      * do that, too.
    2457             :      */
    2458      182430 :     for (s = nfa->states; s != NULL; s = nexts)
    2459             :     {
    2460      169020 :         nexts = s->next;
    2461      169020 :         s->tmp = NULL;
    2462      169020 :         if ((s->nins == 0 || s->nouts == 0) && !s->flag)
    2463         364 :             dropstate(nfa, s);
    2464             :     }
    2465             : 
    2466       13410 :     if (f != NULL)
    2467           0 :         dumpnfa(nfa, f);
    2468             : }
    2469             : 
    2470             : /*
    2471             :  * findconstraintloop - recursively find a loop of constraint arcs
    2472             :  *
    2473             :  * If we find a loop, break it by calling breakconstraintloop(), then
    2474             :  * return 1; otherwise return 0.
    2475             :  *
    2476             :  * State tmp fields are guaranteed all NULL on a success return, because
    2477             :  * breakconstraintloop does that.  After a failure return, any state that
    2478             :  * is known not to be part of a loop is marked with s->tmp == s; this allows
    2479             :  * us not to have to re-prove that fact on later calls.  (This convention is
    2480             :  * workable because we already eliminated single-state loops.)
    2481             :  *
    2482             :  * Note that the found loop doesn't necessarily include the first state we
    2483             :  * are called on.  Any loop reachable from that state will do.
    2484             :  *
    2485             :  * The maximum recursion depth here is one more than the length of the longest
    2486             :  * loop-free chain of constraint arcs, which is surely no more than the size
    2487             :  * of the NFA ... but that could still be enough to cause trouble.
    2488             :  */
    2489             : static int
    2490      301536 : findconstraintloop(struct nfa *nfa, struct state *s)
    2491             : {
    2492             :     struct arc *a;
    2493             : 
    2494             :     /* Since this is recursive, it could be driven to stack overflow */
    2495      301536 :     if (STACK_TOO_DEEP(nfa->v->re))
    2496             :     {
    2497           0 :         NERR(REG_ETOOBIG);
    2498           0 :         return 1;               /* to exit as quickly as possible */
    2499             :     }
    2500             : 
    2501      301536 :     if (s->tmp != NULL)
    2502             :     {
    2503             :         /* Already proven uninteresting? */
    2504      118418 :         if (s->tmp == s)
    2505      118076 :             return 0;
    2506             :         /* Found a loop involving s */
    2507         342 :         breakconstraintloop(nfa, s);
    2508             :         /* The tmp fields have been cleaned up by breakconstraintloop */
    2509         342 :         return 1;
    2510             :     }
    2511     1326838 :     for (a = s->outs; a != NULL; a = a->outchain)
    2512             :     {
    2513     1144826 :         if (isconstraintarc(a))
    2514             :         {
    2515      122396 :             struct state *sto = a->to;
    2516             : 
    2517             :             assert(sto != s);
    2518      122396 :             s->tmp = sto;
    2519      122396 :             if (findconstraintloop(nfa, sto))
    2520        1106 :                 return 1;
    2521             :         }
    2522             :     }
    2523             : 
    2524             :     /*
    2525             :      * If we get here, no constraint loop exists leading out from s.  Mark it
    2526             :      * with s->tmp == s so we need not rediscover that fact again later.
    2527             :      */
    2528      182012 :     s->tmp = s;
    2529      182012 :     return 0;
    2530             : }
    2531             : 
    2532             : /*
    2533             :  * breakconstraintloop - break a loop of constraint arcs
    2534             :  *
    2535             :  * sinitial is any one member state of the loop.  Each loop member's tmp
    2536             :  * field links to its successor within the loop.  (Note that this function
    2537             :  * will reset all the tmp fields to NULL.)
    2538             :  *
    2539             :  * We can break the loop by, for any one state S1 in the loop, cloning its
    2540             :  * loop successor state S2 (and possibly following states), and then moving
    2541             :  * all S1->S2 constraint arcs to point to the cloned S2.  The cloned S2 should
    2542             :  * copy any non-constraint outarcs of S2.  Constraint outarcs should be
    2543             :  * dropped if they point back to S1, else they need to be copied as arcs to
    2544             :  * similarly cloned states S3, S4, etc.  In general, each cloned state copies
    2545             :  * non-constraint outarcs, drops constraint outarcs that would lead to itself
    2546             :  * or any earlier cloned state, and sends other constraint outarcs to newly
    2547             :  * cloned states.  No cloned state will have any inarcs that aren't constraint
    2548             :  * arcs or do not lead from S1 or earlier-cloned states.  It's okay to drop
    2549             :  * constraint back-arcs since they would not take us to any state we've not
    2550             :  * already been in; therefore, no new constraint loop is created.  In this way
    2551             :  * we generate a modified NFA that can still represent every useful state
    2552             :  * sequence, but not sequences that represent state loops with no consumption
    2553             :  * of input data.  Note that the set of cloned states will certainly include
    2554             :  * all of the loop member states other than S1, and it may also include
    2555             :  * non-loop states that are reachable from S2 via constraint arcs.  This is
    2556             :  * important because there is no guarantee that findconstraintloop found a
    2557             :  * maximal loop (and searching for one would be NP-hard, so don't try).
    2558             :  * Frequently the "non-loop states" are actually part of a larger loop that
    2559             :  * we didn't notice, and indeed there may be several overlapping loops.
    2560             :  * This technique ensures convergence in such cases, while considering only
    2561             :  * the originally-found loop does not.
    2562             :  *
    2563             :  * If there is only one S1->S2 constraint arc, then that constraint is
    2564             :  * certainly satisfied when we enter any of the clone states.  This means that
    2565             :  * in the common case where many of the constraint arcs are identically
    2566             :  * labeled, we can merge together clone states linked by a similarly-labeled
    2567             :  * constraint: if we can get to the first one we can certainly get to the
    2568             :  * second, so there's no need to distinguish.  This greatly reduces the number
    2569             :  * of new states needed, so we preferentially break the given loop at a state
    2570             :  * pair where this is true.
    2571             :  *
    2572             :  * Furthermore, it's fairly common to find that a cloned successor state has
    2573             :  * no outarcs, especially if we're a bit aggressive about removing unnecessary
    2574             :  * outarcs.  If that happens, then there is simply not any interesting state
    2575             :  * that can be reached through the predecessor's loop arcs, which means we can
    2576             :  * break the loop just by removing those loop arcs, with no new states added.
    2577             :  */
    2578             : static void
    2579         342 : breakconstraintloop(struct nfa *nfa, struct state *sinitial)
    2580             : {
    2581             :     struct state *s;
    2582             :     struct state *shead;
    2583             :     struct state *stail;
    2584             :     struct state *sclone;
    2585             :     struct state *nexts;
    2586             :     struct arc *refarc;
    2587             :     struct arc *a;
    2588             :     struct arc *nexta;
    2589             : 
    2590             :     /*
    2591             :      * Start by identifying which loop step we want to break at.
    2592             :      * Preferentially this is one with only one constraint arc.  (XXX are
    2593             :      * there any other secondary heuristics we want to use here?)  Set refarc
    2594             :      * to point to the selected lone constraint arc, if there is one.
    2595             :      */
    2596         342 :     refarc = NULL;
    2597         342 :     s = sinitial;
    2598             :     do
    2599             :     {
    2600         884 :         nexts = s->tmp;
    2601             :         assert(nexts != s);     /* should not see any one-element loops */
    2602         884 :         if (refarc == NULL)
    2603             :         {
    2604         556 :             int         narcs = 0;
    2605             : 
    2606        6708 :             for (a = s->outs; a != NULL; a = a->outchain)
    2607             :             {
    2608        6152 :                 if (a->to == nexts && isconstraintarc(a))
    2609             :                 {
    2610        2336 :                     refarc = a;
    2611        2336 :                     narcs++;
    2612             :                 }
    2613             :             }
    2614             :             assert(narcs > 0);
    2615         556 :             if (narcs > 1)
    2616         316 :                 refarc = NULL;  /* multiple constraint arcs here, no good */
    2617             :         }
    2618         884 :         s = nexts;
    2619         884 :     } while (s != sinitial);
    2620             : 
    2621         342 :     if (refarc)
    2622             :     {
    2623             :         /* break at the refarc */
    2624         240 :         shead = refarc->from;
    2625         240 :         stail = refarc->to;
    2626             :         assert(stail == shead->tmp);
    2627             :     }
    2628             :     else
    2629             :     {
    2630             :         /* for lack of a better idea, break after sinitial */
    2631         102 :         shead = sinitial;
    2632         102 :         stail = sinitial->tmp;
    2633             :     }
    2634             : 
    2635             :     /*
    2636             :      * Reset the tmp fields so that we can use them for local storage in
    2637             :      * clonesuccessorstates.  (findconstraintloop won't mind, since it's just
    2638             :      * going to abandon its search anyway.)
    2639             :      */
    2640       28308 :     for (s = nfa->states; s != NULL; s = s->next)
    2641       27966 :         s->tmp = NULL;
    2642             : 
    2643             :     /*
    2644             :      * Recursively build clone state(s) as needed.
    2645             :      */
    2646         342 :     sclone = newstate(nfa);
    2647         342 :     if (sclone == NULL)
    2648             :     {
    2649             :         assert(NISERR());
    2650           0 :         return;
    2651             :     }
    2652             : 
    2653         342 :     clonesuccessorstates(nfa, stail, sclone, shead, refarc,
    2654             :                          NULL, NULL, nfa->nstates);
    2655             : 
    2656         342 :     if (NISERR())
    2657           0 :         return;
    2658             : 
    2659             :     /*
    2660             :      * It's possible that sclone has no outarcs at all, in which case it's
    2661             :      * useless.  (We don't try extremely hard to get rid of useless states
    2662             :      * here, but this is an easy and fairly common case.)
    2663             :      */
    2664         342 :     if (sclone->nouts == 0)
    2665             :     {
    2666          80 :         freestate(nfa, sclone);
    2667          80 :         sclone = NULL;
    2668             :     }
    2669             : 
    2670             :     /*
    2671             :      * Move shead's constraint-loop arcs to point to sclone, or just drop them
    2672             :      * if we discovered we don't need sclone.
    2673             :      */
    2674        3900 :     for (a = shead->outs; a != NULL; a = nexta)
    2675             :     {
    2676        3558 :         nexta = a->outchain;
    2677        3558 :         if (a->to == stail && isconstraintarc(a))
    2678             :         {
    2679         862 :             if (sclone)
    2680         730 :                 cparc(nfa, a, shead, sclone);
    2681         862 :             freearc(nfa, a);
    2682         862 :             if (NISERR())
    2683           0 :                 break;
    2684             :         }
    2685             :     }
    2686             : }
    2687             : 
    2688             : /*
    2689             :  * clonesuccessorstates - create a tree of constraint-arc successor states
    2690             :  *
    2691             :  * ssource is the state to be cloned, and sclone is the state to copy its
    2692             :  * outarcs into.  sclone's inarcs, if any, should already be set up.
    2693             :  *
    2694             :  * spredecessor is the original predecessor state that we are trying to build
    2695             :  * successors for (it may not be the immediate predecessor of ssource).
    2696             :  * refarc, if not NULL, is the original constraint arc that is known to have
    2697             :  * been traversed out of spredecessor to reach the successor(s).
    2698             :  *
    2699             :  * For each cloned successor state, we transiently create a "donemap" that is
    2700             :  * a boolean array showing which source states we've already visited for this
    2701             :  * clone state.  This prevents infinite recursion as well as useless repeat
    2702             :  * visits to the same state subtree (which can add up fast, since typical NFAs
    2703             :  * have multiple redundant arc pathways).  Each donemap is a char array
    2704             :  * indexed by state number.  The donemaps are all of the same size "nstates",
    2705             :  * which is nfa->nstates as of the start of the recursion.  This is enough to
    2706             :  * have entries for all pre-existing states, but *not* entries for clone
    2707             :  * states created during the recursion.  That's okay since we have no need to
    2708             :  * mark those.
    2709             :  *
    2710             :  * curdonemap is NULL when recursing to a new sclone state, or sclone's
    2711             :  * donemap when we are recursing without having created a new state (which we
    2712             :  * do when we decide we can merge a successor state into the current clone
    2713             :  * state).  outerdonemap is NULL at the top level and otherwise the parent
    2714             :  * clone state's donemap.
    2715             :  *
    2716             :  * The successor states we create and fill here form a strict tree structure,
    2717             :  * with each state having exactly one predecessor, except that the toplevel
    2718             :  * state has no inarcs as yet (breakconstraintloop will add its inarcs from
    2719             :  * spredecessor after we're done).  Thus, we can examine sclone's inarcs back
    2720             :  * to the root, plus refarc if any, to identify the set of constraints already
    2721             :  * known valid at the current point.  This allows us to avoid generating extra
    2722             :  * successor states.
    2723             :  */
    2724             : static void
    2725        2966 : clonesuccessorstates(struct nfa *nfa,
    2726             :                      struct state *ssource,
    2727             :                      struct state *sclone,
    2728             :                      struct state *spredecessor,
    2729             :                      struct arc *refarc,
    2730             :                      char *curdonemap,
    2731             :                      char *outerdonemap,
    2732             :                      int nstates)
    2733             : {
    2734             :     char       *donemap;
    2735             :     struct arc *a;
    2736             : 
    2737             :     /* Since this is recursive, it could be driven to stack overflow */
    2738        2966 :     if (STACK_TOO_DEEP(nfa->v->re))
    2739             :     {
    2740           0 :         NERR(REG_ETOOBIG);
    2741           0 :         return;
    2742             :     }
    2743             : 
    2744             :     /* If this state hasn't already got a donemap, create one */
    2745        2966 :     donemap = curdonemap;
    2746        2966 :     if (donemap == NULL)
    2747             :     {
    2748        1484 :         donemap = (char *) MALLOC(nstates * sizeof(char));
    2749        1484 :         if (donemap == NULL)
    2750             :         {
    2751           0 :             NERR(REG_ESPACE);
    2752           0 :             return;
    2753             :         }
    2754             : 
    2755        1484 :         if (outerdonemap != NULL)
    2756             :         {
    2757             :             /*
    2758             :              * Not at outermost recursion level, so copy the outer level's
    2759             :              * donemap; this ensures that we see states in process of being
    2760             :              * visited at outer levels, or already merged into predecessor
    2761             :              * states, as ones we shouldn't traverse back to.
    2762             :              */
    2763        1142 :             memcpy(donemap, outerdonemap, nstates * sizeof(char));
    2764             :         }
    2765             :         else
    2766             :         {
    2767             :             /* At outermost level, only spredecessor is off-limits */
    2768         342 :             memset(donemap, 0, nstates * sizeof(char));
    2769             :             assert(spredecessor->no < nstates);
    2770         342 :             donemap[spredecessor->no] = 1;
    2771             :         }
    2772             :     }
    2773             : 
    2774             :     /* Mark ssource as visited in the donemap */
    2775             :     assert(ssource->no < nstates);
    2776             :     assert(donemap[ssource->no] == 0);
    2777        2966 :     donemap[ssource->no] = 1;
    2778             : 
    2779             :     /*
    2780             :      * We proceed by first cloning all of ssource's outarcs, creating new
    2781             :      * clone states as needed but not doing more with them than that.  Then in
    2782             :      * a second pass, recurse to process the child clone states.  This allows
    2783             :      * us to have only one child clone state per reachable source state, even
    2784             :      * when there are multiple outarcs leading to the same state.  Also, when
    2785             :      * we do visit a child state, its set of inarcs is known exactly, which
    2786             :      * makes it safe to apply the constraint-is-already-checked optimization.
    2787             :      * Also, this ensures that we've merged all the states we can into the
    2788             :      * current clone before we recurse to any children, thus possibly saving
    2789             :      * them from making extra images of those states.
    2790             :      *
    2791             :      * While this function runs, child clone states of the current state are
    2792             :      * marked by setting their tmp fields to point to the original state they
    2793             :      * were cloned from.  This makes it possible to detect multiple outarcs
    2794             :      * leading to the same state, and also makes it easy to distinguish clone
    2795             :      * states from original states (which will have tmp == NULL).
    2796             :      */
    2797       25398 :     for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
    2798             :     {
    2799       22432 :         struct state *sto = a->to;
    2800             : 
    2801             :         /*
    2802             :          * We do not consider cloning successor states that have no constraint
    2803             :          * outarcs; just link to them as-is.  They cannot be part of a
    2804             :          * constraint loop so there is no need to make copies.  In particular,
    2805             :          * this rule keeps us from trying to clone the post state, which would
    2806             :          * be a bad idea.
    2807             :          */
    2808       22432 :         if (isconstraintarc(a) && hasconstraintout(sto))
    2809        9752 :         {
    2810             :             struct state *prevclone;
    2811             :             int         canmerge;
    2812             :             struct arc *a2;
    2813             : 
    2814             :             /*
    2815             :              * Back-link constraint arcs must not be followed.  Nor is there a
    2816             :              * need to revisit states previously merged into this clone.
    2817             :              */
    2818             :             assert(sto->no < nstates);
    2819       13830 :             if (donemap[sto->no] != 0)
    2820        4078 :                 continue;
    2821             : 
    2822             :             /*
    2823             :              * Check whether we already have a child clone state for this
    2824             :              * source state.
    2825             :              */
    2826        9752 :             prevclone = NULL;
    2827       31734 :             for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
    2828             :             {
    2829       29110 :                 if (a2->to->tmp == sto)
    2830             :                 {
    2831        7128 :                     prevclone = a2->to;
    2832        7128 :                     break;
    2833             :                 }
    2834             :             }
    2835             : 
    2836             :             /*
    2837             :              * If this arc is labeled the same as refarc, or the same as any
    2838             :              * arc we must have traversed to get to sclone, then no additional
    2839             :              * constraints need to be met to get to sto, so we should just
    2840             :              * merge its outarcs into sclone.
    2841             :              */
    2842        9752 :             if (refarc && a->type == refarc->type && a->co == refarc->co)
    2843        1482 :                 canmerge = 1;
    2844             :             else
    2845             :             {
    2846             :                 struct state *s;
    2847             : 
    2848        8270 :                 canmerge = 0;
    2849       41252 :                 for (s = sclone; s->ins; s = s->ins->from)
    2850             :                 {
    2851       32982 :                     if (s->nins == 1 &&
    2852          32 :                         a->type == s->ins->type && a->co == s->ins->co)
    2853             :                     {
    2854           0 :                         canmerge = 1;
    2855           0 :                         break;
    2856             :                     }
    2857             :                 }
    2858             :             }
    2859             : 
    2860        9752 :             if (canmerge)
    2861             :             {
    2862             :                 /*
    2863             :                  * We can merge into sclone.  If we previously made a child
    2864             :                  * clone state, drop it; there's no need to visit it.  (This
    2865             :                  * can happen if ssource has multiple pathways to sto, and we
    2866             :                  * only just now found one that is provably a no-op.)
    2867             :                  */
    2868        1482 :                 if (prevclone)
    2869           0 :                     dropstate(nfa, prevclone);  /* kills our outarc, too */
    2870             : 
    2871             :                 /* Recurse to merge sto's outarcs into sclone */
    2872        1482 :                 clonesuccessorstates(nfa,
    2873             :                                      sto,
    2874             :                                      sclone,
    2875             :                                      spredecessor,
    2876             :                                      refarc,
    2877             :                                      donemap,
    2878             :                                      outerdonemap,
    2879             :                                      nstates);
    2880             :                 /* sto should now be marked as previously visited */
    2881             :                 assert(NISERR() || donemap[sto->no] == 1);
    2882             :             }
    2883        8270 :             else if (prevclone)
    2884             :             {
    2885             :                 /*
    2886             :                  * We already have a clone state for this successor, so just
    2887             :                  * make another arc to it.
    2888             :                  */
    2889        7128 :                 cparc(nfa, a, sclone, prevclone);
    2890             :             }
    2891             :             else
    2892             :             {
    2893             :                 /*
    2894             :                  * We need to create a new successor clone state.
    2895             :                  */
    2896             :                 struct state *stoclone;
    2897             : 
    2898        1142 :                 stoclone = newstate(nfa);
    2899        1142 :                 if (stoclone == NULL)
    2900             :                 {
    2901             :                     assert(NISERR());
    2902           0 :                     break;
    2903             :                 }
    2904             :                 /* Mark it as to what it's a clone of */
    2905        1142 :                 stoclone->tmp = sto;
    2906             :                 /* ... and add the outarc leading to it */
    2907        1142 :                 cparc(nfa, a, sclone, stoclone);
    2908             :             }
    2909             :         }
    2910             :         else
    2911             :         {
    2912             :             /*
    2913             :              * Non-constraint outarcs just get copied to sclone, as do outarcs
    2914             :              * leading to states with no constraint outarc.
    2915             :              */
    2916        8602 :             cparc(nfa, a, sclone, sto);
    2917             :         }
    2918             :     }
    2919             : 
    2920             :     /*
    2921             :      * If we are at outer level for this clone state, recurse to all its child
    2922             :      * clone states, clearing their tmp fields as we go.  (If we're not
    2923             :      * outermost for sclone, leave this to be done by the outer call level.)
    2924             :      * Note that if we have multiple outarcs leading to the same clone state,
    2925             :      * it will only be recursed-to once.
    2926             :      */
    2927        2966 :     if (curdonemap == NULL)
    2928             :     {
    2929       14040 :         for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
    2930             :         {
    2931       12556 :             struct state *stoclone = a->to;
    2932       12556 :             struct state *sto = stoclone->tmp;
    2933             : 
    2934       12556 :             if (sto != NULL)
    2935             :             {
    2936        1142 :                 stoclone->tmp = NULL;
    2937        1142 :                 clonesuccessorstates(nfa,
    2938             :                                      sto,
    2939             :                                      stoclone,
    2940             :                                      spredecessor,
    2941             :                                      refarc,
    2942             :                                      NULL,
    2943             :                                      donemap,
    2944             :                                      nstates);
    2945             :             }
    2946             :         }
    2947             : 
    2948             :         /* Don't forget to free sclone's donemap when done with it */
    2949        1484 :         FREE(donemap);
    2950             :     }
    2951             : }
    2952             : 
    2953             : /*
    2954             :  * cleanup - clean up NFA after optimizations
    2955             :  */
    2956             : static void
    2957       26828 : cleanup(struct nfa *nfa)
    2958             : {
    2959             :     struct state *s;
    2960             :     struct state *nexts;
    2961             :     int         n;
    2962             : 
    2963       26828 :     if (NISERR())
    2964           4 :         return;
    2965             : 
    2966             :     /* clear out unreachable or dead-end states */
    2967             :     /* use pre to mark reachable, then post to mark can-reach-post */
    2968       26824 :     markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
    2969       26824 :     markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
    2970      481356 :     for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    2971             :     {
    2972      454532 :         nexts = s->next;
    2973      454532 :         if (s->tmp != nfa->post && !s->flag)
    2974        4910 :             dropstate(nfa, s);
    2975             :     }
    2976             :     assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
    2977       26824 :     cleartraverse(nfa, nfa->pre);
    2978             :     assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
    2979             :     /* the nins==0 (final unreachable) case will be caught later */
    2980             : 
    2981             :     /* renumber surviving states */
    2982       26824 :     n = 0;
    2983      476446 :     for (s = nfa->states; s != NULL; s = s->next)
    2984      449622 :         s->no = n++;
    2985       26824 :     nfa->nstates = n;
    2986             : }
    2987             : 
    2988             : /*
    2989             :  * markreachable - recursive marking of reachable states
    2990             :  */
    2991             : static void
    2992     1503172 : markreachable(struct nfa *nfa,
    2993             :               struct state *s,
    2994             :               struct state *okay,   /* consider only states with this mark */
    2995             :               struct state *mark)   /* the value to mark with */
    2996             : {
    2997             :     struct arc *a;
    2998             : 
    2999             :     /* Since this is recursive, it could be driven to stack overflow */
    3000     1503172 :     if (STACK_TOO_DEEP(nfa->v->re))
    3001             :     {
    3002           0 :         NERR(REG_ETOOBIG);
    3003           0 :         return;
    3004             :     }
    3005             : 
    3006     1503172 :     if (s->tmp != okay)
    3007     1053560 :         return;
    3008      449612 :     s->tmp = mark;
    3009             : 
    3010     1925960 :     for (a = s->outs; a != NULL; a = a->outchain)
    3011     1476348 :         markreachable(nfa, a->to, okay, mark);
    3012             : }
    3013             : 
    3014             : /*
    3015             :  * markcanreach - recursive marking of states which can reach here
    3016             :  */
    3017             : static void
    3018     1503968 : markcanreach(struct nfa *nfa,
    3019             :              struct state *s,
    3020             :              struct state *okay,    /* consider only states with this mark */
    3021             :              struct state *mark)    /* the value to mark with */
    3022             : {
    3023             :     struct arc *a;
    3024             : 
    3025             :     /* Since this is recursive, it could be driven to stack overflow */
    3026     1503968 :     if (STACK_TOO_DEEP(nfa->v->re))
    3027             :     {
    3028           0 :         NERR(REG_ETOOBIG);
    3029           0 :         return;
    3030             :     }
    3031             : 
    3032     1503968 :     if (s->tmp != okay)
    3033     1054486 :         return;
    3034      449482 :     s->tmp = mark;
    3035             : 
    3036     1926626 :     for (a = s->ins; a != NULL; a = a->inchain)
    3037     1477144 :         markcanreach(nfa, a->from, okay, mark);
    3038             : }
    3039             : 
    3040             : /*
    3041             :  * analyze - ascertain potentially-useful facts about an optimized NFA
    3042             :  */
    3043             : static long                     /* re_info bits to be ORed in */
    3044       13414 : analyze(struct nfa *nfa)
    3045             : {
    3046             :     struct arc *a;
    3047             :     struct arc *aa;
    3048             : 
    3049       13414 :     if (NISERR())
    3050           4 :         return 0;
    3051             : 
    3052             :     /* Detect whether NFA can't match anything */
    3053       13410 :     if (nfa->pre->outs == NULL)
    3054          70 :         return REG_UIMPOSSIBLE;
    3055             : 
    3056             :     /* Detect whether NFA matches all strings (possibly with length bounds) */
    3057       13340 :     checkmatchall(nfa);
    3058             : 
    3059             :     /* Detect whether NFA can possibly match a zero-length string */
    3060       43026 :     for (a = nfa->pre->outs; a != NULL; a = a->outchain)
    3061     1523070 :         for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
    3062     1493384 :             if (aa->to == nfa->post)
    3063        2474 :                 return REG_UEMPTYMATCH;
    3064       10866 :     return 0;
    3065             : }
    3066             : 
    3067             : /*
    3068             :  * checkmatchall - does the NFA represent no more than a string length test?
    3069             :  *
    3070             :  * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
    3071             :  * to begin with) and set the MATCHALL bit in nfa->flags.
    3072             :  *
    3073             :  * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
    3074             :  * for pseudocolors (i.e., BOS/BOL/EOS/EOL).  We must be able to reach the
    3075             :  * post state via RAINBOW arcs, and if there are any loops in the graph, they
    3076             :  * must be loop-to-self arcs, ensuring that each loop iteration consumes
    3077             :  * exactly one character.  (Longer loops are problematic because they create
    3078             :  * non-consecutive possible match lengths; we have no good way to represent
    3079             :  * that situation for lengths beyond the DUPINF limit.)
    3080             :  *
    3081             :  * Pseudocolor arcs complicate things a little.  We know that they can only
    3082             :  * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
    3083             :  * EOS/EOL).  There, they must exactly replicate the parallel RAINBOW arcs,
    3084             :  * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
    3085             :  * and BOL outarcs to state 2, and no others.  Missing or extra pseudocolor
    3086             :  * arcs can occur, meaning that the NFA involves some constraint on the
    3087             :  * adjacent characters, which makes it not a matchall NFA.
    3088             :  */
    3089             : static void
    3090       13340 : checkmatchall(struct nfa *nfa)
    3091             : {
    3092             :     bool      **haspaths;
    3093             :     struct state *s;
    3094             :     int         i;
    3095             : 
    3096             :     /*
    3097             :      * If there are too many states, don't bother trying to detect matchall.
    3098             :      * This limit serves to bound the time and memory we could consume below.
    3099             :      * Note that even if the graph is all-RAINBOW, if there are significantly
    3100             :      * more than DUPINF states then it's likely that there are paths of length
    3101             :      * more than DUPINF, which would force us to fail anyhow.  In practice,
    3102             :      * plausible ways of writing a matchall regex with maximum finite path
    3103             :      * length K tend not to have very many more than K states.
    3104             :      */
    3105       13340 :     if (nfa->nstates > DUPINF * 2)
    3106          12 :         return;
    3107             : 
    3108             :     /*
    3109             :      * First, scan all the states to verify that only RAINBOW arcs appear,
    3110             :      * plus pseudocolor arcs adjacent to the pre and post states.  This lets
    3111             :      * us quickly eliminate most cases that aren't matchall NFAs.
    3112             :      */
    3113       49442 :     for (s = nfa->states; s != NULL; s = s->next)
    3114             :     {
    3115             :         struct arc *a;
    3116             : 
    3117      140876 :         for (a = s->outs; a != NULL; a = a->outchain)
    3118             :         {
    3119      104762 :             if (a->type != PLAIN)
    3120          74 :                 return;         /* any LACONs make it non-matchall */
    3121      104688 :             if (a->co != RAINBOW)
    3122             :             {
    3123       45444 :                 if (nfa->cm->cd[a->co].flags & PSEUDO)
    3124             :                 {
    3125             :                     /*
    3126             :                      * Pseudocolor arc: verify it's in a valid place (this
    3127             :                      * seems quite unlikely to fail, but let's be sure).
    3128             :                      */
    3129       33410 :                     if (s == nfa->pre &&
    3130       23832 :                         (a->co == nfa->bos[0] || a->co == nfa->bos[1]))
    3131             :                          /* okay BOS/BOL arc */ ;
    3132        9578 :                     else if (a->to == nfa->post &&
    3133        9578 :                              (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
    3134             :                          /* okay EOS/EOL arc */ ;
    3135             :                     else
    3136           0 :                         return; /* unexpected pseudocolor arc */
    3137             :                     /* We'll check these arcs some more below. */
    3138             :                 }
    3139             :                 else
    3140       12034 :                     return;     /* any other color makes it non-matchall */
    3141             :             }
    3142             :         }
    3143             :         /* Also, assert that the tmp fields are available for use. */
    3144             :         assert(s->tmp == NULL);
    3145             :     }
    3146             : 
    3147             :     /*
    3148             :      * The next cheapest check we can make is to verify that the BOS/BOL
    3149             :      * outarcs of the pre state reach the same states as its RAINBOW outarcs.
    3150             :      * If they don't, the NFA expresses some constraints on the character
    3151             :      * before the matched string, making it non-matchall.  Likewise, the
    3152             :      * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
    3153             :      */
    3154        1220 :     if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
    3155        1214 :         !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
    3156         942 :         !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
    3157         934 :         !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
    3158         438 :         return;
    3159             : 
    3160             :     /*
    3161             :      * Initialize an array of path-length arrays, in which
    3162             :      * checkmatchall_recurse will return per-state results.  This lets us
    3163             :      * memo-ize the recursive search and avoid exponential time consumption.
    3164             :      */
    3165         782 :     haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
    3166         782 :     if (haspaths == NULL)
    3167           0 :         return;                 /* fail quietly */
    3168         782 :     memset(haspaths, 0, nfa->nstates * sizeof(bool *));
    3169             : 
    3170             :     /*
    3171             :      * Recursively search the graph for all-RAINBOW paths to the "post" state,
    3172             :      * starting at the "pre" state, and computing the lengths of the paths.
    3173             :      * (Given the preceding checks, there should be at least one such path.
    3174             :      * However we could get back a false result anyway, in case there are
    3175             :      * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
    3176             :      * failures such as ENOMEM.)
    3177             :      */
    3178         782 :     if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
    3179             :     {
    3180             :         /* The useful result is the path length array for the pre state */
    3181         766 :         bool       *haspath = haspaths[nfa->pre->no];
    3182             :         int         minmatch,
    3183             :                     maxmatch,
    3184             :                     morematch;
    3185             : 
    3186             :         assert(haspath != NULL);
    3187             : 
    3188             :         /*
    3189             :          * haspath[] now represents the set of possible path lengths; but we
    3190             :          * want to reduce that to a min and max value, because it doesn't seem
    3191             :          * worth complicating regexec.c to deal with nonconsecutive possible
    3192             :          * match lengths.  Find min and max of first run of lengths, then
    3193             :          * verify there are no nonconsecutive lengths.
    3194             :          */
    3195        3476 :         for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
    3196             :         {
    3197        3476 :             if (haspath[minmatch])
    3198         766 :                 break;
    3199             :         }
    3200             :         assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
    3201       46598 :         for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
    3202             :         {
    3203       46422 :             if (!haspath[maxmatch + 1])
    3204         590 :                 break;
    3205             :         }
    3206      147054 :         for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
    3207             :         {
    3208      146296 :             if (haspath[morematch])
    3209             :             {
    3210           8 :                 haspath = NULL; /* fail, there are nonconsecutive lengths */
    3211           8 :                 break;
    3212             :             }
    3213             :         }
    3214             : 
    3215         766 :         if (haspath != NULL)
    3216             :         {
    3217             :             /*
    3218             :              * Success, so record the info.  Here we have a fine point: the
    3219             :              * path length from the pre state includes the pre-to-initial
    3220             :              * transition, so it's one more than the actually matched string
    3221             :              * length.  (We avoided counting the final-to-post transition
    3222             :              * within checkmatchall_recurse, but not this one.)  This is why
    3223             :              * checkmatchall_recurse allows one more level of path length than
    3224             :              * might seem necessary.  This decrement also takes care of
    3225             :              * converting checkmatchall_recurse's definition of "infinity" as
    3226             :              * "DUPINF+1" to our normal representation as "DUPINF".
    3227             :              */
    3228             :             assert(minmatch > 0);    /* else pre and post states were adjacent */
    3229         758 :             nfa->minmatchall = minmatch - 1;
    3230         758 :             nfa->maxmatchall = maxmatch - 1;
    3231         758 :             nfa->flags |= MATCHALL;
    3232             :         }
    3233             :     }
    3234             : 
    3235             :     /* Clean up */
    3236        8024 :     for (i = 0; i < nfa->nstates; i++)
    3237             :     {
    3238        7242 :         if (haspaths[i] != NULL)
    3239        6460 :             FREE(haspaths[i]);
    3240             :     }
    3241         782 :     FREE(haspaths);
    3242             : }
    3243             : 
    3244             : /*
    3245             :  * checkmatchall_recurse - recursive search for checkmatchall
    3246             :  *
    3247             :  * s is the state to be examined in this recursion level.
    3248             :  * haspaths[] is an array of per-state exit path length arrays.
    3249             :  *
    3250             :  * We return true if the search was performed successfully, false if
    3251             :  * we had to fail because of multi-state loops or other internal reasons.
    3252             :  * (Because "dead" states that can't reach the post state have been
    3253             :  * eliminated, and we already verified that only RAINBOW and matching
    3254             :  * pseudocolor arcs exist, every state should have RAINBOW path(s) to
    3255             :  * the post state.  Hence we take a false result from recursive calls
    3256             :  * as meaning that we'd better fail altogether, not just that that
    3257             :  * particular state can't reach the post state.)
    3258             :  *
    3259             :  * On success, we store a malloc'd result array in haspaths[s->no],
    3260             :  * showing the possible path lengths from s to the post state.
    3261             :  * Each state's haspath[] array is of length DUPINF+2.  The entries from
    3262             :  * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
    3263             :  * from this state to the string end.  haspath[DUPINF+1] is true if all
    3264             :  * path lengths >= DUPINF+1 are possible.  (Situations that cannot be
    3265             :  * represented under these rules cause failure.)
    3266             :  *
    3267             :  * checkmatchall is responsible for eventually freeing the haspath[] arrays.
    3268             :  */
    3269             : static bool
    3270        6460 : checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
    3271             : {
    3272        6460 :     bool        result = false;
    3273        6460 :     bool        foundloop = false;
    3274             :     bool       *haspath;
    3275             :     struct arc *a;
    3276             : 
    3277             :     /*
    3278             :      * Since this is recursive, it could be driven to stack overflow.  But we
    3279             :      * need not treat that as a hard failure; just deem the NFA non-matchall.
    3280             :      */
    3281        6460 :     if (STACK_TOO_DEEP(nfa->v->re))
    3282           0 :         return false;
    3283             : 
    3284             :     /* In case the search takes a long time, check for cancel */
    3285        6460 :     if (CANCEL_REQUESTED(nfa->v->re))
    3286             :     {
    3287           0 :         NERR(REG_CANCEL);
    3288           0 :         return false;
    3289             :     }
    3290             : 
    3291             :     /* Create a haspath array for this state */
    3292        6460 :     haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
    3293        6460 :     if (haspath == NULL)
    3294           0 :         return false;           /* again, treat as non-matchall */
    3295        6460 :     memset(haspath, 0, (DUPINF + 2) * sizeof(bool));
    3296             : 
    3297             :     /* Mark this state as being visited */
    3298             :     assert(s->tmp == NULL);
    3299        6460 :     s->tmp = s;
    3300             : 
    3301       57502 :     for (a = s->outs; a != NULL; a = a->outchain)
    3302             :     {
    3303       51106 :         if (a->co != RAINBOW)
    3304        4800 :             continue;           /* ignore pseudocolor arcs */
    3305       46306 :         if (a->to == nfa->post)
    3306             :         {
    3307             :             /* We found an all-RAINBOW path to the post state */
    3308         774 :             result = true;
    3309             : 
    3310             :             /*
    3311             :              * Mark this state as being zero steps away from the string end
    3312             :              * (the transition to the post state isn't counted).
    3313             :              */
    3314         774 :             haspath[0] = true;
    3315             :         }
    3316       45532 :         else if (a->to == s)
    3317             :         {
    3318             :             /* We found a cycle of length 1, which we'll deal with below. */
    3319         182 :             foundloop = true;
    3320             :         }
    3321       45350 :         else if (a->to->tmp != NULL)
    3322             :         {
    3323             :             /* It's busy, so we found a cycle of length > 1, so fail. */
    3324           8 :             result = false;
    3325           8 :             break;
    3326             :         }
    3327             :         else
    3328             :         {
    3329             :             /* Consider paths forward through this to-state. */
    3330             :             bool       *nexthaspath;
    3331             :             int         i;
    3332             : 
    3333             :             /* If to-state was not already visited, recurse */
    3334       45342 :             if (haspaths[a->to->no] == NULL)
    3335             :             {
    3336        5678 :                 result = checkmatchall_recurse(nfa, a->to, haspaths);
    3337             :                 /* Fail if any recursive path fails */
    3338        5678 :                 if (!result)
    3339          48 :                     break;
    3340             :             }
    3341             :             else
    3342             :             {
    3343             :                 /* The previous visit must have found path(s) to the end */
    3344       39664 :                 result = true;
    3345             :             }
    3346             :             assert(a->to->tmp == NULL);
    3347       45294 :             nexthaspath = haspaths[a->to->no];
    3348             :             assert(nexthaspath != NULL);
    3349             : 
    3350             :             /*
    3351             :              * Now, for every path of length i from a->to to the string end,
    3352             :              * there is a path of length i + 1 from s to the string end.
    3353             :              */
    3354       45294 :             if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
    3355             :             {
    3356             :                 /*
    3357             :                  * a->to has a path of length exactly DUPINF, but not longer;
    3358             :                  * or it has paths of all lengths > DUPINF but not one of
    3359             :                  * exactly that length.  In either case, we cannot represent
    3360             :                  * the possible path lengths from s correctly, so fail.
    3361             :                  */
    3362           8 :                 result = false;
    3363           8 :                 break;
    3364             :             }
    3365             :             /* Merge knowledge of these path lengths into what we have */
    3366    11638502 :             for (i = 0; i < DUPINF; i++)
    3367    11593216 :                 haspath[i + 1] |= nexthaspath[i];
    3368             :             /* Infinity + 1 is still infinity */
    3369       45286 :             haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
    3370             :         }
    3371             :     }
    3372             : 
    3373        6460 :     if (result && foundloop)
    3374             :     {
    3375             :         /*
    3376             :          * If there is a length-1 loop at this state, then find the shortest
    3377             :          * known path length to the end.  The loop means that every larger
    3378             :          * path length is possible, too.  (It doesn't matter whether any of
    3379             :          * the longer lengths were already known possible.)
    3380             :          */
    3381             :         int         i;
    3382             : 
    3383         236 :         for (i = 0; i <= DUPINF; i++)
    3384             :         {
    3385         236 :             if (haspath[i])
    3386         182 :                 break;
    3387             :         }
    3388       46902 :         for (i++; i <= DUPINF + 1; i++)
    3389       46720 :             haspath[i] = true;
    3390             :     }
    3391             : 
    3392             :     /* Report out the completed path length map */
    3393             :     assert(s->no < nfa->nstates);
    3394             :     assert(haspaths[s->no] == NULL);
    3395        6460 :     haspaths[s->no] = haspath;
    3396             : 
    3397             :     /* Mark state no longer busy */
    3398        6460 :     s->tmp = NULL;
    3399             : 
    3400        6460 :     return result;
    3401             : }
    3402             : 
    3403             : /*
    3404             :  * check_out_colors_match - subroutine for checkmatchall
    3405             :  *
    3406             :  * Check whether the set of states reachable from s by arcs of color co1
    3407             :  * is equivalent to the set reachable by arcs of color co2.
    3408             :  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
    3409             :  * so we need not examine arc types here.
    3410             :  */
    3411             : static bool
    3412        2434 : check_out_colors_match(struct state *s, color co1, color co2)
    3413             : {
    3414        2434 :     bool        result = true;
    3415             :     struct arc *a;
    3416             : 
    3417             :     /*
    3418             :      * To do this in linear time, we assume that the NFA contains no duplicate
    3419             :      * arcs.  Run through the out-arcs, marking states reachable by arcs of
    3420             :      * color co1.  Run through again, un-marking states reachable by arcs of
    3421             :      * color co2; if we see a not-marked state, we know this co2 arc is
    3422             :      * unmatched.  Then run through again, checking for still-marked states,
    3423             :      * and in any case leaving all the tmp fields reset to NULL.
    3424             :      */
    3425       14094 :     for (a = s->outs; a != NULL; a = a->outchain)
    3426             :     {
    3427       11660 :         if (a->co == co1)
    3428             :         {
    3429             :             assert(a->to->tmp == NULL);
    3430        3684 :             a->to->tmp = a->to;
    3431             :         }
    3432             :     }
    3433       14094 :     for (a = s->outs; a != NULL; a = a->outchain)
    3434             :     {
    3435       11660 :         if (a->co == co2)
    3436             :         {
    3437        3988 :             if (a->to->tmp != NULL)
    3438        3680 :                 a->to->tmp = NULL;
    3439             :             else
    3440         308 :                 result = false; /* unmatched co2 arc */
    3441             :         }
    3442             :     }
    3443       14094 :     for (a = s->outs; a != NULL; a = a->outchain)
    3444             :     {
    3445       11660 :         if (a->co == co1)
    3446             :         {
    3447        3684 :             if (a->to->tmp != NULL)
    3448             :             {
    3449           4 :                 result = false; /* unmatched co1 arc */
    3450           4 :                 a->to->tmp = NULL;
    3451             :             }
    3452             :         }
    3453             :     }
    3454        2434 :     return result;
    3455             : }
    3456             : 
    3457             : /*
    3458             :  * check_in_colors_match - subroutine for checkmatchall
    3459             :  *
    3460             :  * Check whether the set of states that can reach s by arcs of color co1
    3461             :  * is equivalent to the set that can reach s by arcs of color co2.
    3462             :  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
    3463             :  * so we need not examine arc types here.
    3464             :  */
    3465             : static bool
    3466        1876 : check_in_colors_match(struct state *s, color co1, color co2)
    3467             : {
    3468        1876 :     bool        result = true;
    3469             :     struct arc *a;
    3470             : 
    3471             :     /*
    3472             :      * Identical algorithm to check_out_colors_match, except examine the
    3473             :      * from-states of s' inarcs.
    3474             :      */
    3475        7004 :     for (a = s->ins; a != NULL; a = a->inchain)
    3476             :     {
    3477        5128 :         if (a->co == co1)
    3478             :         {
    3479             :             assert(a->from->tmp == NULL);
    3480        1596 :             a->from->tmp = a->from;
    3481             :         }
    3482             :     }
    3483        7004 :     for (a = s->ins; a != NULL; a = a->inchain)
    3484             :     {
    3485        5128 :         if (a->co == co2)
    3486             :         {
    3487        1766 :             if (a->from->tmp != NULL)
    3488        1592 :                 a->from->tmp = NULL;
    3489             :             else
    3490         174 :                 result = false; /* unmatched co2 arc */
    3491             :         }
    3492             :     }
    3493        7004 :     for (a = s->ins; a != NULL; a = a->inchain)
    3494             :     {
    3495        5128 :         if (a->co == co1)
    3496             :         {
    3497        1596 :             if (a->from->tmp != NULL)
    3498             :             {
    3499           4 :                 result = false; /* unmatched co1 arc */
    3500           4 :                 a->from->tmp = NULL;
    3501             :             }
    3502             :         }
    3503             :     }
    3504        1876 :     return result;
    3505             : }
    3506             : 
    3507             : /*
    3508             :  * compact - construct the compact representation of an NFA
    3509             :  */
    3510             : static void
    3511       13410 : compact(struct nfa *nfa,
    3512             :         struct cnfa *cnfa)
    3513             : {
    3514             :     struct state *s;
    3515             :     struct arc *a;
    3516             :     size_t      nstates;
    3517             :     size_t      narcs;
    3518             :     struct carc *ca;
    3519             :     struct carc *first;
    3520             : 
    3521             :     assert(!NISERR());
    3522             : 
    3523       13410 :     nstates = 0;
    3524       13410 :     narcs = 0;
    3525      166494 :     for (s = nfa->states; s != NULL; s = s->next)
    3526             :     {
    3527      153084 :         nstates++;
    3528      153084 :         narcs += s->nouts + 1;   /* need one extra for endmarker */
    3529             :     }
    3530             : 
    3531       13410 :     cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
    3532       13410 :     cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
    3533       13410 :     cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
    3534       13410 :     if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
    3535             :     {
    3536           0 :         if (cnfa->stflags != NULL)
    3537           0 :             FREE(cnfa->stflags);
    3538           0 :         if (cnfa->states != NULL)
    3539           0 :             FREE(cnfa->states);
    3540           0 :         if (cnfa->arcs != NULL)
    3541           0 :             FREE(cnfa->arcs);
    3542           0 :         NERR(REG_ESPACE);
    3543           0 :         return;
    3544             :     }
    3545       13410 :     cnfa->nstates = nstates;
    3546       13410 :     cnfa->pre = nfa->pre->no;
    3547       13410 :     cnfa->post = nfa->post->no;
    3548       13410 :     cnfa->bos[0] = nfa->bos[0];
    3549       13410 :     cnfa->bos[1] = nfa->bos[1];
    3550       13410 :     cnfa->eos[0] = nfa->eos[0];
    3551       13410 :     cnfa->eos[1] = nfa->eos[1];
    3552       13410 :     cnfa->ncolors = maxcolor(nfa->cm) + 1;
    3553       13410 :     cnfa->flags = nfa->flags;
    3554       13410 :     cnfa->minmatchall = nfa->minmatchall;
    3555       13410 :     cnfa->maxmatchall = nfa->maxmatchall;
    3556             : 
    3557       13410 :     ca = cnfa->arcs;
    3558      166494 :     for (s = nfa->states; s != NULL; s = s->next)
    3559             :     {
    3560             :         assert((size_t) s->no < nstates);
    3561      153084 :         cnfa->stflags[s->no] = 0;
    3562      153084 :         cnfa->states[s->no] = ca;
    3563      153084 :         first = ca;
    3564     1574962 :         for (a = s->outs; a != NULL; a = a->outchain)
    3565     1421878 :             switch (a->type)
    3566             :             {
    3567     1421708 :                 case PLAIN:
    3568     1421708 :                     ca->co = a->co;
    3569     1421708 :                     ca->to = a->to->no;
    3570     1421708 :                     ca++;
    3571     1421708 :                     break;
    3572         170 :                 case LACON:
    3573             :                     assert(s->no != cnfa->pre);
    3574             :                     assert(a->co >= 0);
    3575         170 :                     ca->co = (color) (cnfa->ncolors + a->co);
    3576         170 :                     ca->to = a->to->no;
    3577         170 :                     ca++;
    3578         170 :                     cnfa->flags |= HASLACONS;
    3579         170 :                     break;
    3580           0 :                 default:
    3581           0 :                     NERR(REG_ASSERT);
    3582           0 :                     return;
    3583             :             }
    3584      153084 :         carcsort(first, ca - first);
    3585      153084 :         ca->co = COLORLESS;
    3586      153084 :         ca->to = 0;
    3587      153084 :         ca++;
    3588             :     }
    3589             :     assert(ca == &cnfa->arcs[narcs]);
    3590             :     assert(cnfa->nstates != 0);
    3591             : 
    3592             :     /* mark no-progress states */
    3593       62324 :     for (a = nfa->pre->outs; a != NULL; a = a->outchain)
    3594       48914 :         cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
    3595       13410 :     cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
    3596             : }
    3597             : 
    3598             : /*
    3599             :  * carcsort - sort compacted-NFA arcs by color
    3600             :  */
    3601             : static void
    3602      153084 : carcsort(struct carc *first, size_t n)
    3603             : {
    3604      153084 :     if (n > 1)
    3605       39774 :         qsort(first, n, sizeof(struct carc), carc_cmp);
    3606      153084 : }
    3607             : 
    3608             : static int
    3609    15116276 : carc_cmp(const void *a, const void *b)
    3610             : {
    3611    15116276 :     const struct carc *aa = (const struct carc *) a;
    3612    15116276 :     const struct carc *bb = (const struct carc *) b;
    3613             : 
    3614    15116276 :     if (aa->co < bb->co)
    3615      137270 :         return -1;
    3616    14979006 :     if (aa->co > bb->co)
    3617      187214 :         return +1;
    3618    14791792 :     if (aa->to < bb->to)
    3619    10145848 :         return -1;
    3620     4645944 :     if (aa->to > bb->to)
    3621     4645944 :         return +1;
    3622             :     /* This is unreached, since there should be no duplicate arcs now: */
    3623           0 :     return 0;
    3624             : }
    3625             : 
    3626             : /*
    3627             :  * freecnfa - free a compacted NFA
    3628             :  */
    3629             : static void
    3630        5458 : freecnfa(struct cnfa *cnfa)
    3631             : {
    3632             :     assert(!NULLCNFA(*cnfa));   /* not empty already */
    3633        5458 :     FREE(cnfa->stflags);
    3634        5458 :     FREE(cnfa->states);
    3635        5458 :     FREE(cnfa->arcs);
    3636        5458 :     ZAPCNFA(*cnfa);
    3637        5458 : }
    3638             : 
    3639             : /*
    3640             :  * dumpnfa - dump an NFA in human-readable form
    3641             :  */
    3642             : static void
    3643           0 : dumpnfa(struct nfa *nfa,
    3644             :         FILE *f)
    3645             : {
    3646             : #ifdef REG_DEBUG
    3647             :     struct state *s;
    3648             :     int         nstates = 0;
    3649             :     int         narcs = 0;
    3650             : 
    3651             :     fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
    3652             :     if (nfa->bos[0] != COLORLESS)
    3653             :         fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
    3654             :     if (nfa->bos[1] != COLORLESS)
    3655             :         fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
    3656             :     if (nfa->eos[0] != COLORLESS)
    3657             :         fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
    3658             :     if (nfa->eos[1] != COLORLESS)
    3659             :         fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
    3660             :     if (nfa->flags & HASLACONS)
    3661             :         fprintf(f, ", haslacons");
    3662             :     if (nfa->flags & MATCHALL)
    3663             :     {
    3664             :         fprintf(f, ", minmatchall %d", nfa->minmatchall);
    3665             :         if (nfa->maxmatchall == DUPINF)
    3666             :             fprintf(f, ", maxmatchall inf");
    3667             :         else
    3668             :             fprintf(f, ", maxmatchall %d", nfa->maxmatchall);
    3669             :     }
    3670             :     fprintf(f, "\n");
    3671             :     for (s = nfa->states; s != NULL; s = s->next)
    3672             :     {
    3673             :         dumpstate(s, f);
    3674             :         nstates++;
    3675             :         narcs += s->nouts;
    3676             :     }
    3677             :     fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
    3678             :     if (nfa->parent == NULL)
    3679             :         dumpcolors(nfa->cm, f);
    3680             :     fflush(f);
    3681             : #endif
    3682           0 : }
    3683             : 
    3684             : #ifdef REG_DEBUG                /* subordinates of dumpnfa */
    3685             : 
    3686             : /*
    3687             :  * dumpstate - dump an NFA state in human-readable form
    3688             :  */
    3689             : static void
    3690             : dumpstate(struct state *s,
    3691             :           FILE *f)
    3692             : {
    3693             :     struct arc *a;
    3694             : 
    3695             :     fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
    3696             :             (s->flag) ? s->flag : '.');
    3697             :     if (s->prev != NULL && s->prev->next != s)
    3698             :         fprintf(f, "\tstate chain bad\n");
    3699             :     if (s->nouts == 0)
    3700             :         fprintf(f, "\tno out arcs\n");
    3701             :     else
    3702             :         dumparcs(s, f);
    3703             :     for (a = s->ins; a != NULL; a = a->inchain)
    3704             :     {
    3705             :         if (a->to != s)
    3706             :             fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
    3707             :                     a->from->no, a->to->no, s->no);
    3708             :     }
    3709             :     fflush(f);
    3710             : }
    3711             : 
    3712             : /*
    3713             :  * dumparcs - dump out-arcs in human-readable form
    3714             :  */
    3715             : static void
    3716             : dumparcs(struct state *s,
    3717             :          FILE *f)
    3718             : {
    3719             :     int         pos;
    3720             :     struct arc *a;
    3721             : 
    3722             :     /* printing oldest arcs first is usually clearer */
    3723             :     a = s->outs;
    3724             :     assert(a != NULL);
    3725             :     while (a->outchain != NULL)
    3726             :         a = a->outchain;
    3727             :     pos = 1;
    3728             :     do
    3729             :     {
    3730             :         dumparc(a, s, f);
    3731             :         if (pos == 5)
    3732             :         {
    3733             :             fprintf(f, "\n");
    3734             :             pos = 1;
    3735             :         }
    3736             :         else
    3737             :             pos++;
    3738             :         a = a->outchainRev;
    3739             :     } while (a != NULL);
    3740             :     if (pos != 1)
    3741             :         fprintf(f, "\n");
    3742             : }
    3743             : 
    3744             : /*
    3745             :  * dumparc - dump one outarc in readable form, including prefixing tab
    3746             :  */
    3747             : static void
    3748             : dumparc(struct arc *a,
    3749             :         struct state *s,
    3750             :         FILE *f)
    3751             : {
    3752             :     struct arc *aa;
    3753             : 
    3754             :     fprintf(f, "\t");
    3755             :     switch (a->type)
    3756             :     {
    3757             :         case PLAIN:
    3758             :             if (a->co == RAINBOW)
    3759             :                 fprintf(f, "[*]");
    3760             :             else
    3761             :                 fprintf(f, "[%ld]", (long) a->co);
    3762             :             break;
    3763             :         case AHEAD:
    3764             :             if (a->co == RAINBOW)
    3765             :                 fprintf(f, ">*>");
    3766             :             else
    3767             :                 fprintf(f, ">%ld>", (long) a->co);
    3768             :             break;
    3769             :         case BEHIND:
    3770             :             if (a->co == RAINBOW)
    3771             :                 fprintf(f, "<*<");
    3772             :             else
    3773             :                 fprintf(f, "<%ld<", (long) a->co);
    3774             :             break;
    3775             :         case LACON:
    3776             :             fprintf(f, ":%ld:", (long) a->co);
    3777             :             break;
    3778             :         case '^':
    3779             :         case '$':
    3780             :             fprintf(f, "%c%d", a->type, (int) a->co);
    3781             :             break;
    3782             :         case EMPTY:
    3783             :             break;
    3784             :         default:
    3785             :             fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
    3786             :             break;
    3787             :     }
    3788             :     if (a->from != s)
    3789             :         fprintf(f, "?%d?", a->from->no);
    3790             :     for (aa = a->from->outs; aa != NULL; aa = aa->outchain)
    3791             :         if (aa == a)
    3792             :             break;              /* NOTE BREAK OUT */
    3793             :     if (aa == NULL)
    3794             :         fprintf(f, "?!?");        /* missing from out-chain */
    3795             :     fprintf(f, "->");
    3796             :     if (a->to == NULL)
    3797             :     {
    3798             :         fprintf(f, "NULL");
    3799             :         return;
    3800             :     }
    3801             :     fprintf(f, "%d", a->to->no);
    3802             :     for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
    3803             :         if (aa == a)
    3804             :             break;              /* NOTE BREAK OUT */
    3805             :     if (aa == NULL)
    3806             :         fprintf(f, "?!?");        /* missing from in-chain */
    3807             : }
    3808             : #endif                          /* REG_DEBUG */
    3809             : 
    3810             : /*
    3811             :  * dumpcnfa - dump a compacted NFA in human-readable form
    3812             :  */
    3813             : #ifdef REG_DEBUG
    3814             : static void
    3815             : dumpcnfa(struct cnfa *cnfa,
    3816             :          FILE *f)
    3817             : {
    3818             :     int         st;
    3819             : 
    3820             :     fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
    3821             :     if (cnfa->bos[0] != COLORLESS)
    3822             :         fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
    3823             :     if (cnfa->bos[1] != COLORLESS)
    3824             :         fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
    3825             :     if (cnfa->eos[0] != COLORLESS)
    3826             :         fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
    3827             :     if (cnfa->eos[1] != COLORLESS)
    3828             :         fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
    3829             :     if (cnfa->flags & HASLACONS)
    3830             :         fprintf(f, ", haslacons");
    3831             :     if (cnfa->flags & MATCHALL)
    3832             :     {
    3833             :         fprintf(f, ", minmatchall %d", cnfa->minmatchall);
    3834             :         if (cnfa->maxmatchall == DUPINF)
    3835             :             fprintf(f, ", maxmatchall inf");
    3836             :         else
    3837             :             fprintf(f, ", maxmatchall %d", cnfa->maxmatchall);
    3838             :     }
    3839             :     fprintf(f, "\n");
    3840             :     for (st = 0; st < cnfa->nstates; st++)
    3841             :         dumpcstate(st, cnfa, f);
    3842             :     fflush(f);
    3843             : }
    3844             : #endif
    3845             : 
    3846             : #ifdef REG_DEBUG                /* subordinates of dumpcnfa */
    3847             : 
    3848             : /*
    3849             :  * dumpcstate - dump a compacted-NFA state in human-readable form
    3850             :  */
    3851             : static void
    3852             : dumpcstate(int st,
    3853             :            struct cnfa *cnfa,
    3854             :            FILE *f)
    3855             : {
    3856             :     struct carc *ca;
    3857             :     int         pos;
    3858             : 
    3859             :     fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
    3860             :     pos = 1;
    3861             :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
    3862             :     {
    3863             :         if (ca->co == RAINBOW)
    3864             :             fprintf(f, "\t[*]->%d", ca->to);
    3865             :         else if (ca->co < cnfa->ncolors)
    3866             :             fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
    3867             :         else
    3868             :             fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
    3869             :         if (pos == 5)
    3870             :         {
    3871             :             fprintf(f, "\n");
    3872             :             pos = 1;
    3873             :         }
    3874             :         else
    3875             :             pos++;
    3876             :     }
    3877             :     if (ca == cnfa->states[st] || pos != 1)
    3878             :         fprintf(f, "\n");
    3879             :     fflush(f);
    3880             : }
    3881             : 
    3882             : #endif                          /* REG_DEBUG */

Generated by: LCOV version 1.14