LCOV - PostgreSQL 19devel - src/backend/regex/regc

LCOV - code coverage report

Current view:	top level - src/backend/regex - regc_nfa.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL 19devel	Lines:	1129	1236	91.3 %
Date:	2025-07-29 03:18:01	Functions:	61	62	98.4 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*
       2             :  * NFA utilities.
       3             :  * This file is #included by regcomp.c.
       4             :  *
       5             :  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
       6             :  *
       7             :  * Development of this software was funded, in part, by Cray Research Inc.,
       8             :  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
       9             :  * Corporation, none of whom are responsible for the results.  The author
      10             :  * thanks all of them.
      11             :  *
      12             :  * Redistribution and use in source and binary forms -- with or without
      13             :  * modification -- are permitted for any purpose, provided that
      14             :  * redistributions in source form retain this entire copyright notice and
      15             :  * indicate the origin and nature of any modifications.
      16             :  *
      17             :  * I'd appreciate being given credit for this package in the documentation
      18             :  * of software which uses it, but that is not a requirement.
      19             :  *
      20             :  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
      21             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      22             :  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
      23             :  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      24             :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      25             :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
      26             :  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
      27             :  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
      28             :  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
      29             :  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      30             :  *
      31             :  * src/backend/regex/regc_nfa.c
      32             :  *
      33             :  *
      34             :  * One or two things that technically ought to be in here
      35             :  * are actually in color.c, thanks to some incestuous relationships in
      36             :  * the color chains.
      37             :  */
      38             : 
      39             : #define NISERR()    VISERR(nfa->v)
      40             : #define NERR(e)     VERR(nfa->v, (e))
      41             : 
      42             : 
      43             : /*
      44             :  * newnfa - set up an NFA
      45             :  */
      46             : static struct nfa *             /* the NFA, or NULL */
      47       19802 : newnfa(struct vars *v,
      48             :        struct colormap *cm,
      49             :        struct nfa *parent)      /* NULL if primary NFA */
      50             : {
      51             :     struct nfa *nfa;
      52             : 
      53       19802 :     nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
      54       19802 :     if (nfa == NULL)
      55             :     {
      56           0 :         ERR(REG_ESPACE);
      57           0 :         return NULL;
      58             :     }
      59             : 
      60             :     /* Make the NFA minimally valid, so freenfa() will behave sanely */
      61       19802 :     nfa->states = NULL;
      62       19802 :     nfa->slast = NULL;
      63       19802 :     nfa->freestates = NULL;
      64       19802 :     nfa->freearcs = NULL;
      65       19802 :     nfa->lastsb = NULL;
      66       19802 :     nfa->lastab = NULL;
      67       19802 :     nfa->lastsbused = 0;
      68       19802 :     nfa->lastabused = 0;
      69       19802 :     nfa->nstates = 0;
      70       19802 :     nfa->cm = cm;
      71       19802 :     nfa->v = v;
      72       19802 :     nfa->bos[0] = nfa->bos[1] = COLORLESS;
      73       19802 :     nfa->eos[0] = nfa->eos[1] = COLORLESS;
      74       19802 :     nfa->flags = 0;
      75       19802 :     nfa->minmatchall = nfa->maxmatchall = -1;
      76       19802 :     nfa->parent = parent;        /* Precedes newfstate so parent is valid. */
      77             : 
      78             :     /* Create required infrastructure */
      79       19802 :     nfa->post = newfstate(nfa, '@'); /* number 0 */
      80       19802 :     nfa->pre = newfstate(nfa, '>'); /* number 1 */
      81       19802 :     nfa->init = newstate(nfa);   /* may become invalid later */
      82       19802 :     nfa->final = newstate(nfa);
      83       19802 :     if (ISERR())
      84             :     {
      85           0 :         freenfa(nfa);
      86           0 :         return NULL;
      87             :     }
      88       19802 :     rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
      89       19802 :     newarc(nfa, '^', 1, nfa->pre, nfa->init);
      90       19802 :     newarc(nfa, '^', 0, nfa->pre, nfa->init);
      91       19802 :     rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
      92       19802 :     newarc(nfa, '$', 1, nfa->final, nfa->post);
      93       19802 :     newarc(nfa, '$', 0, nfa->final, nfa->post);
      94             : 
      95       19802 :     if (ISERR())
      96             :     {
      97           0 :         freenfa(nfa);
      98           0 :         return NULL;
      99             :     }
     100       19802 :     return nfa;
     101             : }
     102             : 
     103             : /*
     104             :  * freenfa - free an entire NFA
     105             :  */
     106             : static void
     107       19802 : freenfa(struct nfa *nfa)
     108             : {
     109             :     struct statebatch *sb;
     110             :     struct statebatch *sbnext;
     111             :     struct arcbatch *ab;
     112             :     struct arcbatch *abnext;
     113             : 
     114       41308 :     for (sb = nfa->lastsb; sb != NULL; sb = sbnext)
     115             :     {
     116       21506 :         sbnext = sb->next;
     117       21506 :         nfa->v->spaceused -= STATEBATCHSIZE(sb->nstates);
     118       21506 :         FREE(sb);
     119             :     }
     120       19802 :     nfa->lastsb = NULL;
     121       55798 :     for (ab = nfa->lastab; ab != NULL; ab = abnext)
     122             :     {
     123       35996 :         abnext = ab->next;
     124       35996 :         nfa->v->spaceused -= ARCBATCHSIZE(ab->narcs);
     125       35996 :         FREE(ab);
     126             :     }
     127       19802 :     nfa->lastab = NULL;
     128             : 
     129       19802 :     nfa->nstates = -1;
     130       19802 :     FREE(nfa);
     131       19802 : }
     132             : 
     133             : /*
     134             :  * newstate - allocate an NFA state, with zero flag value
     135             :  */
     136             : static struct state *           /* NULL on error */
     137      519194 : newstate(struct nfa *nfa)
     138             : {
     139             :     struct state *s;
     140             : 
     141             :     /*
     142             :      * This is a handy place to check for operation cancel during regex
     143             :      * compilation, since no code path will go very long without making a new
     144             :      * state or arc.
     145             :      */
     146      519194 :     INTERRUPT(nfa->v->re);
     147             : 
     148             :     /* first, recycle anything that's on the freelist */
     149      519194 :     if (nfa->freestates != NULL)
     150             :     {
     151       33928 :         s = nfa->freestates;
     152       33928 :         nfa->freestates = s->next;
     153             :     }
     154             :     /* otherwise, is there anything left in the last statebatch? */
     155      485266 :     else if (nfa->lastsb != NULL && nfa->lastsbused < nfa->lastsb->nstates)
     156             :     {
     157      463760 :         s = &nfa->lastsb->s[nfa->lastsbused++];
     158             :     }
     159             :     /* otherwise, need to allocate a new statebatch */
     160             :     else
     161             :     {
     162             :         struct statebatch *newSb;
     163             :         size_t      nstates;
     164             : 
     165       21506 :         if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
     166             :         {
     167           0 :             NERR(REG_ETOOBIG);
     168           0 :             return NULL;
     169             :         }
     170       21506 :         nstates = (nfa->lastsb != NULL) ? nfa->lastsb->nstates * 2 : FIRSTSBSIZE;
     171       21506 :         if (nstates > MAXSBSIZE)
     172          50 :             nstates = MAXSBSIZE;
     173       21506 :         newSb = (struct statebatch *) MALLOC(STATEBATCHSIZE(nstates));
     174       21506 :         if (newSb == NULL)
     175             :         {
     176           0 :             NERR(REG_ESPACE);
     177           0 :             return NULL;
     178             :         }
     179       21506 :         nfa->v->spaceused += STATEBATCHSIZE(nstates);
     180       21506 :         newSb->nstates = nstates;
     181       21506 :         newSb->next = nfa->lastsb;
     182       21506 :         nfa->lastsb = newSb;
     183       21506 :         nfa->lastsbused = 1;
     184       21506 :         s = &newSb->s[0];
     185             :     }
     186             : 
     187             :     assert(nfa->nstates >= 0);
     188      519194 :     s->no = nfa->nstates++;
     189      519194 :     s->flag = 0;
     190      519194 :     if (nfa->states == NULL)
     191       19802 :         nfa->states = s;
     192      519194 :     s->nins = 0;
     193      519194 :     s->ins = NULL;
     194      519194 :     s->nouts = 0;
     195      519194 :     s->outs = NULL;
     196      519194 :     s->tmp = NULL;
     197      519194 :     s->next = NULL;
     198      519194 :     if (nfa->slast != NULL)
     199             :     {
     200             :         assert(nfa->slast->next == NULL);
     201      499392 :         nfa->slast->next = s;
     202             :     }
     203      519194 :     s->prev = nfa->slast;
     204      519194 :     nfa->slast = s;
     205      519194 :     return s;
     206             : }
     207             : 
     208             : /*
     209             :  * newfstate - allocate an NFA state with a specified flag value
     210             :  */
     211             : static struct state *           /* NULL on error */
     212       39604 : newfstate(struct nfa *nfa, int flag)
     213             : {
     214             :     struct state *s;
     215             : 
     216       39604 :     s = newstate(nfa);
     217       39604 :     if (s != NULL)
     218       39604 :         s->flag = (char) flag;
     219       39604 :     return s;
     220             : }
     221             : 
     222             : /*
     223             :  * dropstate - delete a state's inarcs and outarcs and free it
     224             :  */
     225             : static void
     226      215106 : dropstate(struct nfa *nfa,
     227             :           struct state *s)
     228             : {
     229             :     struct arc *a;
     230             : 
     231      252572 :     while ((a = s->ins) != NULL)
     232       37466 :         freearc(nfa, a);
     233      354770 :     while ((a = s->outs) != NULL)
     234      139664 :         freearc(nfa, a);
     235      215106 :     freestate(nfa, s);
     236      215106 : }
     237             : 
     238             : /*
     239             :  * freestate - free a state, which has no in-arcs or out-arcs
     240             :  */
     241             : static void
     242      224894 : freestate(struct nfa *nfa,
     243             :           struct state *s)
     244             : {
     245             :     assert(s != NULL);
     246             :     assert(s->nins == 0 && s->nouts == 0);
     247             : 
     248      224894 :     s->no = FREESTATE;
     249      224894 :     s->flag = 0;
     250      224894 :     if (s->next != NULL)
     251      212416 :         s->next->prev = s->prev;
     252             :     else
     253             :     {
     254             :         assert(s == nfa->slast);
     255       12478 :         nfa->slast = s->prev;
     256             :     }
     257      224894 :     if (s->prev != NULL)
     258      224894 :         s->prev->next = s->next;
     259             :     else
     260             :     {
     261             :         assert(s == nfa->states);
     262           0 :         nfa->states = s->next;
     263             :     }
     264      224894 :     s->prev = NULL;
     265      224894 :     s->next = nfa->freestates;    /* don't delete it, put it on the free list */
     266      224894 :     nfa->freestates = s;
     267      224894 : }
     268             : 
     269             : /*
     270             :  * newarc - set up a new arc within an NFA
     271             :  *
     272             :  * This function checks to make sure that no duplicate arcs are created.
     273             :  * In general we never want duplicates.
     274             :  *
     275             :  * However: in principle, a RAINBOW arc is redundant with any plain arc
     276             :  * (unless that arc is for a pseudocolor).  But we don't try to recognize
     277             :  * that redundancy, either here or in allied operations such as moveins().
     278             :  * The pseudocolor consideration makes that more costly than it seems worth.
     279             :  */
     280             : static void
     281     1904702 : newarc(struct nfa *nfa,
     282             :        int t,
     283             :        color co,
     284             :        struct state *from,
     285             :        struct state *to)
     286             : {
     287             :     struct arc *a;
     288             : 
     289             :     assert(from != NULL && to != NULL);
     290             : 
     291             :     /*
     292             :      * This is a handy place to check for operation cancel during regex
     293             :      * compilation, since no code path will go very long without making a new
     294             :      * state or arc.
     295             :      */
     296     1904702 :     INTERRUPT(nfa->v->re);
     297             : 
     298             :     /* check for duplicate arc, using whichever chain is shorter */
     299     1904702 :     if (from->nouts <= to->nins)
     300             :     {
     301     6031728 :         for (a = from->outs; a != NULL; a = a->outchain)
     302     5129962 :             if (a->to == to && a->co == co && a->type == t)
     303      170652 :                 return;
     304             :     }
     305             :     else
     306             :     {
     307    76782148 :         for (a = to->ins; a != NULL; a = a->inchain)
     308    76121504 :             if (a->from == from && a->co == co && a->type == t)
     309      171640 :                 return;
     310             :     }
     311             : 
     312             :     /* no dup, so create the arc */
     313     1562410 :     createarc(nfa, t, co, from, to);
     314             : }
     315             : 
     316             : /*
     317             :  * createarc - create a new arc within an NFA
     318             :  *
     319             :  * This function must *only* be used after verifying that there is no existing
     320             :  * identical arc (same type/color/from/to).
     321             :  */
     322             : static void
     323    17449602 : createarc(struct nfa *nfa,
     324             :           int t,
     325             :           color co,
     326             :           struct state *from,
     327             :           struct state *to)
     328             : {
     329             :     struct arc *a;
     330             : 
     331    17449602 :     a = allocarc(nfa);
     332    17449602 :     if (NISERR())
     333       11418 :         return;
     334             :     assert(a != NULL);
     335             : 
     336    17438184 :     a->type = t;
     337    17438184 :     a->co = co;
     338    17438184 :     a->to = to;
     339    17438184 :     a->from = from;
     340             : 
     341             :     /*
     342             :      * Put the new arc on the beginning, not the end, of the chains; it's
     343             :      * simpler here, and freearc() is the same cost either way.  See also the
     344             :      * logic in moveins() and its cohorts, as well as fixempties().
     345             :      */
     346    17438184 :     a->inchain = to->ins;
     347    17438184 :     a->inchainRev = NULL;
     348    17438184 :     if (to->ins)
     349    16821238 :         to->ins->inchainRev = a;
     350    17438184 :     to->ins = a;
     351    17438184 :     a->outchain = from->outs;
     352    17438184 :     a->outchainRev = NULL;
     353    17438184 :     if (from->outs)
     354    16904890 :         from->outs->outchainRev = a;
     355    17438184 :     from->outs = a;
     356             : 
     357    17438184 :     from->nouts++;
     358    17438184 :     to->nins++;
     359             : 
     360    17438184 :     if (COLORED(a) && nfa->parent == NULL)
     361     1599290 :         colorchain(nfa->cm, a);
     362             : }
     363             : 
     364             : /*
     365             :  * allocarc - allocate a new arc within an NFA
     366             :  */
     367             : static struct arc *             /* NULL for failure */
     368    17449602 : allocarc(struct nfa *nfa)
     369             : {
     370             :     struct arc *a;
     371             : 
     372             :     /* first, recycle anything that's on the freelist */
     373    17449602 :     if (nfa->freearcs != NULL)
     374             :     {
     375     1386892 :         a = nfa->freearcs;
     376     1386892 :         nfa->freearcs = a->freechain;
     377             :     }
     378             :     /* otherwise, is there anything left in the last arcbatch? */
     379    16062710 :     else if (nfa->lastab != NULL && nfa->lastabused < nfa->lastab->narcs)
     380             :     {
     381    16015296 :         a = &nfa->lastab->a[nfa->lastabused++];
     382             :     }
     383             :     /* otherwise, need to allocate a new arcbatch */
     384             :     else
     385             :     {
     386             :         struct arcbatch *newAb;
     387             :         size_t      narcs;
     388             : 
     389       47414 :         if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
     390             :         {
     391       11418 :             NERR(REG_ETOOBIG);
     392       11418 :             return NULL;
     393             :         }
     394       35996 :         narcs = (nfa->lastab != NULL) ? nfa->lastab->narcs * 2 : FIRSTABSIZE;
     395       35996 :         if (narcs > MAXABSIZE)
     396       15058 :             narcs = MAXABSIZE;
     397       35996 :         newAb = (struct arcbatch *) MALLOC(ARCBATCHSIZE(narcs));
     398       35996 :         if (newAb == NULL)
     399             :         {
     400           0 :             NERR(REG_ESPACE);
     401           0 :             return NULL;
     402             :         }
     403       35996 :         nfa->v->spaceused += ARCBATCHSIZE(narcs);
     404       35996 :         newAb->narcs = narcs;
     405       35996 :         newAb->next = nfa->lastab;
     406       35996 :         nfa->lastab = newAb;
     407       35996 :         nfa->lastabused = 1;
     408       35996 :         a = &newAb->a[0];
     409             :     }
     410             : 
     411    17438184 :     return a;
     412             : }
     413             : 
     414             : /*
     415             :  * freearc - free an arc
     416             :  */
     417             : static void
     418     1612798 : freearc(struct nfa *nfa,
     419             :         struct arc *victim)
     420             : {
     421     1612798 :     struct state *from = victim->from;
     422     1612798 :     struct state *to = victim->to;
     423             :     struct arc *predecessor;
     424             : 
     425             :     assert(victim->type != 0);
     426             : 
     427             :     /* take it off color chain if necessary */
     428     1612798 :     if (COLORED(victim) && nfa->parent == NULL)
     429      739348 :         uncolorchain(nfa->cm, victim);
     430             : 
     431             :     /* take it off source's out-chain */
     432             :     assert(from != NULL);
     433     1612798 :     predecessor = victim->outchainRev;
     434     1612798 :     if (predecessor == NULL)
     435             :     {
     436             :         assert(from->outs == victim);
     437      465090 :         from->outs = victim->outchain;
     438             :     }
     439             :     else
     440             :     {
     441             :         assert(predecessor->outchain == victim);
     442     1147708 :         predecessor->outchain = victim->outchain;
     443             :     }
     444     1612798 :     if (victim->outchain != NULL)
     445             :     {
     446             :         assert(victim->outchain->outchainRev == victim);
     447     1024468 :         victim->outchain->outchainRev = predecessor;
     448             :     }
     449     1612798 :     from->nouts--;
     450             : 
     451             :     /* take it off target's in-chain */
     452             :     assert(to != NULL);
     453     1612798 :     predecessor = victim->inchainRev;
     454     1612798 :     if (predecessor == NULL)
     455             :     {
     456             :         assert(to->ins == victim);
     457      777288 :         to->ins = victim->inchain;
     458             :     }
     459             :     else
     460             :     {
     461             :         assert(predecessor->inchain == victim);
     462      835510 :         predecessor->inchain = victim->inchain;
     463             :     }
     464     1612798 :     if (victim->inchain != NULL)
     465             :     {
     466             :         assert(victim->inchain->inchainRev == victim);
     467     1055708 :         victim->inchain->inchainRev = predecessor;
     468             :     }
     469     1612798 :     to->nins--;
     470             : 
     471             :     /* clean up and place on NFA's free list */
     472     1612798 :     victim->type = 0;
     473     1612798 :     victim->from = NULL;     /* precautions... */
     474     1612798 :     victim->to = NULL;
     475     1612798 :     victim->inchain = NULL;
     476     1612798 :     victim->inchainRev = NULL;
     477     1612798 :     victim->outchain = NULL;
     478     1612798 :     victim->outchainRev = NULL;
     479     1612798 :     victim->freechain = nfa->freearcs;
     480     1612798 :     nfa->freearcs = victim;
     481     1612798 : }
     482             : 
     483             : /*
     484             :  * changearcsource - flip an arc to have a different from state
     485             :  *
     486             :  * Caller must have verified that there is no pre-existing duplicate arc.
     487             :  */
     488             : static void
     489         588 : changearcsource(struct arc *a, struct state *newfrom)
     490             : {
     491         588 :     struct state *oldfrom = a->from;
     492             :     struct arc *predecessor;
     493             : 
     494             :     assert(oldfrom != newfrom);
     495             : 
     496             :     /* take it off old source's out-chain */
     497             :     assert(oldfrom != NULL);
     498         588 :     predecessor = a->outchainRev;
     499         588 :     if (predecessor == NULL)
     500             :     {
     501             :         assert(oldfrom->outs == a);
     502         588 :         oldfrom->outs = a->outchain;
     503             :     }
     504             :     else
     505             :     {
     506             :         assert(predecessor->outchain == a);
     507           0 :         predecessor->outchain = a->outchain;
     508             :     }
     509         588 :     if (a->outchain != NULL)
     510             :     {
     511             :         assert(a->outchain->outchainRev == a);
     512         574 :         a->outchain->outchainRev = predecessor;
     513             :     }
     514         588 :     oldfrom->nouts--;
     515             : 
     516         588 :     a->from = newfrom;
     517             : 
     518             :     /* prepend it to new source's out-chain */
     519         588 :     a->outchain = newfrom->outs;
     520         588 :     a->outchainRev = NULL;
     521         588 :     if (newfrom->outs)
     522         588 :         newfrom->outs->outchainRev = a;
     523         588 :     newfrom->outs = a;
     524         588 :     newfrom->nouts++;
     525         588 : }
     526             : 
     527             : /*
     528             :  * changearctarget - flip an arc to have a different to state
     529             :  *
     530             :  * Caller must have verified that there is no pre-existing duplicate arc.
     531             :  */
     532             : static void
     533         320 : changearctarget(struct arc *a, struct state *newto)
     534             : {
     535         320 :     struct state *oldto = a->to;
     536             :     struct arc *predecessor;
     537             : 
     538             :     assert(oldto != newto);
     539             : 
     540             :     /* take it off old target's in-chain */
     541             :     assert(oldto != NULL);
     542         320 :     predecessor = a->inchainRev;
     543         320 :     if (predecessor == NULL)
     544             :     {
     545             :         assert(oldto->ins == a);
     546         320 :         oldto->ins = a->inchain;
     547             :     }
     548             :     else
     549             :     {
     550             :         assert(predecessor->inchain == a);
     551           0 :         predecessor->inchain = a->inchain;
     552             :     }
     553         320 :     if (a->inchain != NULL)
     554             :     {
     555             :         assert(a->inchain->inchainRev == a);
     556         312 :         a->inchain->inchainRev = predecessor;
     557             :     }
     558         320 :     oldto->nins--;
     559             : 
     560         320 :     a->to = newto;
     561             : 
     562             :     /* prepend it to new target's in-chain */
     563         320 :     a->inchain = newto->ins;
     564         320 :     a->inchainRev = NULL;
     565         320 :     if (newto->ins)
     566         320 :         newto->ins->inchainRev = a;
     567         320 :     newto->ins = a;
     568         320 :     newto->nins++;
     569         320 : }
     570             : 
     571             : /*
     572             :  * hasnonemptyout - Does state have a non-EMPTY out arc?
     573             :  */
     574             : static int
     575      239436 : hasnonemptyout(struct state *s)
     576             : {
     577             :     struct arc *a;
     578             : 
     579      267662 :     for (a = s->outs; a != NULL; a = a->outchain)
     580             :     {
     581      263702 :         if (a->type != EMPTY)
     582      235476 :             return 1;
     583             :     }
     584        3960 :     return 0;
     585             : }
     586             : 
     587             : /*
     588             :  * findarc - find arc, if any, from given source with given type and color
     589             :  * If there is more than one such arc, the result is random.
     590             :  */
     591             : static struct arc *
     592         638 : findarc(struct state *s,
     593             :         int type,
     594             :         color co)
     595             : {
     596             :     struct arc *a;
     597             : 
     598        1924 :     for (a = s->outs; a != NULL; a = a->outchain)
     599        1292 :         if (a->type == type && a->co == co)
     600           6 :             return a;
     601         632 :     return NULL;
     602             : }
     603             : 
     604             : /*
     605             :  * cparc - allocate a new arc within an NFA, copying details from old one
     606             :  */
     607             : static void
     608     1500330 : cparc(struct nfa *nfa,
     609             :       struct arc *oa,
     610             :       struct state *from,
     611             :       struct state *to)
     612             : {
     613     1500330 :     newarc(nfa, oa->type, oa->co, from, to);
     614     1500330 : }
     615             : 
     616             : /*
     617             :  * sortins - sort the in arcs of a state by from/color/type
     618             :  */
     619             : static void
     620       30806 : sortins(struct nfa *nfa,
     621             :         struct state *s)
     622             : {
     623             :     struct arc **sortarray;
     624             :     struct arc *a;
     625       30806 :     int         n = s->nins;
     626             :     int         i;
     627             : 
     628       30806 :     if (n <= 1)
     629           4 :         return;                 /* nothing to do */
     630             :     /* make an array of arc pointers ... */
     631       30802 :     sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
     632       30802 :     if (sortarray == NULL)
     633             :     {
     634           0 :         NERR(REG_ESPACE);
     635           0 :         return;
     636             :     }
     637       30802 :     i = 0;
     638      136740 :     for (a = s->ins; a != NULL; a = a->inchain)
     639      105938 :         sortarray[i++] = a;
     640             :     assert(i == n);
     641             :     /* ... sort the array */
     642       30802 :     qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
     643             :     /* ... and rebuild arc list in order */
     644             :     /* it seems worth special-casing first and last items to simplify loop */
     645       30802 :     a = sortarray[0];
     646       30802 :     s->ins = a;
     647       30802 :     a->inchain = sortarray[1];
     648       30802 :     a->inchainRev = NULL;
     649       75136 :     for (i = 1; i < n - 1; i++)
     650             :     {
     651       44334 :         a = sortarray[i];
     652       44334 :         a->inchain = sortarray[i + 1];
     653       44334 :         a->inchainRev = sortarray[i - 1];
     654             :     }
     655       30802 :     a = sortarray[i];
     656       30802 :     a->inchain = NULL;
     657       30802 :     a->inchainRev = sortarray[i - 1];
     658       30802 :     FREE(sortarray);
     659             : }
     660             : 
     661             : static int
     662    75788308 : sortins_cmp(const void *a, const void *b)
     663             : {
     664    75788308 :     const struct arc *aa = *((const struct arc *const *) a);
     665    75788308 :     const struct arc *bb = *((const struct arc *const *) b);
     666             : 
     667             :     /* we check the fields in the order they are most likely to be different */
     668    75788308 :     if (aa->from->no < bb->from->no)
     669    60277810 :         return -1;
     670    15510498 :     if (aa->from->no > bb->from->no)
     671    15048726 :         return 1;
     672      461772 :     if (aa->co < bb->co)
     673      245572 :         return -1;
     674      216200 :     if (aa->co > bb->co)
     675      212976 :         return 1;
     676        3224 :     if (aa->type < bb->type)
     677         136 :         return -1;
     678        3088 :     if (aa->type > bb->type)
     679          66 :         return 1;
     680        3022 :     return 0;
     681             : }
     682             : 
     683             : /*
     684             :  * sortouts - sort the out arcs of a state by to/color/type
     685             :  */
     686             : static void
     687          32 : sortouts(struct nfa *nfa,
     688             :          struct state *s)
     689             : {
     690             :     struct arc **sortarray;
     691             :     struct arc *a;
     692          32 :     int         n = s->nouts;
     693             :     int         i;
     694             : 
     695          32 :     if (n <= 1)
     696           0 :         return;                 /* nothing to do */
     697             :     /* make an array of arc pointers ... */
     698          32 :     sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
     699          32 :     if (sortarray == NULL)
     700             :     {
     701           0 :         NERR(REG_ESPACE);
     702           0 :         return;
     703             :     }
     704          32 :     i = 0;
     705         968 :     for (a = s->outs; a != NULL; a = a->outchain)
     706         936 :         sortarray[i++] = a;
     707             :     assert(i == n);
     708             :     /* ... sort the array */
     709          32 :     qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
     710             :     /* ... and rebuild arc list in order */
     711             :     /* it seems worth special-casing first and last items to simplify loop */
     712          32 :     a = sortarray[0];
     713          32 :     s->outs = a;
     714          32 :     a->outchain = sortarray[1];
     715          32 :     a->outchainRev = NULL;
     716         904 :     for (i = 1; i < n - 1; i++)
     717             :     {
     718         872 :         a = sortarray[i];
     719         872 :         a->outchain = sortarray[i + 1];
     720         872 :         a->outchainRev = sortarray[i - 1];
     721             :     }
     722          32 :     a = sortarray[i];
     723          32 :     a->outchain = NULL;
     724          32 :     a->outchainRev = sortarray[i - 1];
     725          32 :     FREE(sortarray);
     726             : }
     727             : 
     728             : static int
     729        5492 : sortouts_cmp(const void *a, const void *b)
     730             : {
     731        5492 :     const struct arc *aa = *((const struct arc *const *) a);
     732        5492 :     const struct arc *bb = *((const struct arc *const *) b);
     733             : 
     734             :     /* we check the fields in the order they are most likely to be different */
     735        5492 :     if (aa->to->no < bb->to->no)
     736         610 :         return -1;
     737        4882 :     if (aa->to->no > bb->to->no)
     738         160 :         return 1;
     739        4722 :     if (aa->co < bb->co)
     740        2562 :         return -1;
     741        2160 :     if (aa->co > bb->co)
     742        2116 :         return 1;
     743          44 :     if (aa->type < bb->type)
     744           2 :         return -1;
     745          42 :     if (aa->type > bb->type)
     746          14 :         return 1;
     747          28 :     return 0;
     748             : }
     749             : 
     750             : /*
     751             :  * Common decision logic about whether to use arc-by-arc operations or
     752             :  * sort/merge.  If there's just a few source arcs we cannot recoup the
     753             :  * cost of sorting the destination arc list, no matter how large it is.
     754             :  * Otherwise, limit the number of arc-by-arc comparisons to about 1000
     755             :  * (a somewhat arbitrary choice, but the breakeven point would probably
     756             :  * be machine dependent anyway).
     757             :  */
     758             : #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
     759             :     ((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
     760             : 
     761             : /*
     762             :  * moveins - move all in arcs of a state to another state
     763             :  *
     764             :  * You might think this could be done better by just updating the
     765             :  * existing arcs, and you would be right if it weren't for the need
     766             :  * for duplicate suppression, which makes it easier to just make new
     767             :  * ones to exploit the suppression built into newarc.
     768             :  *
     769             :  * However, if we have a whole lot of arcs to deal with, retail duplicate
     770             :  * checks become too slow.  In that case we proceed by sorting and merging
     771             :  * the arc lists, and then we can indeed just update the arcs in-place.
     772             :  *
     773             :  * On the other hand, it's also true that this is frequently called with
     774             :  * a brand-new newState that has no existing in-arcs.  In that case,
     775             :  * de-duplication is unnecessary, so we can just blindly move all the arcs.
     776             :  */
     777             : static void
     778      260922 : moveins(struct nfa *nfa,
     779             :         struct state *oldState,
     780             :         struct state *newState)
     781             : {
     782             :     assert(oldState != newState);
     783             : 
     784      260922 :     if (newState->nins == 0)
     785             :     {
     786             :         /* No need for de-duplication */
     787             :         struct arc *a;
     788             : 
     789      216952 :         while ((a = oldState->ins) != NULL)
     790             :         {
     791      111542 :             createarc(nfa, a->type, a->co, a->from, newState);
     792      111542 :             freearc(nfa, a);
     793             :         }
     794             :     }
     795      155512 :     else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
     796      155420 :     {
     797             :         /* With not too many arcs, just do them one at a time */
     798             :         struct arc *a;
     799             : 
     800      382688 :         while ((a = oldState->ins) != NULL)
     801             :         {
     802      227268 :             cparc(nfa, a, a->from, newState);
     803      227268 :             freearc(nfa, a);
     804             :         }
     805             :     }
     806             :     else
     807             :     {
     808             :         /*
     809             :          * With many arcs, use a sort-merge approach.  Note changearctarget()
     810             :          * will put the arc onto the front of newState's chain, so it does not
     811             :          * break our walk through the sorted part of the chain.
     812             :          */
     813             :         struct arc *oa;
     814             :         struct arc *na;
     815             : 
     816             :         /*
     817             :          * Because we bypass newarc() in this code path, we'd better include a
     818             :          * cancel check.
     819             :          */
     820          92 :         INTERRUPT(nfa->v->re);
     821             : 
     822          92 :         sortins(nfa, oldState);
     823          92 :         sortins(nfa, newState);
     824          92 :         if (NISERR())
     825           0 :             return;             /* might have failed to sort */
     826          92 :         oa = oldState->ins;
     827          92 :         na = newState->ins;
     828        3398 :         while (oa != NULL && na != NULL)
     829             :         {
     830        3214 :             struct arc *a = oa;
     831             : 
     832        3214 :             switch (sortins_cmp(&oa, &na))
     833             :             {
     834         166 :                 case -1:
     835             :                     /* newState does not have anything matching oa */
     836         166 :                     oa = oa->inchain;
     837             : 
     838             :                     /*
     839             :                      * Rather than doing createarc+freearc, we can just unlink
     840             :                      * and relink the existing arc struct.
     841             :                      */
     842         166 :                     changearctarget(a, newState);
     843         166 :                     break;
     844         472 :                 case 0:
     845             :                     /* match, advance in both lists */
     846         472 :                     oa = oa->inchain;
     847         472 :                     na = na->inchain;
     848             :                     /* ... and drop duplicate arc from oldState */
     849         472 :                     freearc(nfa, a);
     850         472 :                     break;
     851        2576 :                 case +1:
     852             :                     /* advance only na; oa might have a match later */
     853        2576 :                     na = na->inchain;
     854        2576 :                     break;
     855        3214 :                 default:
     856             :                     assert(NOTREACHED);
     857             :             }
     858             :         }
     859         246 :         while (oa != NULL)
     860             :         {
     861             :             /* newState does not have anything matching oa */
     862         154 :             struct arc *a = oa;
     863             : 
     864         154 :             oa = oa->inchain;
     865         154 :             changearctarget(a, newState);
     866             :         }
     867             :     }
     868             : 
     869             :     assert(oldState->nins == 0);
     870             :     assert(oldState->ins == NULL);
     871             : }
     872             : 
     873             : /*
     874             :  * copyins - copy in arcs of a state to another state
     875             :  *
     876             :  * The comments for moveins() apply here as well.  However, in current
     877             :  * usage, this is *only* called with brand-new target states, so that
     878             :  * only the "no need for de-duplication" code path is ever reached.
     879             :  * We keep the rest #ifdef'd out in case it's needed in the future.
     880             :  */
     881             : static void
     882       18950 : copyins(struct nfa *nfa,
     883             :         struct state *oldState,
     884             :         struct state *newState)
     885             : {
     886             :     assert(oldState != newState);
     887             :     assert(newState->nins == 0); /* see comment above */
     888             : 
     889       18950 :     if (newState->nins == 0)
     890             :     {
     891             :         /* No need for de-duplication */
     892             :         struct arc *a;
     893             : 
     894      260316 :         for (a = oldState->ins; a != NULL; a = a->inchain)
     895      241366 :             createarc(nfa, a->type, a->co, a->from, newState);
     896             :     }
     897             : #ifdef NOT_USED                 /* see comment above */
     898             :     else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
     899             :     {
     900             :         /* With not too many arcs, just do them one at a time */
     901             :         struct arc *a;
     902             : 
     903             :         for (a = oldState->ins; a != NULL; a = a->inchain)
     904             :             cparc(nfa, a, a->from, newState);
     905             :     }
     906             :     else
     907             :     {
     908             :         /*
     909             :          * With many arcs, use a sort-merge approach.  Note that createarc()
     910             :          * will put new arcs onto the front of newState's chain, so it does
     911             :          * not break our walk through the sorted part of the chain.
     912             :          */
     913             :         struct arc *oa;
     914             :         struct arc *na;
     915             : 
     916             :         /*
     917             :          * Because we bypass newarc() in this code path, we'd better include a
     918             :          * cancel check.
     919             :          */
     920             :         INTERRUPT(nfa->v->re);
     921             : 
     922             :         sortins(nfa, oldState);
     923             :         sortins(nfa, newState);
     924             :         if (NISERR())
     925             :             return;             /* might have failed to sort */
     926             :         oa = oldState->ins;
     927             :         na = newState->ins;
     928             :         while (oa != NULL && na != NULL)
     929             :         {
     930             :             struct arc *a = oa;
     931             : 
     932             :             switch (sortins_cmp(&oa, &na))
     933             :             {
     934             :                 case -1:
     935             :                     /* newState does not have anything matching oa */
     936             :                     oa = oa->inchain;
     937             :                     createarc(nfa, a->type, a->co, a->from, newState);
     938             :                     break;
     939             :                 case 0:
     940             :                     /* match, advance in both lists */
     941             :                     oa = oa->inchain;
     942             :                     na = na->inchain;
     943             :                     break;
     944             :                 case +1:
     945             :                     /* advance only na; oa might have a match later */
     946             :                     na = na->inchain;
     947             :                     break;
     948             :                 default:
     949             :                     assert(NOTREACHED);
     950             :             }
     951             :         }
     952             :         while (oa != NULL)
     953             :         {
     954             :             /* newState does not have anything matching oa */
     955             :             struct arc *a = oa;
     956             : 
     957             :             oa = oa->inchain;
     958             :             createarc(nfa, a->type, a->co, a->from, newState);
     959             :         }
     960             :     }
     961             : #endif                          /* NOT_USED */
     962       18950 : }
     963             : 
     964             : /*
     965             :  * mergeins - merge a list of inarcs into a state
     966             :  *
     967             :  * This is much like copyins, but the source arcs are listed in an array,
     968             :  * and are not guaranteed unique.  It's okay to clobber the array contents.
     969             :  */
     970             : static void
     971      274592 : mergeins(struct nfa *nfa,
     972             :          struct state *s,
     973             :          struct arc **arcarray,
     974             :          int arccount)
     975             : {
     976             :     struct arc *na;
     977             :     int         i;
     978             :     int         j;
     979             : 
     980      274592 :     if (arccount <= 0)
     981      243970 :         return;
     982             : 
     983             :     /*
     984             :      * Because we bypass newarc() in this code path, we'd better include a
     985             :      * cancel check.
     986             :      */
     987       30622 :     INTERRUPT(nfa->v->re);
     988             : 
     989             :     /* Sort existing inarcs as well as proposed new ones */
     990       30622 :     sortins(nfa, s);
     991       30622 :     if (NISERR())
     992           0 :         return;                 /* might have failed to sort */
     993             : 
     994       30622 :     qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
     995             : 
     996             :     /*
     997             :      * arcarray very likely includes dups, so we must eliminate them.  (This
     998             :      * could be folded into the next loop, but it's not worth the trouble.)
     999             :      */
    1000       30622 :     j = 0;
    1001    15030890 :     for (i = 1; i < arccount; i++)
    1002             :     {
    1003    15000268 :         switch (sortins_cmp(&arcarray[j], &arcarray[i]))
    1004             :         {
    1005    14999056 :             case -1:
    1006             :                 /* non-dup */
    1007    14999056 :                 arcarray[++j] = arcarray[i];
    1008    14999056 :                 break;
    1009        1212 :             case 0:
    1010             :                 /* dup */
    1011        1212 :                 break;
    1012    15000268 :             default:
    1013             :                 /* trouble */
    1014             :                 assert(NOTREACHED);
    1015             :         }
    1016             :     }
    1017       30622 :     arccount = j + 1;
    1018             : 
    1019             :     /*
    1020             :      * Now merge into s' inchain.  Note that createarc() will put new arcs
    1021             :      * onto the front of s's chain, so it does not break our walk through the
    1022             :      * sorted part of the chain.
    1023             :      */
    1024       30622 :     i = 0;
    1025       30622 :     na = s->ins;
    1026    15061092 :     while (i < arccount && na != NULL)
    1027             :     {
    1028    15030470 :         struct arc *a = arcarray[i];
    1029             : 
    1030    15030470 :         switch (sortins_cmp(&a, &na))
    1031             :         {
    1032    14987258 :             case -1:
    1033             :                 /* s does not have anything matching a */
    1034    14987258 :                 createarc(nfa, a->type, a->co, a->from, s);
    1035    14987258 :                 i++;
    1036    14987258 :                 break;
    1037          20 :             case 0:
    1038             :                 /* match, advance in both lists */
    1039          20 :                 i++;
    1040          20 :                 na = na->inchain;
    1041          20 :                 break;
    1042       43192 :             case +1:
    1043             :                 /* advance only na; array might have a match later */
    1044       43192 :                 na = na->inchain;
    1045       43192 :                 break;
    1046    15030470 :             default:
    1047             :                 assert(NOTREACHED);
    1048             :         }
    1049             :     }
    1050       73022 :     while (i < arccount)
    1051             :     {
    1052             :         /* s does not have anything matching a */
    1053       42400 :         struct arc *a = arcarray[i];
    1054             : 
    1055       42400 :         createarc(nfa, a->type, a->co, a->from, s);
    1056       42400 :         i++;
    1057             :     }
    1058             : }
    1059             : 
    1060             : /*
    1061             :  * moveouts - move all out arcs of a state to another state
    1062             :  *
    1063             :  * See comments for moveins()
    1064             :  */
    1065             : static void
    1066       70244 : moveouts(struct nfa *nfa,
    1067             :          struct state *oldState,
    1068             :          struct state *newState)
    1069             : {
    1070             :     assert(oldState != newState);
    1071             : 
    1072       70244 :     if (newState->nouts == 0)
    1073             :     {
    1074             :         /* No need for de-duplication */
    1075             :         struct arc *a;
    1076             : 
    1077       55892 :         while ((a = oldState->outs) != NULL)
    1078             :         {
    1079       30740 :             createarc(nfa, a->type, a->co, newState, a->to);
    1080       30740 :             freearc(nfa, a);
    1081             :         }
    1082             :     }
    1083       45092 :     else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
    1084       45076 :     {
    1085             :         /* With not too many arcs, just do them one at a time */
    1086             :         struct arc *a;
    1087             : 
    1088      108776 :         while ((a = oldState->outs) != NULL)
    1089             :         {
    1090       63700 :             cparc(nfa, a, newState, a->to);
    1091       63700 :             freearc(nfa, a);
    1092             :         }
    1093             :     }
    1094             :     else
    1095             :     {
    1096             :         /*
    1097             :          * With many arcs, use a sort-merge approach.  Note changearcsource()
    1098             :          * will put the arc onto the front of newState's chain, so it does not
    1099             :          * break our walk through the sorted part of the chain.
    1100             :          */
    1101             :         struct arc *oa;
    1102             :         struct arc *na;
    1103             : 
    1104             :         /*
    1105             :          * Because we bypass newarc() in this code path, we'd better include a
    1106             :          * cancel check.
    1107             :          */
    1108          16 :         INTERRUPT(nfa->v->re);
    1109             : 
    1110          16 :         sortouts(nfa, oldState);
    1111          16 :         sortouts(nfa, newState);
    1112          16 :         if (NISERR())
    1113           0 :             return;             /* might have failed to sort */
    1114          16 :         oa = oldState->outs;
    1115          16 :         na = newState->outs;
    1116         592 :         while (oa != NULL && na != NULL)
    1117             :         {
    1118         560 :             struct arc *a = oa;
    1119             : 
    1120         560 :             switch (sortouts_cmp(&oa, &na))
    1121             :             {
    1122         516 :                 case -1:
    1123             :                     /* newState does not have anything matching oa */
    1124         516 :                     oa = oa->outchain;
    1125             : 
    1126             :                     /*
    1127             :                      * Rather than doing createarc+freearc, we can just unlink
    1128             :                      * and relink the existing arc struct.
    1129             :                      */
    1130         516 :                     changearcsource(a, newState);
    1131         516 :                     break;
    1132          28 :                 case 0:
    1133             :                     /* match, advance in both lists */
    1134          28 :                     oa = oa->outchain;
    1135          28 :                     na = na->outchain;
    1136             :                     /* ... and drop duplicate arc from oldState */
    1137          28 :                     freearc(nfa, a);
    1138          28 :                     break;
    1139          16 :                 case +1:
    1140             :                     /* advance only na; oa might have a match later */
    1141          16 :                     na = na->outchain;
    1142          16 :                     break;
    1143         560 :                 default:
    1144             :                     assert(NOTREACHED);
    1145             :             }
    1146             :         }
    1147          88 :         while (oa != NULL)
    1148             :         {
    1149             :             /* newState does not have anything matching oa */
    1150          72 :             struct arc *a = oa;
    1151             : 
    1152          72 :             oa = oa->outchain;
    1153          72 :             changearcsource(a, newState);
    1154             :         }
    1155             :     }
    1156             : 
    1157             :     assert(oldState->nouts == 0);
    1158             :     assert(oldState->outs == NULL);
    1159             : }
    1160             : 
    1161             : /*
    1162             :  * copyouts - copy out arcs of a state to another state
    1163             :  *
    1164             :  * See comments for copyins()
    1165             :  */
    1166             : static void
    1167       14342 : copyouts(struct nfa *nfa,
    1168             :          struct state *oldState,
    1169             :          struct state *newState)
    1170             : {
    1171             :     assert(oldState != newState);
    1172             :     assert(newState->nouts == 0);    /* see comment above */
    1173             : 
    1174       14342 :     if (newState->nouts == 0)
    1175             :     {
    1176             :         /* No need for de-duplication */
    1177             :         struct arc *a;
    1178             : 
    1179      488228 :         for (a = oldState->outs; a != NULL; a = a->outchain)
    1180      473886 :             createarc(nfa, a->type, a->co, newState, a->to);
    1181             :     }
    1182             : #ifdef NOT_USED                 /* see comment above */
    1183             :     else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
    1184             :     {
    1185             :         /* With not too many arcs, just do them one at a time */
    1186             :         struct arc *a;
    1187             : 
    1188             :         for (a = oldState->outs; a != NULL; a = a->outchain)
    1189             :             cparc(nfa, a, newState, a->to);
    1190             :     }
    1191             :     else
    1192             :     {
    1193             :         /*
    1194             :          * With many arcs, use a sort-merge approach.  Note that createarc()
    1195             :          * will put new arcs onto the front of newState's chain, so it does
    1196             :          * not break our walk through the sorted part of the chain.
    1197             :          */
    1198             :         struct arc *oa;
    1199             :         struct arc *na;
    1200             : 
    1201             :         /*
    1202             :          * Because we bypass newarc() in this code path, we'd better include a
    1203             :          * cancel check.
    1204             :          */
    1205             :         INTERRUPT(nfa->v->re);
    1206             : 
    1207             :         sortouts(nfa, oldState);
    1208             :         sortouts(nfa, newState);
    1209             :         if (NISERR())
    1210             :             return;             /* might have failed to sort */
    1211             :         oa = oldState->outs;
    1212             :         na = newState->outs;
    1213             :         while (oa != NULL && na != NULL)
    1214             :         {
    1215             :             struct arc *a = oa;
    1216             : 
    1217             :             switch (sortouts_cmp(&oa, &na))
    1218             :             {
    1219             :                 case -1:
    1220             :                     /* newState does not have anything matching oa */
    1221             :                     oa = oa->outchain;
    1222             :                     createarc(nfa, a->type, a->co, newState, a->to);
    1223             :                     break;
    1224             :                 case 0:
    1225             :                     /* match, advance in both lists */
    1226             :                     oa = oa->outchain;
    1227             :                     na = na->outchain;
    1228             :                     break;
    1229             :                 case +1:
    1230             :                     /* advance only na; oa might have a match later */
    1231             :                     na = na->outchain;
    1232             :                     break;
    1233             :                 default:
    1234             :                     assert(NOTREACHED);
    1235             :             }
    1236             :         }
    1237             :         while (oa != NULL)
    1238             :         {
    1239             :             /* newState does not have anything matching oa */
    1240             :             struct arc *a = oa;
    1241             : 
    1242             :             oa = oa->outchain;
    1243             :             createarc(nfa, a->type, a->co, newState, a->to);
    1244             :         }
    1245             :     }
    1246             : #endif                          /* NOT_USED */
    1247       14342 : }
    1248             : 
    1249             : /*
    1250             :  * cloneouts - copy out arcs of a state to another state pair, modifying type
    1251             :  *
    1252             :  * This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
    1253             :  * the same interpretation of "co".  It wouldn't be sensible with LACONs.
    1254             :  */
    1255             : static void
    1256         350 : cloneouts(struct nfa *nfa,
    1257             :           struct state *old,
    1258             :           struct state *from,
    1259             :           struct state *to,
    1260             :           int type)
    1261             : {
    1262             :     struct arc *a;
    1263             : 
    1264             :     assert(old != from);
    1265             :     assert(type == AHEAD || type == BEHIND);
    1266             : 
    1267        1306 :     for (a = old->outs; a != NULL; a = a->outchain)
    1268             :     {
    1269             :         assert(a->type == PLAIN);
    1270         956 :         newarc(nfa, type, a->co, from, to);
    1271             :     }
    1272         350 : }
    1273             : 
    1274             : /*
    1275             :  * delsub - delete a sub-NFA, updating subre pointers if necessary
    1276             :  *
    1277             :  * This uses a recursive traversal of the sub-NFA, marking already-seen
    1278             :  * states using their tmp pointer.
    1279             :  */
    1280             : static void
    1281       10094 : delsub(struct nfa *nfa,
    1282             :        struct state *lp,        /* the sub-NFA goes from here... */
    1283             :        struct state *rp)        /* ...to here, *not* inclusive */
    1284             : {
    1285             :     assert(lp != rp);
    1286             : 
    1287       10094 :     rp->tmp = rp;                /* mark end */
    1288             : 
    1289       10094 :     deltraverse(nfa, lp, lp);
    1290       10094 :     if (NISERR())
    1291           0 :         return;                 /* asserts might not hold after failure */
    1292             :     assert(lp->nouts == 0 && rp->nins == 0);  /* did the job */
    1293             :     assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
    1294             : 
    1295       10094 :     rp->tmp = NULL;              /* unmark end */
    1296       10094 :     lp->tmp = NULL;              /* and begin, marked by deltraverse */
    1297             : }
    1298             : 
    1299             : /*
    1300             :  * deltraverse - the recursive heart of delsub
    1301             :  * This routine's basic job is to destroy all out-arcs of the state.
    1302             :  */
    1303             : static void
    1304       29626 : deltraverse(struct nfa *nfa,
    1305             :             struct state *leftend,
    1306             :             struct state *s)
    1307             : {
    1308             :     struct arc *a;
    1309             :     struct state *to;
    1310             : 
    1311             :     /* Since this is recursive, it could be driven to stack overflow */
    1312       29626 :     if (STACK_TOO_DEEP(nfa->v->re))
    1313             :     {
    1314           0 :         NERR(REG_ETOOBIG);
    1315           0 :         return;
    1316             :     }
    1317             : 
    1318       29626 :     if (s->nouts == 0)
    1319         202 :         return;                 /* nothing to do */
    1320       29424 :     if (s->tmp != NULL)
    1321        9922 :         return;                 /* already in progress */
    1322             : 
    1323       19502 :     s->tmp = s;                  /* mark as in progress */
    1324             : 
    1325       39034 :     while ((a = s->outs) != NULL)
    1326             :     {
    1327       19532 :         to = a->to;
    1328       19532 :         deltraverse(nfa, leftend, to);
    1329       19532 :         if (NISERR())
    1330           0 :             return;             /* asserts might not hold after failure */
    1331             :         assert(to->nouts == 0 || to->tmp != NULL);
    1332       19532 :         freearc(nfa, a);
    1333       19532 :         if (to->nins == 0 && to->tmp == NULL)
    1334             :         {
    1335             :             assert(to->nouts == 0);
    1336        9408 :             freestate(nfa, to);
    1337             :         }
    1338             :     }
    1339             : 
    1340             :     assert(s->no != FREESTATE); /* we're still here */
    1341             :     assert(s == leftend || s->nins != 0);    /* and still reachable */
    1342             :     assert(s->nouts == 0);       /* but have no outarcs */
    1343             : 
    1344       19502 :     s->tmp = NULL;               /* we're done here */
    1345             : }
    1346             : 
    1347             : /*
    1348             :  * dupnfa - duplicate sub-NFA
    1349             :  *
    1350             :  * Another recursive traversal, this time using tmp to point to duplicates
    1351             :  * as well as mark already-seen states.  (You knew there was a reason why
    1352             :  * it's a state pointer, didn't you? :-))
    1353             :  */
    1354             : static void
    1355       14898 : dupnfa(struct nfa *nfa,
    1356             :        struct state *start,     /* duplicate of subNFA starting here */
    1357             :        struct state *stop,      /* and stopping here */
    1358             :        struct state *from,      /* stringing duplicate from here */
    1359             :        struct state *to)        /* to here */
    1360             : {
    1361       14898 :     if (start == stop)
    1362             :     {
    1363           0 :         newarc(nfa, EMPTY, 0, from, to);
    1364           0 :         return;
    1365             :     }
    1366             : 
    1367       14898 :     stop->tmp = to;
    1368       14898 :     duptraverse(nfa, start, from);
    1369             :     /* done, except for clearing out the tmp pointers */
    1370             : 
    1371       14898 :     stop->tmp = NULL;
    1372       14898 :     cleartraverse(nfa, start);
    1373             : }
    1374             : 
    1375             : /*
    1376             :  * duptraverse - recursive heart of dupnfa
    1377             :  */
    1378             : static void
    1379      381534 : duptraverse(struct nfa *nfa,
    1380             :             struct state *s,
    1381             :             struct state *stmp) /* s's duplicate, or NULL */
    1382             : {
    1383             :     struct arc *a;
    1384             : 
    1385             :     /* Since this is recursive, it could be driven to stack overflow */
    1386      381534 :     if (STACK_TOO_DEEP(nfa->v->re))
    1387             :     {
    1388           0 :         NERR(REG_ETOOBIG);
    1389           0 :         return;
    1390             :     }
    1391             : 
    1392      381534 :     if (s->tmp != NULL)
    1393      118844 :         return;                 /* already done */
    1394             : 
    1395      262690 :     s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
    1396      262690 :     if (s->tmp == NULL)
    1397             :     {
    1398             :         assert(NISERR());
    1399           0 :         return;
    1400             :     }
    1401             : 
    1402      629326 :     for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
    1403             :     {
    1404      366636 :         duptraverse(nfa, a->to, (struct state *) NULL);
    1405      366636 :         if (NISERR())
    1406           0 :             break;
    1407             :         assert(a->to->tmp != NULL);
    1408      366636 :         cparc(nfa, a, s->tmp, a->to->tmp);
    1409             :     }
    1410             : }
    1411             : 
    1412             : /*
    1413             :  * removeconstraints - remove any constraints in an NFA
    1414             :  *
    1415             :  * Constraint arcs are replaced by empty arcs, essentially treating all
    1416             :  * constraints as automatically satisfied.
    1417             :  */
    1418             : static void
    1419         202 : removeconstraints(struct nfa *nfa,
    1420             :                   struct state *start,  /* process subNFA starting here */
    1421             :                   struct state *stop)   /* and stopping here */
    1422             : {
    1423         202 :     if (start == stop)
    1424           0 :         return;
    1425             : 
    1426         202 :     stop->tmp = stop;
    1427         202 :     removetraverse(nfa, start);
    1428             :     /* done, except for clearing out the tmp pointers */
    1429             : 
    1430         202 :     stop->tmp = NULL;
    1431         202 :     cleartraverse(nfa, start);
    1432             : }
    1433             : 
    1434             : /*
    1435             :  * removetraverse - recursive heart of removeconstraints
    1436             :  */
    1437             : static void
    1438         562 : removetraverse(struct nfa *nfa,
    1439             :                struct state *s)
    1440             : {
    1441             :     struct arc *a;
    1442             :     struct arc *oa;
    1443             : 
    1444             :     /* Since this is recursive, it could be driven to stack overflow */
    1445         562 :     if (STACK_TOO_DEEP(nfa->v->re))
    1446             :     {
    1447           0 :         NERR(REG_ETOOBIG);
    1448           0 :         return;
    1449             :     }
    1450             : 
    1451         562 :     if (s->tmp != NULL)
    1452         252 :         return;                 /* already done */
    1453             : 
    1454         310 :     s->tmp = s;
    1455         670 :     for (a = s->outs; a != NULL && !NISERR(); a = oa)
    1456             :     {
    1457         360 :         removetraverse(nfa, a->to);
    1458         360 :         if (NISERR())
    1459           0 :             break;
    1460         360 :         oa = a->outchain;
    1461         360 :         switch (a->type)
    1462             :         {
    1463         346 :             case PLAIN:
    1464             :             case EMPTY:
    1465             :             case CANTMATCH:
    1466             :                 /* nothing to do */
    1467         346 :                 break;
    1468          14 :             case AHEAD:
    1469             :             case BEHIND:
    1470             :             case '^':
    1471             :             case '$':
    1472             :             case LACON:
    1473             :                 /* replace it */
    1474          14 :                 newarc(nfa, EMPTY, 0, s, a->to);
    1475          14 :                 freearc(nfa, a);
    1476          14 :                 break;
    1477           0 :             default:
    1478           0 :                 NERR(REG_ASSERT);
    1479           0 :                 break;
    1480             :         }
    1481             :     }
    1482             : }
    1483             : 
    1484             : /*
    1485             :  * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
    1486             :  */
    1487             : static void
    1488     2168238 : cleartraverse(struct nfa *nfa,
    1489             :               struct state *s)
    1490             : {
    1491             :     struct arc *a;
    1492             : 
    1493             :     /* Since this is recursive, it could be driven to stack overflow */
    1494     2168238 :     if (STACK_TOO_DEEP(nfa->v->re))
    1495             :     {
    1496           0 :         NERR(REG_ETOOBIG);
    1497           0 :         return;
    1498             :     }
    1499             : 
    1500     2168238 :     if (s->tmp == NULL)
    1501     1239032 :         return;
    1502      929206 :     s->tmp = NULL;
    1503             : 
    1504     3043234 :     for (a = s->outs; a != NULL; a = a->outchain)
    1505     2114028 :         cleartraverse(nfa, a->to);
    1506             : }
    1507             : 
    1508             : /*
    1509             :  * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
    1510             :  *
    1511             :  * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
    1512             :  * of a set of colors), return a state whose outarc list contains only PLAIN
    1513             :  * arcs of those color(s).  Otherwise return NULL.
    1514             :  *
    1515             :  * This is used before optimizing the NFA, so there may be EMPTY arcs, which
    1516             :  * we should ignore; the possibility of an EMPTY is why the result state could
    1517             :  * be different from s1.
    1518             :  *
    1519             :  * It's worth troubling to handle multiple parallel PLAIN arcs here because a
    1520             :  * bracket construct such as [abc] might yield either one or several parallel
    1521             :  * PLAIN arcs depending on earlier atoms in the expression.  We'd rather that
    1522             :  * that implementation detail not create user-visible performance differences.
    1523             :  */
    1524             : static struct state *
    1525         254 : single_color_transition(struct state *s1, struct state *s2)
    1526             : {
    1527             :     struct arc *a;
    1528             : 
    1529             :     /* Ignore leading EMPTY arc, if any */
    1530         254 :     if (s1->nouts == 1 && s1->outs->type == EMPTY)
    1531         254 :         s1 = s1->outs->to;
    1532             :     /* Likewise for any trailing EMPTY arc */
    1533         254 :     if (s2->nins == 1 && s2->ins->type == EMPTY)
    1534         254 :         s2 = s2->ins->from;
    1535             :     /* Perhaps we could have a single-state loop in between, if so reject */
    1536         254 :     if (s1 == s2)
    1537           0 :         return NULL;
    1538             :     /* s1 must have at least one outarc... */
    1539         254 :     if (s1->outs == NULL)
    1540           0 :         return NULL;
    1541             :     /* ... and they must all be PLAIN arcs to s2 */
    1542         436 :     for (a = s1->outs; a != NULL; a = a->outchain)
    1543             :     {
    1544         268 :         if (a->type != PLAIN || a->to != s2)
    1545          86 :             return NULL;
    1546             :     }
    1547             :     /* OK, return s1 as the possessor of the relevant outarcs */
    1548         168 :     return s1;
    1549             : }
    1550             : 
    1551             : /*
    1552             :  * specialcolors - fill in special colors for an NFA
    1553             :  */
    1554             : static void
    1555       19564 : specialcolors(struct nfa *nfa)
    1556             : {
    1557             :     /* false colors for BOS, BOL, EOS, EOL */
    1558       19564 :     if (nfa->parent == NULL)
    1559             :     {
    1560        7860 :         nfa->bos[0] = pseudocolor(nfa->cm);
    1561        7860 :         nfa->bos[1] = pseudocolor(nfa->cm);
    1562        7860 :         nfa->eos[0] = pseudocolor(nfa->cm);
    1563        7860 :         nfa->eos[1] = pseudocolor(nfa->cm);
    1564             :     }
    1565             :     else
    1566             :     {
    1567             :         assert(nfa->parent->bos[0] != COLORLESS);
    1568       11704 :         nfa->bos[0] = nfa->parent->bos[0];
    1569             :         assert(nfa->parent->bos[1] != COLORLESS);
    1570       11704 :         nfa->bos[1] = nfa->parent->bos[1];
    1571             :         assert(nfa->parent->eos[0] != COLORLESS);
    1572       11704 :         nfa->eos[0] = nfa->parent->eos[0];
    1573             :         assert(nfa->parent->eos[1] != COLORLESS);
    1574       11704 :         nfa->eos[1] = nfa->parent->eos[1];
    1575             :     }
    1576       19564 : }
    1577             : 
    1578             : /*
    1579             :  * optimize - optimize an NFA
    1580             :  *
    1581             :  * The main goal of this function is not so much "optimization" (though it
    1582             :  * does try to get rid of useless NFA states) as reducing the NFA to a form
    1583             :  * the regex executor can handle.  The executor, and indeed the cNFA format
    1584             :  * that is its input, can only handle PLAIN and LACON arcs.  The output of
    1585             :  * the regex parser also includes EMPTY (do-nothing) arcs, as well as
    1586             :  * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
    1587             :  * We first get rid of EMPTY arcs and then deal with the constraint arcs.
    1588             :  * The hardest part of either job is to get rid of circular loops of the
    1589             :  * target arc type.  We would have to do that in any case, though, as such a
    1590             :  * loop would otherwise allow the executor to cycle through the loop endlessly
    1591             :  * without making any progress in the input string.
    1592             :  */
    1593             : static long                     /* re_info bits */
    1594       19558 : optimize(struct nfa *nfa,
    1595             :          FILE *f)               /* for debug output; NULL none */
    1596             : {
    1597             : #ifdef REG_DEBUG
    1598             :     int         verbose = (f != NULL) ? 1 : 0;
    1599             : 
    1600             :     if (verbose)
    1601             :         fprintf(f, "\ninitial cleanup:\n");
    1602             : #endif
    1603             :     /* If we have any CANTMATCH arcs, drop them; but this is uncommon */
    1604       19558 :     if (nfa->flags & HASCANTMATCH)
    1605             :     {
    1606          12 :         removecantmatch(nfa);
    1607          12 :         nfa->flags &= ~HASCANTMATCH;
    1608             :     }
    1609       19558 :     cleanup(nfa);               /* may simplify situation */
    1610             : #ifdef REG_DEBUG
    1611             :     if (verbose)
    1612             :         dumpnfa(nfa, f);
    1613             :     if (verbose)
    1614             :         fprintf(f, "\nempties:\n");
    1615             : #endif
    1616       19558 :     fixempties(nfa, f);         /* get rid of EMPTY arcs */
    1617             : #ifdef REG_DEBUG
    1618             :     if (verbose)
    1619             :         fprintf(f, "\nconstraints:\n");
    1620             : #endif
    1621       19558 :     fixconstraintloops(nfa, f); /* get rid of constraint loops */
    1622       19558 :     pullback(nfa, f);           /* pull back constraints backward */
    1623       19558 :     pushfwd(nfa, f);            /* push fwd constraints forward */
    1624             : #ifdef REG_DEBUG
    1625             :     if (verbose)
    1626             :         fprintf(f, "\nfinal cleanup:\n");
    1627             : #endif
    1628       19558 :     cleanup(nfa);               /* final tidying */
    1629             : #ifdef REG_DEBUG
    1630             :     if (verbose)
    1631             :         dumpnfa(nfa, f);
    1632             : #endif
    1633       19558 :     return analyze(nfa);        /* and analysis */
    1634             : }
    1635             : 
    1636             : /*
    1637             :  * pullback - pull back constraints backward to eliminate them
    1638             :  */
    1639             : static void
    1640       19558 : pullback(struct nfa *nfa,
    1641             :          FILE *f)               /* for debug output; NULL none */
    1642             : {
    1643             :     struct state *s;
    1644             :     struct state *nexts;
    1645             :     struct arc *a;
    1646             :     struct arc *nexta;
    1647             :     struct state *intermediates;
    1648             :     int         progress;
    1649             : 
    1650             :     /* find and pull until there are no more */
    1651             :     do
    1652             :     {
    1653       30914 :         progress = 0;
    1654      475936 :         for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    1655             :         {
    1656      445022 :             nexts = s->next;
    1657      445022 :             intermediates = NULL;
    1658     1924420 :             for (a = s->outs; a != NULL && !NISERR(); a = nexta)
    1659             :             {
    1660     1479398 :                 nexta = a->outchain;
    1661     1479398 :                 if (a->type == '^' || a->type == BEHIND)
    1662      100794 :                     if (pull(nfa, a, &intermediates))
    1663       38710 :                         progress = 1;
    1664             :             }
    1665             :             /* clear tmp fields of intermediate states created here */
    1666      447482 :             while (intermediates != NULL)
    1667             :             {
    1668        2460 :                 struct state *ns = intermediates->tmp;
    1669             : 
    1670        2460 :                 intermediates->tmp = NULL;
    1671        2460 :                 intermediates = ns;
    1672             :             }
    1673             :             /* if s is now useless, get rid of it */
    1674      445022 :             if ((s->nins == 0 || s->nouts == 0) && !s->flag)
    1675       33502 :                 dropstate(nfa, s);
    1676             :         }
    1677       30914 :         if (progress && f != NULL)
    1678           0 :             dumpnfa(nfa, f);
    1679       30914 :     } while (progress && !NISERR());
    1680       19558 :     if (NISERR())
    1681           6 :         return;
    1682             : 
    1683             :     /*
    1684             :      * Any ^ constraints we were able to pull to the start state can now be
    1685             :      * replaced by PLAIN arcs referencing the BOS or BOL colors.  There should
    1686             :      * be no other ^ or BEHIND arcs left in the NFA, though we do not check
    1687             :      * that here (compact() will fail if so).
    1688             :      */
    1689       73348 :     for (a = nfa->pre->outs; a != NULL; a = nexta)
    1690             :     {
    1691       53796 :         nexta = a->outchain;
    1692       53796 :         if (a->type == '^')
    1693             :         {
    1694             :             assert(a->co == 0 || a->co == 1);
    1695       38748 :             newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
    1696       38748 :             freearc(nfa, a);
    1697             :         }
    1698             :     }
    1699             : }
    1700             : 
    1701             : /*
    1702             :  * pull - pull a back constraint backward past its source state
    1703             :  *
    1704             :  * Returns 1 if successful (which it always is unless the source is the
    1705             :  * start state or we have an internal error), 0 if nothing happened.
    1706             :  *
    1707             :  * A significant property of this function is that it deletes no pre-existing
    1708             :  * states, and no outarcs of the constraint's from state other than the given
    1709             :  * constraint arc.  This makes the loops in pullback() safe, at the cost that
    1710             :  * we may leave useless states behind.  Therefore, we leave it to pullback()
    1711             :  * to delete such states.
    1712             :  *
    1713             :  * If the from state has multiple back-constraint outarcs, and/or multiple
    1714             :  * compatible constraint inarcs, we only need to create one new intermediate
    1715             :  * state per combination of predecessor and successor states.  *intermediates
    1716             :  * points to a list of such intermediate states for this from state (chained
    1717             :  * through their tmp fields).
    1718             :  */
    1719             : static int
    1720      100794 : pull(struct nfa *nfa,
    1721             :      struct arc *con,
    1722             :      struct state **intermediates)
    1723             : {
    1724      100794 :     struct state *from = con->from;
    1725      100794 :     struct state *to = con->to;
    1726             :     struct arc *a;
    1727             :     struct arc *nexta;
    1728             :     struct state *s;
    1729             : 
    1730             :     assert(from != to);         /* should have gotten rid of this earlier */
    1731      100794 :     if (from->flag)              /* can't pull back beyond start */
    1732       62084 :         return 0;
    1733       38710 :     if (from->nins == 0)
    1734             :     {                           /* unreachable */
    1735        7304 :         freearc(nfa, con);
    1736        7304 :         return 1;
    1737             :     }
    1738             : 
    1739             :     /*
    1740             :      * First, clone from state if necessary to avoid other outarcs.  This may
    1741             :      * seem wasteful, but it simplifies the logic, and we'll get rid of the
    1742             :      * clone state again at the bottom.
    1743             :      */
    1744       31406 :     if (from->nouts > 1)
    1745             :     {
    1746       18950 :         s = newstate(nfa);
    1747       18950 :         if (NISERR())
    1748           0 :             return 0;
    1749       18950 :         copyins(nfa, from, s);  /* duplicate inarcs */
    1750       18950 :         cparc(nfa, con, s, to); /* move constraint arc */
    1751       18950 :         freearc(nfa, con);
    1752       18950 :         if (NISERR())
    1753           0 :             return 0;
    1754       18950 :         from = s;
    1755       18950 :         con = from->outs;
    1756             :     }
    1757             :     assert(from->nouts == 1);
    1758             : 
    1759             :     /* propagate the constraint into the from state's inarcs */
    1760      318204 :     for (a = from->ins; a != NULL && !NISERR(); a = nexta)
    1761             :     {
    1762      286798 :         nexta = a->inchain;
    1763      286798 :         switch (combine(nfa, con, a))
    1764             :         {
    1765       87740 :             case INCOMPATIBLE:  /* destroy the arc */
    1766       87740 :                 freearc(nfa, a);
    1767       87740 :                 break;
    1768       17020 :             case SATISFIED:     /* no action needed */
    1769       17020 :                 break;
    1770      180334 :             case COMPATIBLE:    /* swap the two arcs, more or less */
    1771             :                 /* need an intermediate state, but might have one already */
    1772      203920 :                 for (s = *intermediates; s != NULL; s = s->tmp)
    1773             :                 {
    1774             :                     assert(s->nins > 0 && s->nouts > 0);
    1775      201460 :                     if (s->ins->from == a->from && s->outs->to == to)
    1776      177874 :                         break;
    1777             :                 }
    1778      180334 :                 if (s == NULL)
    1779             :                 {
    1780        2460 :                     s = newstate(nfa);
    1781        2460 :                     if (NISERR())
    1782           0 :                         return 0;
    1783        2460 :                     s->tmp = *intermediates;
    1784        2460 :                     *intermediates = s;
    1785             :                 }
    1786      180334 :                 cparc(nfa, con, a->from, s);
    1787      180334 :                 cparc(nfa, a, s, to);
    1788      180334 :                 freearc(nfa, a);
    1789      180334 :                 break;
    1790        1704 :             case REPLACEARC:    /* replace arc's color */
    1791        1704 :                 newarc(nfa, a->type, con->co, a->from, to);
    1792        1704 :                 freearc(nfa, a);
    1793        1704 :                 break;
    1794           0 :             default:
    1795             :                 assert(NOTREACHED);
    1796           0 :                 break;
    1797             :         }
    1798             :     }
    1799             : 
    1800             :     /* remaining inarcs, if any, incorporate the constraint */
    1801       31406 :     moveins(nfa, from, to);
    1802       31406 :     freearc(nfa, con);
    1803             :     /* from state is now useless, but we leave it to pullback() to clean up */
    1804       31406 :     return 1;
    1805             : }
    1806             : 
    1807             : /*
    1808             :  * pushfwd - push forward constraints forward to eliminate them
    1809             :  */
    1810             : static void
    1811       19558 : pushfwd(struct nfa *nfa,
    1812             :         FILE *f)                /* for debug output; NULL none */
    1813             : {
    1814             :     struct state *s;
    1815             :     struct state *nexts;
    1816             :     struct arc *a;
    1817             :     struct arc *nexta;
    1818             :     struct state *intermediates;
    1819             :     int         progress;
    1820             : 
    1821             :     /* find and push until there are no more */
    1822             :     do
    1823             :     {
    1824       29272 :         progress = 0;
    1825      410826 :         for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    1826             :         {
    1827      381554 :             nexts = s->next;
    1828      381554 :             intermediates = NULL;
    1829     1701606 :             for (a = s->ins; a != NULL && !NISERR(); a = nexta)
    1830             :             {
    1831     1320052 :                 nexta = a->inchain;
    1832     1320052 :                 if (a->type == '$' || a->type == AHEAD)
    1833       72490 :                     if (push(nfa, a, &intermediates))
    1834       22386 :                         progress = 1;
    1835             :             }
    1836             :             /* clear tmp fields of intermediate states created here */
    1837      381558 :             while (intermediates != NULL)
    1838             :             {
    1839           4 :                 struct state *ns = intermediates->tmp;
    1840             : 
    1841           4 :                 intermediates->tmp = NULL;
    1842           4 :                 intermediates = ns;
    1843             :             }
    1844             :             /* if s is now useless, get rid of it */
    1845      381554 :             if ((s->nins == 0 || s->nouts == 0) && !s->flag)
    1846       22946 :                 dropstate(nfa, s);
    1847             :         }
    1848       29272 :         if (progress && f != NULL)
    1849           0 :             dumpnfa(nfa, f);
    1850       29272 :     } while (progress && !NISERR());
    1851       19558 :     if (NISERR())
    1852           6 :         return;
    1853             : 
    1854             :     /*
    1855             :      * Any $ constraints we were able to push to the post state can now be
    1856             :      * replaced by PLAIN arcs referencing the EOS or EOL colors.  There should
    1857             :      * be no other $ or AHEAD arcs left in the NFA, though we do not check
    1858             :      * that here (compact() will fail if so).
    1859             :      */
    1860       61402 :     for (a = nfa->post->ins; a != NULL; a = nexta)
    1861             :     {
    1862       41850 :         nexta = a->inchain;
    1863       41850 :         if (a->type == '$')
    1864             :         {
    1865             :             assert(a->co == 0 || a->co == 1);
    1866       29900 :             newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
    1867       29900 :             freearc(nfa, a);
    1868             :         }
    1869             :     }
    1870             : }
    1871             : 
    1872             : /*
    1873             :  * push - push a forward constraint forward past its destination state
    1874             :  *
    1875             :  * Returns 1 if successful (which it always is unless the destination is the
    1876             :  * post state or we have an internal error), 0 if nothing happened.
    1877             :  *
    1878             :  * A significant property of this function is that it deletes no pre-existing
    1879             :  * states, and no inarcs of the constraint's to state other than the given
    1880             :  * constraint arc.  This makes the loops in pushfwd() safe, at the cost that
    1881             :  * we may leave useless states behind.  Therefore, we leave it to pushfwd()
    1882             :  * to delete such states.
    1883             :  *
    1884             :  * If the to state has multiple forward-constraint inarcs, and/or multiple
    1885             :  * compatible constraint outarcs, we only need to create one new intermediate
    1886             :  * state per combination of predecessor and successor states.  *intermediates
    1887             :  * points to a list of such intermediate states for this to state (chained
    1888             :  * through their tmp fields).
    1889             :  */
    1890             : static int
    1891       72490 : push(struct nfa *nfa,
    1892             :      struct arc *con,
    1893             :      struct state **intermediates)
    1894             : {
    1895       72490 :     struct state *from = con->from;
    1896       72490 :     struct state *to = con->to;
    1897             :     struct arc *a;
    1898             :     struct arc *nexta;
    1899             :     struct state *s;
    1900             : 
    1901             :     assert(to != from);         /* should have gotten rid of this earlier */
    1902       72490 :     if (to->flag)                /* can't push forward beyond end */
    1903       50104 :         return 0;
    1904       22386 :     if (to->nouts == 0)
    1905             :     {                           /* dead end */
    1906         820 :         freearc(nfa, con);
    1907         820 :         return 1;
    1908             :     }
    1909             : 
    1910             :     /*
    1911             :      * First, clone to state if necessary to avoid other inarcs.  This may
    1912             :      * seem wasteful, but it simplifies the logic, and we'll get rid of the
    1913             :      * clone state again at the bottom.
    1914             :      */
    1915       21566 :     if (to->nins > 1)
    1916             :     {
    1917       11370 :         s = newstate(nfa);
    1918       11370 :         if (NISERR())
    1919           0 :             return 0;
    1920       11370 :         copyouts(nfa, to, s);   /* duplicate outarcs */
    1921       11370 :         cparc(nfa, con, from, s);   /* move constraint arc */
    1922       11370 :         freearc(nfa, con);
    1923       11370 :         if (NISERR())
    1924           0 :             return 0;
    1925       11370 :         to = s;
    1926       11370 :         con = to->ins;
    1927             :     }
    1928             :     assert(to->nins == 1);
    1929             : 
    1930             :     /* propagate the constraint into the to state's outarcs */
    1931      130612 :     for (a = to->outs; a != NULL && !NISERR(); a = nexta)
    1932             :     {
    1933      109046 :         nexta = a->outchain;
    1934      109046 :         switch (combine(nfa, con, a))
    1935             :         {
    1936       91928 :             case INCOMPATIBLE:  /* destroy the arc */
    1937       91928 :                 freearc(nfa, a);
    1938       91928 :                 break;
    1939       14544 :             case SATISFIED:     /* no action needed */
    1940       14544 :                 break;
    1941          16 :             case COMPATIBLE:    /* swap the two arcs, more or less */
    1942             :                 /* need an intermediate state, but might have one already */
    1943          16 :                 for (s = *intermediates; s != NULL; s = s->tmp)
    1944             :                 {
    1945             :                     assert(s->nins > 0 && s->nouts > 0);
    1946          12 :                     if (s->ins->from == from && s->outs->to == a->to)
    1947          12 :                         break;
    1948             :                 }
    1949          16 :                 if (s == NULL)
    1950             :                 {
    1951           4 :                     s = newstate(nfa);
    1952           4 :                     if (NISERR())
    1953           0 :                         return 0;
    1954           4 :                     s->tmp = *intermediates;
    1955           4 :                     *intermediates = s;
    1956             :                 }
    1957          16 :                 cparc(nfa, con, s, a->to);
    1958          16 :                 cparc(nfa, a, from, s);
    1959          16 :                 freearc(nfa, a);
    1960          16 :                 break;
    1961        2558 :             case REPLACEARC:    /* replace arc's color */
    1962        2558 :                 newarc(nfa, a->type, con->co, from, a->to);
    1963        2558 :                 freearc(nfa, a);
    1964        2558 :                 break;
    1965           0 :             default:
    1966             :                 assert(NOTREACHED);
    1967           0 :                 break;
    1968             :         }
    1969             :     }
    1970             : 
    1971             :     /* remaining outarcs, if any, incorporate the constraint */
    1972       21566 :     moveouts(nfa, to, from);
    1973       21566 :     freearc(nfa, con);
    1974             :     /* to state is now useless, but we leave it to pushfwd() to clean up */
    1975       21566 :     return 1;
    1976             : }
    1977             : 
    1978             : /*
    1979             :  * combine - constraint lands on an arc, what happens?
    1980             :  *
    1981             :  * #def INCOMPATIBLE    1   // destroys arc
    1982             :  * #def SATISFIED       2   // constraint satisfied
    1983             :  * #def COMPATIBLE      3   // compatible but not satisfied yet
    1984             :  * #def REPLACEARC      4   // replace arc's color with constraint color
    1985             :  */
    1986             : static int
    1987      395844 : combine(struct nfa *nfa,
    1988             :         struct arc *con,
    1989             :         struct arc *a)
    1990             : {
    1991             : #define  CA(ct,at)   (((ct)<<CHAR_BIT) | (at))
    1992             : 
    1993      395844 :     switch (CA(con->type, a->type))
    1994             :     {
    1995       89764 :         case CA('^', PLAIN):    /* newlines are handled separately */
    1996             :         case CA('$', PLAIN):
    1997       89764 :             return INCOMPATIBLE;
    1998             :             break;
    1999       37164 :         case CA(AHEAD, PLAIN):  /* color constraints meet colors */
    2000             :         case CA(BEHIND, PLAIN):
    2001       37164 :             if (con->co == a->co)
    2002        1654 :                 return SATISFIED;
    2003       35510 :             if (con->co == RAINBOW)
    2004             :             {
    2005             :                 /* con is satisfied unless arc's color is a pseudocolor */
    2006           4 :                 if (!(nfa->cm->cd[a->co].flags & PSEUDO))
    2007           4 :                     return SATISFIED;
    2008             :             }
    2009       35506 :             else if (a->co == RAINBOW)
    2010             :             {
    2011             :                 /* con is incompatible if it's for a pseudocolor */
    2012             :                 /* (this is hypothetical; we make no such constraints today) */
    2013        4254 :                 if (nfa->cm->cd[con->co].flags & PSEUDO)
    2014           0 :                     return INCOMPATIBLE;
    2015             :                 /* otherwise, constraint constrains arc to be only its color */
    2016        4254 :                 return REPLACEARC;
    2017             :             }
    2018       31252 :             return INCOMPATIBLE;
    2019             :             break;
    2020       51158 :         case CA('^', '^'):      /* collision, similar constraints */
    2021             :         case CA('$', '$'):
    2022       51158 :             if (con->co == a->co) /* true duplication */
    2023       28984 :                 return SATISFIED;
    2024       22174 :             return INCOMPATIBLE;
    2025             :             break;
    2026       24620 :         case CA(AHEAD, AHEAD):  /* collision, similar constraints */
    2027             :         case CA(BEHIND, BEHIND):
    2028       24620 :             if (con->co == a->co) /* true duplication */
    2029         918 :                 return SATISFIED;
    2030       23702 :             if (con->co == RAINBOW)
    2031             :             {
    2032             :                 /* con is satisfied unless arc's color is a pseudocolor */
    2033           4 :                 if (!(nfa->cm->cd[a->co].flags & PSEUDO))
    2034           4 :                     return SATISFIED;
    2035             :             }
    2036       23698 :             else if (a->co == RAINBOW)
    2037             :             {
    2038             :                 /* con is incompatible if it's for a pseudocolor */
    2039             :                 /* (this is hypothetical; we make no such constraints today) */
    2040           8 :                 if (nfa->cm->cd[con->co].flags & PSEUDO)
    2041           0 :                     return INCOMPATIBLE;
    2042             :                 /* otherwise, constraint constrains arc to be only its color */
    2043           8 :                 return REPLACEARC;
    2044             :             }
    2045       23690 :             return INCOMPATIBLE;
    2046             :             break;
    2047       12788 :         case CA('^', BEHIND):   /* collision, dissimilar constraints */
    2048             :         case CA(BEHIND, '^'):
    2049             :         case CA('$', AHEAD):
    2050             :         case CA(AHEAD, '$'):
    2051       12788 :             return INCOMPATIBLE;
    2052             :             break;
    2053      180350 :         case CA('^', '$'):      /* constraints passing each other */
    2054             :         case CA('^', AHEAD):
    2055             :         case CA(BEHIND, '$'):
    2056             :         case CA(BEHIND, AHEAD):
    2057             :         case CA('$', '^'):
    2058             :         case CA('$', BEHIND):
    2059             :         case CA(AHEAD, '^'):
    2060             :         case CA(AHEAD, BEHIND):
    2061             :         case CA('^', LACON):
    2062             :         case CA(BEHIND, LACON):
    2063             :         case CA('$', LACON):
    2064             :         case CA(AHEAD, LACON):
    2065      180350 :             return COMPATIBLE;
    2066             :             break;
    2067             :     }
    2068             :     assert(NOTREACHED);
    2069           0 :     return INCOMPATIBLE;        /* for benefit of blind compilers */
    2070             : }
    2071             : 
    2072             : /*
    2073             :  * fixempties - get rid of EMPTY arcs
    2074             :  */
    2075             : static void
    2076       19558 : fixempties(struct nfa *nfa,
    2077             :            FILE *f)             /* for debug output; NULL none */
    2078             : {
    2079             :     struct state *s;
    2080             :     struct state *s2;
    2081             :     struct state *nexts;
    2082             :     struct arc *a;
    2083             :     struct arc *nexta;
    2084             :     int         totalinarcs;
    2085             :     struct arc **inarcsorig;
    2086             :     struct arc **arcarray;
    2087             :     int         arccount;
    2088             :     int         prevnins;
    2089             :     int         nskip;
    2090             : 
    2091             :     /*
    2092             :      * First, get rid of any states whose sole out-arc is an EMPTY, since
    2093             :      * they're basically just aliases for their successor.  The parsing
    2094             :      * algorithm creates enough of these that it's worth special-casing this.
    2095             :      */
    2096      450704 :     for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    2097             :     {
    2098      431146 :         nexts = s->next;
    2099      431146 :         if (s->flag || s->nouts != 1)
    2100      111588 :             continue;
    2101      319558 :         a = s->outs;
    2102             :         assert(a != NULL && a->outchain == NULL);
    2103      319558 :         if (a->type != EMPTY)
    2104      195452 :             continue;
    2105      124106 :         if (s != a->to)
    2106      124106 :             moveins(nfa, s, a->to);
    2107      124106 :         dropstate(nfa, s);
    2108             :     }
    2109             : 
    2110             :     /*
    2111             :      * Similarly, get rid of any state with a single EMPTY in-arc, by folding
    2112             :      * it into its predecessor.
    2113             :      */
    2114      326598 :     for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    2115             :     {
    2116      307040 :         nexts = s->next;
    2117             :         /* while we're at it, ensure tmp fields are clear for next step */
    2118             :         assert(s->tmp == NULL);
    2119      307040 :         if (s->flag || s->nins != 1)
    2120      105912 :             continue;
    2121      201128 :         a = s->ins;
    2122             :         assert(a != NULL && a->inchain == NULL);
    2123      201128 :         if (a->type != EMPTY)
    2124      177602 :             continue;
    2125       23526 :         if (s != a->from)
    2126       23526 :             moveouts(nfa, s, a->from);
    2127       23526 :         dropstate(nfa, s);
    2128             :     }
    2129             : 
    2130       19558 :     if (NISERR())
    2131           0 :         return;
    2132             : 
    2133             :     /*
    2134             :      * For each remaining NFA state, find all other states from which it is
    2135             :      * reachable by a chain of one or more EMPTY arcs.  Then generate new arcs
    2136             :      * that eliminate the need for each such chain.
    2137             :      *
    2138             :      * We could replace a chain of EMPTY arcs that leads from a "from" state
    2139             :      * to a "to" state either by pushing non-EMPTY arcs forward (linking
    2140             :      * directly from "from"'s predecessors to "to") or by pulling them back
    2141             :      * (linking directly from "from" to "to"'s successors).  We choose to
    2142             :      * always do the former; this choice is somewhat arbitrary, but the
    2143             :      * approach below requires that we uniformly do one or the other.
    2144             :      *
    2145             :      * Suppose we have a chain of N successive EMPTY arcs (where N can easily
    2146             :      * approach the size of the NFA).  All of the intermediate states must
    2147             :      * have additional inarcs and outarcs, else they'd have been removed by
    2148             :      * the steps above.  Assuming their inarcs are mostly not empties, we will
    2149             :      * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
    2150             :      * state in the chain must be duplicated to lead to all its successor
    2151             :      * states as well.  So there is no hope of doing less than O(N^2) work;
    2152             :      * however, we should endeavor to keep the big-O cost from being even
    2153             :      * worse than that, which it can easily become without care.  In
    2154             :      * particular, suppose we were to copy all S1's inarcs forward to S2, and
    2155             :      * then also to S3, and then later we consider pushing S2's inarcs forward
    2156             :      * to S3.  If we include the arcs already copied from S1 in that, we'd be
    2157             :      * doing O(N^3) work.  (The duplicate-arc elimination built into newarc()
    2158             :      * and its cohorts would get rid of the extra arcs, but not without cost.)
    2159             :      *
    2160             :      * We can avoid this cost by treating only arcs that existed at the start
    2161             :      * of this phase as candidates to be pushed forward.  To identify those,
    2162             :      * we remember the first inarc each state had to start with.  We rely on
    2163             :      * the fact that newarc() and friends put new arcs on the front of their
    2164             :      * to-states' inchains, and that this phase never deletes arcs, so that
    2165             :      * the original arcs must be the last arcs in their to-states' inchains.
    2166             :      *
    2167             :      * So the process here is that, for each state in the NFA, we gather up
    2168             :      * all non-EMPTY inarcs of states that can reach the target state via
    2169             :      * EMPTY arcs.  We then sort, de-duplicate, and merge these arcs into the
    2170             :      * target state's inchain.  (We can safely use sort-merge for this as long
    2171             :      * as we update each state's original-arcs pointer after we add arcs to
    2172             :      * it; the sort step of mergeins probably changed the order of the old
    2173             :      * arcs.)
    2174             :      *
    2175             :      * Another refinement worth making is that, because we only add non-EMPTY
    2176             :      * arcs during this phase, and all added arcs have the same from-state as
    2177             :      * the non-EMPTY arc they were cloned from, we know ahead of time that any
    2178             :      * states having only EMPTY outarcs will be useless for lack of outarcs
    2179             :      * after we drop the EMPTY arcs.  (They cannot gain non-EMPTY outarcs if
    2180             :      * they had none to start with.)  So we need not bother to update the
    2181             :      * inchains of such states at all.
    2182             :      */
    2183             : 
    2184             :     /* Remember the states' first original inarcs */
    2185             :     /* ... and while at it, count how many old inarcs there are altogether */
    2186       19558 :     inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
    2187       19558 :     if (inarcsorig == NULL)
    2188             :     {
    2189           0 :         NERR(REG_ESPACE);
    2190           0 :         return;
    2191             :     }
    2192       19558 :     totalinarcs = 0;
    2193      303072 :     for (s = nfa->states; s != NULL; s = s->next)
    2194             :     {
    2195      283514 :         inarcsorig[s->no] = s->ins;
    2196      283514 :         totalinarcs += s->nins;
    2197             :     }
    2198             : 
    2199             :     /*
    2200             :      * Create a workspace for accumulating the inarcs to be added to the
    2201             :      * current target state.  totalinarcs is probably a considerable
    2202             :      * overestimate of the space needed, but the NFA is unlikely to be large
    2203             :      * enough at this point to make it worth being smarter.
    2204             :      */
    2205       19558 :     arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
    2206       19558 :     if (arcarray == NULL)
    2207             :     {
    2208           0 :         NERR(REG_ESPACE);
    2209           0 :         FREE(inarcsorig);
    2210           0 :         return;
    2211             :     }
    2212             : 
    2213             :     /* And iterate over the target states */
    2214      298110 :     for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
    2215             :     {
    2216             :         /* Ignore target states without non-EMPTY outarcs, per note above */
    2217      278552 :         if (!s->flag && !hasnonemptyout(s))
    2218        3960 :             continue;
    2219             : 
    2220             :         /* Find predecessor states and accumulate their original inarcs */
    2221      274592 :         arccount = 0;
    2222    15261706 :         for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
    2223             :         {
    2224             :             /* Add s2's original inarcs to arcarray[], but ignore empties */
    2225    45003018 :             for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
    2226             :             {
    2227    30015904 :                 if (a->type != EMPTY)
    2228    15030890 :                     arcarray[arccount++] = a;
    2229             :             }
    2230             : 
    2231             :             /* Reset the tmp fields as we walk back */
    2232    14987114 :             nexts = s2->tmp;
    2233    14987114 :             s2->tmp = NULL;
    2234             :         }
    2235      274592 :         s->tmp = NULL;
    2236             :         assert(arccount <= totalinarcs);
    2237             : 
    2238             :         /* Remember how many original inarcs this state has */
    2239      274592 :         prevnins = s->nins;
    2240             : 
    2241             :         /* Add non-duplicate inarcs to target state */
    2242      274592 :         mergeins(nfa, s, arcarray, arccount);
    2243             : 
    2244             :         /* Now we must update the state's inarcsorig pointer */
    2245      274592 :         nskip = s->nins - prevnins;
    2246      274592 :         a = s->ins;
    2247    15292832 :         while (nskip-- > 0)
    2248    15018240 :             a = a->inchain;
    2249      274592 :         inarcsorig[s->no] = a;
    2250             :     }
    2251             : 
    2252       19558 :     FREE(arcarray);
    2253       19558 :     FREE(inarcsorig);
    2254             : 
    2255       19558 :     if (NISERR())
    2256           6 :         return;
    2257             : 
    2258             :     /*
    2259             :      * Now remove all the EMPTY arcs, since we don't need them anymore.
    2260             :      */
    2261      285054 :     for (s = nfa->states; s != NULL; s = s->next)
    2262             :     {
    2263     1520678 :         for (a = s->outs; a != NULL; a = nexta)
    2264             :         {
    2265     1255176 :             nexta = a->outchain;
    2266     1255176 :             if (a->type == EMPTY)
    2267       25482 :                 freearc(nfa, a);
    2268             :         }
    2269             :     }
    2270             : 
    2271             :     /*
    2272             :      * And remove any states that have become useless.  (This cleanup is not
    2273             :      * very thorough, and would be even less so if we tried to combine it with
    2274             :      * the previous step; but cleanup() will take care of anything we miss.)
    2275             :      */
    2276      285054 :     for (s = nfa->states; s != NULL; s = nexts)
    2277             :     {
    2278      265502 :         nexts = s->next;
    2279      265502 :         if ((s->nins == 0 || s->nouts == 0) && !s->flag)
    2280        3960 :             dropstate(nfa, s);
    2281             :     }
    2282             : 
    2283       19552 :     if (f != NULL)
    2284           0 :         dumpnfa(nfa, f);
    2285             : }
    2286             : 
    2287             : /*
    2288             :  * emptyreachable - recursively find all states that can reach s by EMPTY arcs
    2289             :  *
    2290             :  * The return value is the last such state found.  Its tmp field links back
    2291             :  * to the next-to-last such state, and so on back to s, so that all these
    2292             :  * states can be located without searching the whole NFA.
    2293             :  *
    2294             :  * Since this is only used in fixempties(), we pass in the inarcsorig[] array
    2295             :  * maintained by that function.  This lets us skip over all new inarcs, which
    2296             :  * are certainly not EMPTY arcs.
    2297             :  *
    2298             :  * The maximum recursion depth here is equal to the length of the longest
    2299             :  * loop-free chain of EMPTY arcs, which is surely no more than the size of
    2300             :  * the NFA ... but that could still be enough to cause trouble.
    2301             :  */
    2302             : static struct state *
    2303    15261706 : emptyreachable(struct nfa *nfa,
    2304             :                struct state *s,
    2305             :                struct state *lastfound,
    2306             :                struct arc **inarcsorig)
    2307             : {
    2308             :     struct arc *a;
    2309             : 
    2310             :     /* Since this is recursive, it could be driven to stack overflow */
    2311    15261706 :     if (STACK_TOO_DEEP(nfa->v->re))
    2312             :     {
    2313           0 :         NERR(REG_ETOOBIG);
    2314           0 :         return lastfound;
    2315             :     }
    2316             : 
    2317    15261706 :     s->tmp = lastfound;
    2318    15261706 :     lastfound = s;
    2319    45719534 :     for (a = inarcsorig[s->no]; a != NULL; a = a->inchain)
    2320             :     {
    2321    30457828 :         if (a->type == EMPTY && a->from->tmp == NULL)
    2322    14987114 :             lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
    2323             :     }
    2324    15261706 :     return lastfound;
    2325             : }
    2326             : 
    2327             : /*
    2328             :  * isconstraintarc - detect whether an arc is of a constraint type
    2329             :  */
    2330             : static inline int
    2331     2591798 : isconstraintarc(struct arc *a)
    2332             : {
    2333     2591798 :     switch (a->type)
    2334             :     {
    2335      330374 :         case '^':
    2336             :         case '$':
    2337             :         case BEHIND:
    2338             :         case AHEAD:
    2339             :         case LACON:
    2340      330374 :             return 1;
    2341             :     }
    2342     2261424 :     return 0;
    2343             : }
    2344             : 
    2345             : /*
    2346             :  * hasconstraintout - does state have a constraint out arc?
    2347             :  */
    2348             : static int
    2349       25262 : hasconstraintout(struct state *s)
    2350             : {
    2351             :     struct arc *a;
    2352             : 
    2353       47920 :     for (a = s->outs; a != NULL; a = a->outchain)
    2354             :     {
    2355       38444 :         if (isconstraintarc(a))
    2356       15786 :             return 1;
    2357             :     }
    2358        9476 :     return 0;
    2359             : }
    2360             : 
    2361             : /*
    2362             :  * fixconstraintloops - get rid of loops containing only constraint arcs
    2363             :  *
    2364             :  * A loop of states that contains only constraint arcs is useless, since
    2365             :  * passing around the loop represents no forward progress.  Moreover, it
    2366             :  * would cause infinite looping in pullback/pushfwd, so we need to get rid
    2367             :  * of such loops before doing that.
    2368             :  */
    2369             : static void
    2370       19558 : fixconstraintloops(struct nfa *nfa,
    2371             :                    FILE *f)     /* for debug output; NULL none */
    2372             : {
    2373             :     struct state *s;
    2374             :     struct state *nexts;
    2375             :     struct arc *a;
    2376             :     struct arc *nexta;
    2377             :     int         hasconstraints;
    2378             : 
    2379             :     /*
    2380             :      * In the trivial case of a state that loops to itself, we can just drop
    2381             :      * the constraint arc altogether.  This is worth special-casing because
    2382             :      * such loops are far more common than loops containing multiple states.
    2383             :      * While we're at it, note whether any constraint arcs survive.
    2384             :      */
    2385       19558 :     hasconstraints = 0;
    2386      281100 :     for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    2387             :     {
    2388      261542 :         nexts = s->next;
    2389             :         /* while we're at it, ensure tmp fields are clear for next step */
    2390             :         assert(s->tmp == NULL);
    2391     1487278 :         for (a = s->outs; a != NULL && !NISERR(); a = nexta)
    2392             :         {
    2393     1225736 :             nexta = a->outchain;
    2394     1225736 :             if (isconstraintarc(a))
    2395             :             {
    2396      122990 :                 if (a->to == s)
    2397         352 :                     freearc(nfa, a);
    2398             :                 else
    2399      122638 :                     hasconstraints = 1;
    2400             :             }
    2401             :         }
    2402             :         /* If we removed all the outarcs, the state is useless. */
    2403      261542 :         if (s->nouts == 0 && !s->flag)
    2404           0 :             dropstate(nfa, s);
    2405             :     }
    2406             : 
    2407             :     /* Nothing to do if no remaining constraint arcs */
    2408       19558 :     if (NISERR() || !hasconstraints)
    2409          10 :         return;
    2410             : 
    2411             :     /*
    2412             :      * Starting from each remaining NFA state, search outwards for a
    2413             :      * constraint loop.  If we find a loop, break the loop, then start the
    2414             :      * search over.  (We could possibly retain some state from the first scan,
    2415             :      * but it would complicate things greatly, and multi-state constraint
    2416             :      * loops are rare enough that it's not worth optimizing the case.)
    2417             :      */
    2418       19548 : restart:
    2419      296016 :     for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
    2420             :     {
    2421      276468 :         if (findconstraintloop(nfa, s))
    2422         412 :             goto restart;
    2423             :     }
    2424             : 
    2425       19548 :     if (NISERR())
    2426           0 :         return;
    2427             : 
    2428             :     /*
    2429             :      * Now remove any states that have become useless.  (This cleanup is not
    2430             :      * very thorough, and would be even less so if we tried to combine it with
    2431             :      * the previous step; but cleanup() will take care of anything we miss.)
    2432             :      *
    2433             :      * Because findconstraintloop intentionally doesn't reset all tmp fields,
    2434             :      * we have to clear them after it's done.  This is a convenient place to
    2435             :      * do that, too.
    2436             :      */
    2437      282804 :     for (s = nfa->states; s != NULL; s = nexts)
    2438             :     {
    2439      263256 :         nexts = s->next;
    2440      263256 :         s->tmp = NULL;
    2441      263256 :         if ((s->nins == 0 || s->nouts == 0) && !s->flag)
    2442         426 :             dropstate(nfa, s);
    2443             :     }
    2444             : 
    2445       19548 :     if (f != NULL)
    2446           0 :         dumpnfa(nfa, f);
    2447             : }
    2448             : 
    2449             : /*
    2450             :  * findconstraintloop - recursively find a loop of constraint arcs
    2451             :  *
    2452             :  * If we find a loop, break it by calling breakconstraintloop(), then
    2453             :  * return 1; otherwise return 0.
    2454             :  *
    2455             :  * State tmp fields are guaranteed all NULL on a success return, because
    2456             :  * breakconstraintloop does that.  After a failure return, any state that
    2457             :  * is known not to be part of a loop is marked with s->tmp == s; this allows
    2458             :  * us not to have to re-prove that fact on later calls.  (This convention is
    2459             :  * workable because we already eliminated single-state loops.)
    2460             :  *
    2461             :  * Note that the found loop doesn't necessarily include the first state we
    2462             :  * are called on.  Any loop reachable from that state will do.
    2463             :  *
    2464             :  * The maximum recursion depth here is one more than the length of the longest
    2465             :  * loop-free chain of constraint arcs, which is surely no more than the size
    2466             :  * of the NFA ... but that could still be enough to cause trouble.
    2467             :  */
    2468             : static int
    2469      439234 : findconstraintloop(struct nfa *nfa, struct state *s)
    2470             : {
    2471             :     struct arc *a;
    2472             : 
    2473             :     /* Since this is recursive, it could be driven to stack overflow */
    2474      439234 :     if (STACK_TOO_DEEP(nfa->v->re))
    2475             :     {
    2476           0 :         NERR(REG_ETOOBIG);
    2477           0 :         return 1;               /* to exit as quickly as possible */
    2478             :     }
    2479             : 
    2480      439234 :     if (s->tmp != NULL)
    2481             :     {
    2482             :         /* Already proven uninteresting? */
    2483      158026 :         if (s->tmp == s)
    2484      157614 :             return 0;
    2485             :         /* Found a loop involving s */
    2486         412 :         breakconstraintloop(nfa, s);
    2487             :         /* The tmp fields have been cleaned up by breakconstraintloop */
    2488         412 :         return 1;
    2489             :     }
    2490     1577634 :     for (a = s->outs; a != NULL; a = a->outchain)
    2491             :     {
    2492     1297744 :         if (isconstraintarc(a))
    2493             :         {
    2494      162766 :             struct state *sto = a->to;
    2495             : 
    2496             :             assert(sto != s);
    2497      162766 :             s->tmp = sto;
    2498      162766 :             if (findconstraintloop(nfa, sto))
    2499        1318 :                 return 1;
    2500             :         }
    2501             :     }
    2502             : 
    2503             :     /*
    2504             :      * If we get here, no constraint loop exists leading out from s.  Mark it
    2505             :      * with s->tmp == s so we need not rediscover that fact again later.
    2506             :      */
    2507      279890 :     s->tmp = s;
    2508      279890 :     return 0;
    2509             : }
    2510             : 
    2511             : /*
    2512             :  * breakconstraintloop - break a loop of constraint arcs
    2513             :  *
    2514             :  * sinitial is any one member state of the loop.  Each loop member's tmp
    2515             :  * field links to its successor within the loop.  (Note that this function
    2516             :  * will reset all the tmp fields to NULL.)
    2517             :  *
    2518             :  * We can break the loop by, for any one state S1 in the loop, cloning its
    2519             :  * loop successor state S2 (and possibly following states), and then moving
    2520             :  * all S1->S2 constraint arcs to point to the cloned S2.  The cloned S2 should
    2521             :  * copy any non-constraint outarcs of S2.  Constraint outarcs should be
    2522             :  * dropped if they point back to S1, else they need to be copied as arcs to
    2523             :  * similarly cloned states S3, S4, etc.  In general, each cloned state copies
    2524             :  * non-constraint outarcs, drops constraint outarcs that would lead to itself
    2525             :  * or any earlier cloned state, and sends other constraint outarcs to newly
    2526             :  * cloned states.  No cloned state will have any inarcs that aren't constraint
    2527             :  * arcs or do not lead from S1 or earlier-cloned states.  It's okay to drop
    2528             :  * constraint back-arcs since they would not take us to any state we've not
    2529             :  * already been in; therefore, no new constraint loop is created.  In this way
    2530             :  * we generate a modified NFA that can still represent every useful state
    2531             :  * sequence, but not sequences that represent state loops with no consumption
    2532             :  * of input data.  Note that the set of cloned states will certainly include
    2533             :  * all of the loop member states other than S1, and it may also include
    2534             :  * non-loop states that are reachable from S2 via constraint arcs.  This is
    2535             :  * important because there is no guarantee that findconstraintloop found a
    2536             :  * maximal loop (and searching for one would be NP-hard, so don't try).
    2537             :  * Frequently the "non-loop states" are actually part of a larger loop that
    2538             :  * we didn't notice, and indeed there may be several overlapping loops.
    2539             :  * This technique ensures convergence in such cases, while considering only
    2540             :  * the originally-found loop does not.
    2541             :  *
    2542             :  * If there is only one S1->S2 constraint arc, then that constraint is
    2543             :  * certainly satisfied when we enter any of the clone states.  This means that
    2544             :  * in the common case where many of the constraint arcs are identically
    2545             :  * labeled, we can merge together clone states linked by a similarly-labeled
    2546             :  * constraint: if we can get to the first one we can certainly get to the
    2547             :  * second, so there's no need to distinguish.  This greatly reduces the number
    2548             :  * of new states needed, so we preferentially break the given loop at a state
    2549             :  * pair where this is true.
    2550             :  *
    2551             :  * Furthermore, it's fairly common to find that a cloned successor state has
    2552             :  * no outarcs, especially if we're a bit aggressive about removing unnecessary
    2553             :  * outarcs.  If that happens, then there is simply not any interesting state
    2554             :  * that can be reached through the predecessor's loop arcs, which means we can
    2555             :  * break the loop just by removing those loop arcs, with no new states added.
    2556             :  */
    2557             : static void
    2558         412 : breakconstraintloop(struct nfa *nfa, struct state *sinitial)
    2559             : {
    2560             :     struct state *s;
    2561             :     struct state *shead;
    2562             :     struct state *stail;
    2563             :     struct state *sclone;
    2564             :     struct state *nexts;
    2565             :     struct arc *refarc;
    2566             :     struct arc *a;
    2567             :     struct arc *nexta;
    2568             : 
    2569             :     /*
    2570             :      * Start by identifying which loop step we want to break at.
    2571             :      * Preferentially this is one with only one constraint arc.  (XXX are
    2572             :      * there any other secondary heuristics we want to use here?)  Set refarc
    2573             :      * to point to the selected lone constraint arc, if there is one.
    2574             :      */
    2575         412 :     refarc = NULL;
    2576         412 :     s = sinitial;
    2577             :     do
    2578             :     {
    2579        1060 :         nexts = s->tmp;
    2580             :         assert(nexts != s);     /* should not see any one-element loops */
    2581        1060 :         if (refarc == NULL)
    2582             :         {
    2583         664 :             int         narcs = 0;
    2584             : 
    2585        7600 :             for (a = s->outs; a != NULL; a = a->outchain)
    2586             :             {
    2587        6936 :                 if (a->to == nexts && isconstraintarc(a))
    2588             :                 {
    2589        2592 :                     refarc = a;
    2590        2592 :                     narcs++;
    2591             :                 }
    2592             :             }
    2593             :             assert(narcs > 0);
    2594         664 :             if (narcs > 1)
    2595         372 :                 refarc = NULL;  /* multiple constraint arcs here, no good */
    2596             :         }
    2597        1060 :         s = nexts;
    2598        1060 :     } while (s != sinitial);
    2599             : 
    2600         412 :     if (refarc)
    2601             :     {
    2602             :         /* break at the refarc */
    2603         292 :         shead = refarc->from;
    2604         292 :         stail = refarc->to;
    2605             :         assert(stail == shead->tmp);
    2606             :     }
    2607             :     else
    2608             :     {
    2609             :         /* for lack of a better idea, break after sinitial */
    2610         120 :         shead = sinitial;
    2611         120 :         stail = sinitial->tmp;
    2612             :     }
    2613             : 
    2614             :     /*
    2615             :      * Reset the tmp fields so that we can use them for local storage in
    2616             :      * clonesuccessorstates.  (findconstraintloop won't mind, since it's just
    2617             :      * going to abandon its search anyway.)
    2618             :      */
    2619       34722 :     for (s = nfa->states; s != NULL; s = s->next)
    2620       34310 :         s->tmp = NULL;
    2621             : 
    2622             :     /*
    2623             :      * Recursively build clone state(s) as needed.
    2624             :      */
    2625         412 :     sclone = newstate(nfa);
    2626         412 :     if (sclone == NULL)
    2627             :     {
    2628             :         assert(NISERR());
    2629           0 :         return;
    2630             :     }
    2631             : 
    2632         412 :     clonesuccessorstates(nfa, stail, sclone, shead, refarc,
    2633             :                          NULL, NULL, nfa->nstates);
    2634             : 
    2635         412 :     if (NISERR())
    2636           0 :         return;
    2637             : 
    2638             :     /*
    2639             :      * It's possible that sclone has no outarcs at all, in which case it's
    2640             :      * useless.  (We don't try extremely hard to get rid of useless states
    2641             :      * here, but this is an easy and fairly common case.)
    2642             :      */
    2643         412 :     if (sclone->nouts == 0)
    2644             :     {
    2645          98 :         freestate(nfa, sclone);
    2646          98 :         sclone = NULL;
    2647             :     }
    2648             : 
    2649             :     /*
    2650             :      * Move shead's constraint-loop arcs to point to sclone, or just drop them
    2651             :      * if we discovered we don't need sclone.
    2652             :      */
    2653        4512 :     for (a = shead->outs; a != NULL; a = nexta)
    2654             :     {
    2655        4100 :         nexta = a->outchain;
    2656        4100 :         if (a->to == stail && isconstraintarc(a))
    2657             :         {
    2658         978 :             if (sclone)
    2659         824 :                 cparc(nfa, a, shead, sclone);
    2660         978 :             freearc(nfa, a);
    2661         978 :             if (NISERR())
    2662           0 :                 break;
    2663             :         }
    2664             :     }
    2665             : }
    2666             : 
    2667             : /*
    2668             :  * clonesuccessorstates - create a tree of constraint-arc successor states
    2669             :  *
    2670             :  * ssource is the state to be cloned, and sclone is the state to copy its
    2671             :  * outarcs into.  sclone's inarcs, if any, should already be set up.
    2672             :  *
    2673             :  * spredecessor is the original predecessor state that we are trying to build
    2674             :  * successors for (it may not be the immediate predecessor of ssource).
    2675             :  * refarc, if not NULL, is the original constraint arc that is known to have
    2676             :  * been traversed out of spredecessor to reach the successor(s).
    2677             :  *
    2678             :  * For each cloned successor state, we transiently create a "donemap" that is
    2679             :  * a boolean array showing which source states we've already visited for this
    2680             :  * clone state.  This prevents infinite recursion as well as useless repeat
    2681             :  * visits to the same state subtree (which can add up fast, since typical NFAs
    2682             :  * have multiple redundant arc pathways).  Each donemap is a char array
    2683             :  * indexed by state number.  The donemaps are all of the same size "nstates",
    2684             :  * which is nfa->nstates as of the start of the recursion.  This is enough to
    2685             :  * have entries for all pre-existing states, but *not* entries for clone
    2686             :  * states created during the recursion.  That's okay since we have no need to
    2687             :  * mark those.
    2688             :  *
    2689             :  * curdonemap is NULL when recursing to a new sclone state, or sclone's
    2690             :  * donemap when we are recursing without having created a new state (which we
    2691             :  * do when we decide we can merge a successor state into the current clone
    2692             :  * state).  outerdonemap is NULL at the top level and otherwise the parent
    2693             :  * clone state's donemap.
    2694             :  *
    2695             :  * The successor states we create and fill here form a strict tree structure,
    2696             :  * with each state having exactly one predecessor, except that the toplevel
    2697             :  * state has no inarcs as yet (breakconstraintloop will add its inarcs from
    2698             :  * spredecessor after we're done).  Thus, we can examine sclone's inarcs back
    2699             :  * to the root, plus refarc if any, to identify the set of constraints already
    2700             :  * known valid at the current point.  This allows us to avoid generating extra
    2701             :  * successor states.
    2702             :  */
    2703             : static void
    2704        3622 : clonesuccessorstates(struct nfa *nfa,
    2705             :                      struct state *ssource,
    2706             :                      struct state *sclone,
    2707             :                      struct state *spredecessor,
    2708             :                      struct arc *refarc,
    2709             :                      char *curdonemap,
    2710             :                      char *outerdonemap,
    2711             :                      int nstates)
    2712             : {
    2713             :     char       *donemap;
    2714             :     struct arc *a;
    2715             : 
    2716             :     /* Since this is recursive, it could be driven to stack overflow */
    2717        3622 :     if (STACK_TOO_DEEP(nfa->v->re))
    2718             :     {
    2719           0 :         NERR(REG_ETOOBIG);
    2720           0 :         return;
    2721             :     }
    2722             : 
    2723             :     /* If this state hasn't already got a donemap, create one */
    2724        3622 :     donemap = curdonemap;
    2725        3622 :     if (donemap == NULL)
    2726             :     {
    2727        1820 :         donemap = (char *) MALLOC(nstates * sizeof(char));
    2728        1820 :         if (donemap == NULL)
    2729             :         {
    2730           0 :             NERR(REG_ESPACE);
    2731           0 :             return;
    2732             :         }
    2733             : 
    2734        1820 :         if (outerdonemap != NULL)
    2735             :         {
    2736             :             /*
    2737             :              * Not at outermost recursion level, so copy the outer level's
    2738             :              * donemap; this ensures that we see states in process of being
    2739             :              * visited at outer levels, or already merged into predecessor
    2740             :              * states, as ones we shouldn't traverse back to.
    2741             :              */
    2742        1408 :             memcpy(donemap, outerdonemap, nstates * sizeof(char));
    2743             :         }
    2744             :         else
    2745             :         {
    2746             :             /* At outermost level, only spredecessor is off-limits */
    2747         412 :             memset(donemap, 0, nstates * sizeof(char));
    2748             :             assert(spredecessor->no < nstates);
    2749         412 :             donemap[spredecessor->no] = 1;
    2750             :         }
    2751             :     }
    2752             : 
    2753             :     /* Mark ssource as visited in the donemap */
    2754             :     assert(ssource->no < nstates);
    2755             :     assert(donemap[ssource->no] == 0);
    2756        3622 :     donemap[ssource->no] = 1;
    2757             : 
    2758             :     /*
    2759             :      * We proceed by first cloning all of ssource's outarcs, creating new
    2760             :      * clone states as needed but not doing more with them than that.  Then in
    2761             :      * a second pass, recurse to process the child clone states.  This allows
    2762             :      * us to have only one child clone state per reachable source state, even
    2763             :      * when there are multiple outarcs leading to the same state.  Also, when
    2764             :      * we do visit a child state, its set of inarcs is known exactly, which
    2765             :      * makes it safe to apply the constraint-is-already-checked optimization.
    2766             :      * Also, this ensures that we've merged all the states we can into the
    2767             :      * current clone before we recurse to any children, thus possibly saving
    2768             :      * them from making extra images of those states.
    2769             :      *
    2770             :      * While this function runs, child clone states of the current state are
    2771             :      * marked by setting their tmp fields to point to the original state they
    2772             :      * were cloned from.  This makes it possible to detect multiple outarcs
    2773             :      * leading to the same state, and also makes it easy to distinguish clone
    2774             :      * states from original states (which will have tmp == NULL).
    2775             :      */
    2776       29926 :     for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
    2777             :     {
    2778       26304 :         struct state *sto = a->to;
    2779             : 
    2780             :         /*
    2781             :          * We do not consider cloning successor states that have no constraint
    2782             :          * outarcs; just link to them as-is.  They cannot be part of a
    2783             :          * constraint loop so there is no need to make copies.  In particular,
    2784             :          * this rule keeps us from trying to clone the post state, which would
    2785             :          * be a bad idea.
    2786             :          */
    2787       26304 :         if (isconstraintarc(a) && hasconstraintout(sto))
    2788       10970 :         {
    2789             :             struct state *prevclone;
    2790             :             int         canmerge;
    2791             :             struct arc *a2;
    2792             : 
    2793             :             /*
    2794             :              * Back-link constraint arcs must not be followed.  Nor is there a
    2795             :              * need to revisit states previously merged into this clone.
    2796             :              */
    2797             :             assert(sto->no < nstates);
    2798       15786 :             if (donemap[sto->no] != 0)
    2799        4816 :                 continue;
    2800             : 
    2801             :             /*
    2802             :              * Check whether we already have a child clone state for this
    2803             :              * source state.
    2804             :              */
    2805       10970 :             prevclone = NULL;
    2806       37524 :             for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
    2807             :             {
    2808       34314 :                 if (a2->to->tmp == sto)
    2809             :                 {
    2810        7760 :                     prevclone = a2->to;
    2811        7760 :                     break;
    2812             :                 }
    2813             :             }
    2814             : 
    2815             :             /*
    2816             :              * If this arc is labeled the same as refarc, or the same as any
    2817             :              * arc we must have traversed to get to sclone, then no additional
    2818             :              * constraints need to be met to get to sto, so we should just
    2819             :              * merge its outarcs into sclone.
    2820             :              */
    2821       10970 :             if (refarc && a->type == refarc->type && a->co == refarc->co)
    2822        1802 :                 canmerge = 1;
    2823             :             else
    2824             :             {
    2825             :                 struct state *s;
    2826             : 
    2827        9168 :                 canmerge = 0;
    2828       45892 :                 for (s = sclone; s->ins; s = s->ins->from)
    2829             :                 {
    2830       36724 :                     if (s->nins == 1 &&
    2831          36 :                         a->type == s->ins->type && a->co == s->ins->co)
    2832             :                     {
    2833           0 :                         canmerge = 1;
    2834           0 :                         break;
    2835             :                     }
    2836             :                 }
    2837             :             }
    2838             : 
    2839       10970 :             if (canmerge)
    2840             :             {
    2841             :                 /*
    2842             :                  * We can merge into sclone.  If we previously made a child
    2843             :                  * clone state, drop it; there's no need to visit it.  (This
    2844             :                  * can happen if ssource has multiple pathways to sto, and we
    2845             :                  * only just now found one that is provably a no-op.)
    2846             :                  */
    2847        1802 :                 if (prevclone)
    2848           0 :                     dropstate(nfa, prevclone);  /* kills our outarc, too */
    2849             : 
    2850             :                 /* Recurse to merge sto's outarcs into sclone */
    2851        1802 :                 clonesuccessorstates(nfa,
    2852             :                                      sto,
    2853             :                                      sclone,
    2854             :                                      spredecessor,
    2855             :                                      refarc,
    2856             :                                      donemap,
    2857             :                                      outerdonemap,
    2858             :                                      nstates);
    2859             :                 /* sto should now be marked as previously visited */
    2860             :                 assert(NISERR() || donemap[sto->no] == 1);
    2861             :             }
    2862        9168 :             else if (prevclone)
    2863             :             {
    2864             :                 /*
    2865             :                  * We already have a clone state for this successor, so just
    2866             :                  * make another arc to it.
    2867             :                  */
    2868        7760 :                 cparc(nfa, a, sclone, prevclone);
    2869             :             }
    2870             :             else
    2871             :             {
    2872             :                 /*
    2873             :                  * We need to create a new successor clone state.
    2874             :                  */
    2875             :                 struct state *stoclone;
    2876             : 
    2877        1408 :                 stoclone = newstate(nfa);
    2878        1408 :                 if (stoclone == NULL)
    2879             :                 {
    2880             :                     assert(NISERR());
    2881           0 :                     break;
    2882             :                 }
    2883             :                 /* Mark it as to what it's a clone of */
    2884        1408 :                 stoclone->tmp = sto;
    2885             :                 /* ... and add the outarc leading to it */
    2886        1408 :                 cparc(nfa, a, sclone, stoclone);
    2887             :             }
    2888             :         }
    2889             :         else
    2890             :         {
    2891             :             /*
    2892             :              * Non-constraint outarcs just get copied to sclone, as do outarcs
    2893             :              * leading to states with no constraint outarc.
    2894             :              */
    2895       10518 :             cparc(nfa, a, sclone, sto);
    2896             :         }
    2897             :     }
    2898             : 
    2899             :     /*
    2900             :      * If we are at outer level for this clone state, recurse to all its child
    2901             :      * clone states, clearing their tmp fields as we go.  (If we're not
    2902             :      * outermost for sclone, leave this to be done by the outer call level.)
    2903             :      * Note that if we have multiple outarcs leading to the same clone state,
    2904             :      * it will only be recursed-to once.
    2905             :      */
    2906        3622 :     if (curdonemap == NULL)
    2907             :     {
    2908       16166 :         for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
    2909             :         {
    2910       14346 :             struct state *stoclone = a->to;
    2911       14346 :             struct state *sto = stoclone->tmp;
    2912             : 
    2913       14346 :             if (sto != NULL)
    2914             :             {
    2915        1408 :                 stoclone->tmp = NULL;
    2916        1408 :                 clonesuccessorstates(nfa,
    2917             :                                      sto,
    2918             :                                      stoclone,
    2919             :                                      spredecessor,
    2920             :                                      refarc,
    2921             :                                      NULL,
    2922             :                                      donemap,
    2923             :                                      nstates);
    2924             :             }
    2925             :         }
    2926             : 
    2927             :         /* Don't forget to free sclone's donemap when done with it */
    2928        1820 :         FREE(donemap);
    2929             :     }
    2930             : }
    2931             : 
    2932             : /*
    2933             :  * removecantmatch - remove CANTMATCH arcs, which are no longer useful
    2934             :  * once we are done with the parsing phase.  (We need them only to
    2935             :  * preserve connectedness of NFA subgraphs during parsing.)
    2936             :  */
    2937             : static void
    2938          12 : removecantmatch(struct nfa *nfa)
    2939             : {
    2940             :     struct state *s;
    2941             : 
    2942          92 :     for (s = nfa->states; s != NULL; s = s->next)
    2943             :     {
    2944             :         struct arc *a;
    2945             :         struct arc *nexta;
    2946             : 
    2947         202 :         for (a = s->outs; a != NULL; a = nexta)
    2948             :         {
    2949         122 :             nexta = a->outchain;
    2950         122 :             if (a->type == CANTMATCH)
    2951             :             {
    2952           8 :                 freearc(nfa, a);
    2953           8 :                 if (NISERR())
    2954           0 :                     return;
    2955             :             }
    2956             :         }
    2957             :     }
    2958             : }
    2959             : 
    2960             : /*
    2961             :  * cleanup - clean up NFA after optimizations
    2962             :  */
    2963             : static void
    2964       39116 : cleanup(struct nfa *nfa)
    2965             : {
    2966             :     struct state *s;
    2967             :     struct state *nexts;
    2968             :     int         n;
    2969             : 
    2970       39116 :     if (NISERR())
    2971           6 :         return;
    2972             : 
    2973             :     /* clear out unreachable or dead-end states */
    2974             :     /* use pre to mark reachable, then post to mark can-reach-post */
    2975       39110 :     markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
    2976       39110 :     markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
    2977      711698 :     for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
    2978             :     {
    2979      672588 :         nexts = s->next;
    2980      672588 :         if (s->tmp != nfa->post && !s->flag)
    2981        6280 :             dropstate(nfa, s);
    2982             :     }
    2983             :     assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
    2984       39110 :     cleartraverse(nfa, nfa->pre);
    2985             :     assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
    2986             :     /* the nins==0 (final unreachable) case will be caught later */
    2987             : 
    2988             :     /* renumber surviving states */
    2989       39110 :     n = 0;
    2990      705418 :     for (s = nfa->states; s != NULL; s = s->next)
    2991      666308 :         s->no = n++;
    2992       39110 :     nfa->nstates = n;
    2993             : }
    2994             : 
    2995             : /*
    2996             :  * markreachable - recursive marking of reachable states
    2997             :  */
    2998             : static void
    2999     1786410 : markreachable(struct nfa *nfa,
    3000             :               struct state *s,
    3001             :               struct state *okay,   /* consider only states with this mark */
    3002             :               struct state *mark)   /* the value to mark with */
    3003             : {
    3004             :     struct arc *a;
    3005             : 
    3006             :     /* Since this is recursive, it could be driven to stack overflow */
    3007     1786410 :     if (STACK_TOO_DEEP(nfa->v->re))
    3008             :     {
    3009           0 :         NERR(REG_ETOOBIG);
    3010           0 :         return;
    3011             :     }
    3012             : 
    3013     1786410 :     if (s->tmp != okay)
    3014     1120112 :         return;
    3015      666298 :     s->tmp = mark;
    3016             : 
    3017     2413598 :     for (a = s->outs; a != NULL; a = a->outchain)
    3018     1747300 :         markreachable(nfa, a->to, okay, mark);
    3019             : }
    3020             : 
    3021             : /*
    3022             :  * markcanreach - recursive marking of states which can reach here
    3023             :  */
    3024             : static void
    3025     1787302 : markcanreach(struct nfa *nfa,
    3026             :              struct state *s,
    3027             :              struct state *okay,    /* consider only states with this mark */
    3028             :              struct state *mark)    /* the value to mark with */
    3029             : {
    3030             :     struct arc *a;
    3031             : 
    3032             :     /* Since this is recursive, it could be driven to stack overflow */
    3033     1787302 :     if (STACK_TOO_DEEP(nfa->v->re))
    3034             :     {
    3035           0 :         NERR(REG_ETOOBIG);
    3036           0 :         return;
    3037             :     }
    3038             : 
    3039     1787302 :     if (s->tmp != okay)
    3040     1121198 :         return;
    3041      666104 :     s->tmp = mark;
    3042             : 
    3043     2414296 :     for (a = s->ins; a != NULL; a = a->inchain)
    3044     1748192 :         markcanreach(nfa, a->from, okay, mark);
    3045             : }
    3046             : 
    3047             : /*
    3048             :  * analyze - ascertain potentially-useful facts about an optimized NFA
    3049             :  */
    3050             : static long                     /* re_info bits to be ORed in */
    3051       19558 : analyze(struct nfa *nfa)
    3052             : {
    3053             :     struct arc *a;
    3054             :     struct arc *aa;
    3055             : 
    3056       19558 :     if (NISERR())
    3057           6 :         return 0;
    3058             : 
    3059             :     /* Detect whether NFA can't match anything */
    3060       19552 :     if (nfa->pre->outs == NULL)
    3061          98 :         return REG_UIMPOSSIBLE;
    3062             : 
    3063             :     /* Detect whether NFA matches all strings (possibly with length bounds) */
    3064       19454 :     checkmatchall(nfa);
    3065             : 
    3066             :     /* Detect whether NFA can possibly match a zero-length string */
    3067       58746 :     for (a = nfa->pre->outs; a != NULL; a = a->outchain)
    3068     1545570 :         for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
    3069     1506278 :             if (aa->to == nfa->post)
    3070        2192 :                 return REG_UEMPTYMATCH;
    3071       17262 :     return 0;
    3072             : }
    3073             : 
    3074             : /*
    3075             :  * checkmatchall - does the NFA represent no more than a string length test?
    3076             :  *
    3077             :  * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
    3078             :  * to begin with) and set the MATCHALL bit in nfa->flags.
    3079             :  *
    3080             :  * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
    3081             :  * for pseudocolors (i.e., BOS/BOL/EOS/EOL).  We must be able to reach the
    3082             :  * post state via RAINBOW arcs, and if there are any loops in the graph, they
    3083             :  * must be loop-to-self arcs, ensuring that each loop iteration consumes
    3084             :  * exactly one character.  (Longer loops are problematic because they create
    3085             :  * non-consecutive possible match lengths; we have no good way to represent
    3086             :  * that situation for lengths beyond the DUPINF limit.)
    3087             :  *
    3088             :  * Pseudocolor arcs complicate things a little.  We know that they can only
    3089             :  * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
    3090             :  * EOS/EOL).  There, they must exactly replicate the parallel RAINBOW arcs,
    3091             :  * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
    3092             :  * and BOL outarcs to state 2, and no others.  Missing or extra pseudocolor
    3093             :  * arcs can occur, meaning that the NFA involves some constraint on the
    3094             :  * adjacent characters, which makes it not a matchall NFA.
    3095             :  */
    3096             : static void
    3097       19454 : checkmatchall(struct nfa *nfa)
    3098             : {
    3099             :     bool      **haspaths;
    3100             :     struct state *s;
    3101             :     int         i;
    3102             : 
    3103             :     /*
    3104             :      * If there are too many states, don't bother trying to detect matchall.
    3105             :      * This limit serves to bound the time and memory we could consume below.
    3106             :      * Note that even if the graph is all-RAINBOW, if there are significantly
    3107             :      * more than DUPINF states then it's likely that there are paths of length
    3108             :      * more than DUPINF, which would force us to fail anyhow.  In practice,
    3109             :      * plausible ways of writing a matchall regex with maximum finite path
    3110             :      * length K tend not to have very many more than K states.
    3111             :      */
    3112       19454 :     if (nfa->nstates > DUPINF * 2)
    3113          12 :         return;
    3114             : 
    3115             :     /*
    3116             :      * First, scan all the states to verify that only RAINBOW arcs appear,
    3117             :      * plus pseudocolor arcs adjacent to the pre and post states.  This lets
    3118             :      * us quickly eliminate most cases that aren't matchall NFAs.
    3119             :      */
    3120       72426 :     for (s = nfa->states; s != NULL; s = s->next)
    3121             :     {
    3122             :         struct arc *a;
    3123             : 
    3124      201682 :         for (a = s->outs; a != NULL; a = a->outchain)
    3125             :         {
    3126      148698 :             if (a->type != PLAIN)
    3127          90 :                 return;         /* any LACONs make it non-matchall */
    3128      148608 :             if (a->co != RAINBOW)
    3129             :             {
    3130       63678 :                 if (nfa->cm->cd[a->co].flags & PSEUDO)
    3131             :                 {
    3132             :                     /*
    3133             :                      * Pseudocolor arc: verify it's in a valid place (this
    3134             :                      * seems quite unlikely to fail, but let's be sure).
    3135             :                      */
    3136       45870 :                     if (s == nfa->pre &&
    3137       33398 :                         (a->co == nfa->bos[0] || a->co == nfa->bos[1]))
    3138             :                          /* okay BOS/BOL arc */ ;
    3139       12472 :                     else if (a->to == nfa->post &&
    3140       12472 :                              (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
    3141             :                          /* okay EOS/EOL arc */ ;
    3142             :                     else
    3143           0 :                         return; /* unexpected pseudocolor arc */
    3144             :                     /* We'll check these arcs some more below. */
    3145             :                 }
    3146             :                 else
    3147       17808 :                     return;     /* any other color makes it non-matchall */
    3148             :             }
    3149             :         }
    3150             :         /* Also, assert that the tmp fields are available for use. */
    3151             :         assert(s->tmp == NULL);
    3152             :     }
    3153             : 
    3154             :     /*
    3155             :      * The next cheapest check we can make is to verify that the BOS/BOL
    3156             :      * outarcs of the pre state reach the same states as its RAINBOW outarcs.
    3157             :      * If they don't, the NFA expresses some constraints on the character
    3158             :      * before the matched string, making it non-matchall.  Likewise, the
    3159             :      * EOS/EOL inarcs of the post state must match its RAINBOW inarcs.
    3160             :      */
    3161        1544 :     if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
    3162        1538 :         !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
    3163        1202 :         !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
    3164        1194 :         !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]))
    3165         554 :         return;
    3166             : 
    3167             :     /*
    3168             :      * Initialize an array of path-length arrays, in which
    3169             :      * checkmatchall_recurse will return per-state results.  This lets us
    3170             :      * memo-ize the recursive search and avoid exponential time consumption.
    3171             :      */
    3172         990 :     haspaths = (bool **) MALLOC(nfa->nstates * sizeof(bool *));
    3173         990 :     if (haspaths == NULL)
    3174           0 :         return;                 /* fail quietly */
    3175         990 :     memset(haspaths, 0, nfa->nstates * sizeof(bool *));
    3176             : 
    3177             :     /*
    3178             :      * Recursively search the graph for all-RAINBOW paths to the "post" state,
    3179             :      * starting at the "pre" state, and computing the lengths of the paths.
    3180             :      * (Given the preceding checks, there should be at least one such path.
    3181             :      * However we could get back a false result anyway, in case there are
    3182             :      * multi-state loops, paths exceeding DUPINF+1 length, or non-algorithmic
    3183             :      * failures such as ENOMEM.)
    3184             :      */
    3185         990 :     if (checkmatchall_recurse(nfa, nfa->pre, haspaths))
    3186             :     {
    3187             :         /* The useful result is the path length array for the pre state */
    3188         966 :         bool       *haspath = haspaths[nfa->pre->no];
    3189             :         int         minmatch,
    3190             :                     maxmatch,
    3191             :                     morematch;
    3192             : 
    3193             :         assert(haspath != NULL);
    3194             : 
    3195             :         /*
    3196             :          * haspath[] now represents the set of possible path lengths; but we
    3197             :          * want to reduce that to a min and max value, because it doesn't seem
    3198             :          * worth complicating regexec.c to deal with nonconsecutive possible
    3199             :          * match lengths.  Find min and max of first run of lengths, then
    3200             :          * verify there are no nonconsecutive lengths.
    3201             :          */
    3202        3972 :         for (minmatch = 0; minmatch <= DUPINF + 1; minmatch++)
    3203             :         {
    3204        3972 :             if (haspath[minmatch])
    3205         966 :                 break;
    3206             :         }
    3207             :         assert(minmatch <= DUPINF + 1); /* else checkmatchall_recurse lied */
    3208       63054 :         for (maxmatch = minmatch; maxmatch < DUPINF + 1; maxmatch++)
    3209             :         {
    3210       62816 :             if (!haspath[maxmatch + 1])
    3211         728 :                 break;
    3212             :         }
    3213      181086 :         for (morematch = maxmatch + 1; morematch <= DUPINF + 1; morematch++)
    3214             :         {
    3215      180132 :             if (haspath[morematch])
    3216             :             {
    3217          12 :                 haspath = NULL; /* fail, there are nonconsecutive lengths */
    3218          12 :                 break;
    3219             :             }
    3220             :         }
    3221             : 
    3222         966 :         if (haspath != NULL)
    3223             :         {
    3224             :             /*
    3225             :              * Success, so record the info.  Here we have a fine point: the
    3226             :              * path length from the pre state includes the pre-to-initial
    3227             :              * transition, so it's one more than the actually matched string
    3228             :              * length.  (We avoided counting the final-to-post transition
    3229             :              * within checkmatchall_recurse, but not this one.)  This is why
    3230             :              * checkmatchall_recurse allows one more level of path length than
    3231             :              * might seem necessary.  This decrement also takes care of
    3232             :              * converting checkmatchall_recurse's definition of "infinity" as
    3233             :              * "DUPINF+1" to our normal representation as "DUPINF".
    3234             :              */
    3235             :             assert(minmatch > 0);    /* else pre and post states were adjacent */
    3236         954 :             nfa->minmatchall = minmatch - 1;
    3237         954 :             nfa->maxmatchall = maxmatch - 1;
    3238         954 :             nfa->flags |= MATCHALL;
    3239             :         }
    3240             :     }
    3241             : 
    3242             :     /* Clean up */
    3243       10406 :     for (i = 0; i < nfa->nstates; i++)
    3244             :     {
    3245        9416 :         if (haspaths[i] != NULL)
    3246        8426 :             FREE(haspaths[i]);
    3247             :     }
    3248         990 :     FREE(haspaths);
    3249             : }
    3250             : 
    3251             : /*
    3252             :  * checkmatchall_recurse - recursive search for checkmatchall
    3253             :  *
    3254             :  * s is the state to be examined in this recursion level.
    3255             :  * haspaths[] is an array of per-state exit path length arrays.
    3256             :  *
    3257             :  * We return true if the search was performed successfully, false if
    3258             :  * we had to fail because of multi-state loops or other internal reasons.
    3259             :  * (Because "dead" states that can't reach the post state have been
    3260             :  * eliminated, and we already verified that only RAINBOW and matching
    3261             :  * pseudocolor arcs exist, every state should have RAINBOW path(s) to
    3262             :  * the post state.  Hence we take a false result from recursive calls
    3263             :  * as meaning that we'd better fail altogether, not just that that
    3264             :  * particular state can't reach the post state.)
    3265             :  *
    3266             :  * On success, we store a malloc'd result array in haspaths[s->no],
    3267             :  * showing the possible path lengths from s to the post state.
    3268             :  * Each state's haspath[] array is of length DUPINF+2.  The entries from
    3269             :  * k = 0 to DUPINF are true if there is an all-RAINBOW path of length k
    3270             :  * from this state to the string end.  haspath[DUPINF+1] is true if all
    3271             :  * path lengths >= DUPINF+1 are possible.  (Situations that cannot be
    3272             :  * represented under these rules cause failure.)
    3273             :  *
    3274             :  * checkmatchall is responsible for eventually freeing the haspath[] arrays.
    3275             :  */
    3276             : static bool
    3277        8426 : checkmatchall_recurse(struct nfa *nfa, struct state *s, bool **haspaths)
    3278             : {
    3279        8426 :     bool        result = false;
    3280        8426 :     bool        foundloop = false;
    3281             :     bool       *haspath;
    3282             :     struct arc *a;
    3283             : 
    3284             :     /*
    3285             :      * Since this is recursive, it could be driven to stack overflow.  But we
    3286             :      * need not treat that as a hard failure; just deem the NFA non-matchall.
    3287             :      */
    3288        8426 :     if (STACK_TOO_DEEP(nfa->v->re))
    3289           0 :         return false;
    3290             : 
    3291             :     /* In case the search takes a long time, check for cancel */
    3292        8426 :     INTERRUPT(nfa->v->re);
    3293             : 
    3294             :     /* Create a haspath array for this state */
    3295        8426 :     haspath = (bool *) MALLOC((DUPINF + 2) * sizeof(bool));
    3296        8426 :     if (haspath == NULL)
    3297           0 :         return false;           /* again, treat as non-matchall */
    3298        8426 :     memset(haspath, 0, (DUPINF + 2) * sizeof(bool));
    3299             : 
    3300             :     /* Mark this state as being visited */
    3301             :     assert(s->tmp == NULL);
    3302        8426 :     s->tmp = s;
    3303             : 
    3304       82902 :     for (a = s->outs; a != NULL; a = a->outchain)
    3305             :     {
    3306       74572 :         if (a->co != RAINBOW)
    3307        6428 :             continue;           /* ignore pseudocolor arcs */
    3308       68144 :         if (a->to == nfa->post)
    3309             :         {
    3310             :             /* We found an all-RAINBOW path to the post state */
    3311         978 :             result = true;
    3312             : 
    3313             :             /*
    3314             :              * Mark this state as being zero steps away from the string end
    3315             :              * (the transition to the post state isn't counted).
    3316             :              */
    3317         978 :             haspath[0] = true;
    3318             :         }
    3319       67166 :         else if (a->to == s)
    3320             :         {
    3321             :             /* We found a cycle of length 1, which we'll deal with below. */
    3322         244 :             foundloop = true;
    3323             :         }
    3324       66922 :         else if (a->to->tmp != NULL)
    3325             :         {
    3326             :             /* It's busy, so we found a cycle of length > 1, so fail. */
    3327          12 :             result = false;
    3328          12 :             break;
    3329             :         }
    3330             :         else
    3331             :         {
    3332             :             /* Consider paths forward through this to-state. */
    3333             :             bool       *nexthaspath;
    3334             :             int         i;
    3335             : 
    3336             :             /* If to-state was not already visited, recurse */
    3337       66910 :             if (haspaths[a->to->no] == NULL)
    3338             :             {
    3339        7436 :                 result = checkmatchall_recurse(nfa, a->to, haspaths);
    3340             :                 /* Fail if any recursive path fails */
    3341        7436 :                 if (!result)
    3342          72 :                     break;
    3343             :             }
    3344             :             else
    3345             :             {
    3346             :                 /* The previous visit must have found path(s) to the end */
    3347       59474 :                 result = true;
    3348             :             }
    3349             :             assert(a->to->tmp == NULL);
    3350       66838 :             nexthaspath = haspaths[a->to->no];
    3351             :             assert(nexthaspath != NULL);
    3352             : 
    3353             :             /*
    3354             :              * Now, for every path of length i from a->to to the string end,
    3355             :              * there is a path of length i + 1 from s to the string end.
    3356             :              */
    3357       66838 :             if (nexthaspath[DUPINF] != nexthaspath[DUPINF + 1])
    3358             :             {
    3359             :                 /*
    3360             :                  * a->to has a path of length exactly DUPINF, but not longer;
    3361             :                  * or it has paths of all lengths > DUPINF but not one of
    3362             :                  * exactly that length.  In either case, we cannot represent
    3363             :                  * the possible path lengths from s correctly, so fail.
    3364             :                  */
    3365          12 :                 result = false;
    3366          12 :                 break;
    3367             :             }
    3368             :             /* Merge knowledge of these path lengths into what we have */
    3369    17174282 :             for (i = 0; i < DUPINF; i++)
    3370    17107456 :                 haspath[i + 1] |= nexthaspath[i];
    3371             :             /* Infinity + 1 is still infinity */
    3372       66826 :             haspath[DUPINF + 1] |= nexthaspath[DUPINF + 1];
    3373             :         }
    3374             :     }
    3375             : 
    3376        8426 :     if (result && foundloop)
    3377             :     {
    3378             :         /*
    3379             :          * If there is a length-1 loop at this state, then find the shortest
    3380             :          * known path length to the end.  The loop means that every larger
    3381             :          * path length is possible, too.  (It doesn't matter whether any of
    3382             :          * the longer lengths were already known possible.)
    3383             :          */
    3384             :         int         i;
    3385             : 
    3386         312 :         for (i = 0; i <= DUPINF; i++)
    3387             :         {
    3388         312 :             if (haspath[i])
    3389         244 :                 break;
    3390             :         }
    3391       62884 :         for (i++; i <= DUPINF + 1; i++)
    3392       62640 :             haspath[i] = true;
    3393             :     }
    3394             : 
    3395             :     /* Report out the completed path length map */
    3396             :     assert(s->no < nfa->nstates);
    3397             :     assert(haspaths[s->no] == NULL);
    3398        8426 :     haspaths[s->no] = haspath;
    3399             : 
    3400             :     /* Mark state no longer busy */
    3401        8426 :     s->tmp = NULL;
    3402             : 
    3403        8426 :     return result;
    3404             : }
    3405             : 
    3406             : /*
    3407             :  * check_out_colors_match - subroutine for checkmatchall
    3408             :  *
    3409             :  * Check whether the set of states reachable from s by arcs of color co1
    3410             :  * is equivalent to the set reachable by arcs of color co2.
    3411             :  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
    3412             :  * so we need not examine arc types here.
    3413             :  */
    3414             : static bool
    3415        3082 : check_out_colors_match(struct state *s, color co1, color co2)
    3416             : {
    3417        3082 :     bool        result = true;
    3418             :     struct arc *a;
    3419             : 
    3420             :     /*
    3421             :      * To do this in linear time, we assume that the NFA contains no duplicate
    3422             :      * arcs.  Run through the out-arcs, marking states reachable by arcs of
    3423             :      * color co1.  Run through again, un-marking states reachable by arcs of
    3424             :      * color co2; if we see a not-marked state, we know this co2 arc is
    3425             :      * unmatched.  Then run through again, checking for still-marked states,
    3426             :      * and in any case leaving all the tmp fields reset to NULL.
    3427             :      */
    3428       18890 :     for (a = s->outs; a != NULL; a = a->outchain)
    3429             :     {
    3430       15808 :         if (a->co == co1)
    3431             :         {
    3432             :             assert(a->to->tmp == NULL);
    3433        5024 :             a->to->tmp = a->to;
    3434             :         }
    3435             :     }
    3436       18890 :     for (a = s->outs; a != NULL; a = a->outchain)
    3437             :     {
    3438       15808 :         if (a->co == co2)
    3439             :         {
    3440        5392 :             if (a->to->tmp != NULL)
    3441        5020 :                 a->to->tmp = NULL;
    3442             :             else
    3443         372 :                 result = false; /* unmatched co2 arc */
    3444             :         }
    3445             :     }
    3446       18890 :     for (a = s->outs; a != NULL; a = a->outchain)
    3447             :     {
    3448       15808 :         if (a->co == co1)
    3449             :         {
    3450        5024 :             if (a->to->tmp != NULL)
    3451             :             {
    3452           4 :                 result = false; /* unmatched co1 arc */
    3453           4 :                 a->to->tmp = NULL;
    3454             :             }
    3455             :         }
    3456             :     }
    3457        3082 :     return result;
    3458             : }
    3459             : 
    3460             : /*
    3461             :  * check_in_colors_match - subroutine for checkmatchall
    3462             :  *
    3463             :  * Check whether the set of states that can reach s by arcs of color co1
    3464             :  * is equivalent to the set that can reach s by arcs of color co2.
    3465             :  * checkmatchall already verified that all of the NFA's arcs are PLAIN,
    3466             :  * so we need not examine arc types here.
    3467             :  */
    3468             : static bool
    3469        2396 : check_in_colors_match(struct state *s, color co1, color co2)
    3470             : {
    3471        2396 :     bool        result = true;
    3472             :     struct arc *a;
    3473             : 
    3474             :     /*
    3475             :      * Identical algorithm to check_out_colors_match, except examine the
    3476             :      * from-states of s' inarcs.
    3477             :      */
    3478        8876 :     for (a = s->ins; a != NULL; a = a->inchain)
    3479             :     {
    3480        6480 :         if (a->co == co1)
    3481             :         {
    3482             :             assert(a->from->tmp == NULL);
    3483        2012 :             a->from->tmp = a->from;
    3484             :         }
    3485             :     }
    3486        8876 :     for (a = s->ins; a != NULL; a = a->inchain)
    3487             :     {
    3488        6480 :         if (a->co == co2)
    3489             :         {
    3490        2234 :             if (a->from->tmp != NULL)
    3491        2008 :                 a->from->tmp = NULL;
    3492             :             else
    3493         226 :                 result = false; /* unmatched co2 arc */
    3494             :         }
    3495             :     }
    3496        8876 :     for (a = s->ins; a != NULL; a = a->inchain)
    3497             :     {
    3498        6480 :         if (a->co == co1)
    3499             :         {
    3500        2012 :             if (a->from->tmp != NULL)
    3501             :             {
    3502           4 :                 result = false; /* unmatched co1 arc */
    3503           4 :                 a->from->tmp = NULL;
    3504             :             }
    3505             :         }
    3506             :     }
    3507        2396 :     return result;
    3508             : }
    3509             : 
    3510             : /*
    3511             :  * compact - construct the compact representation of an NFA
    3512             :  */
    3513             : static void
    3514       19552 : compact(struct nfa *nfa,
    3515             :         struct cnfa *cnfa)
    3516             : {
    3517             :     struct state *s;
    3518             :     struct arc *a;
    3519             :     size_t      nstates;
    3520             :     size_t      narcs;
    3521             :     struct carc *ca;
    3522             :     struct carc *first;
    3523             : 
    3524             :     assert(!NISERR());
    3525             : 
    3526       19552 :     nstates = 0;
    3527       19552 :     narcs = 0;
    3528      257686 :     for (s = nfa->states; s != NULL; s = s->next)
    3529             :     {
    3530      238134 :         nstates++;
    3531      238134 :         narcs += s->nouts + 1;   /* need one extra for endmarker */
    3532             :     }
    3533             : 
    3534       19552 :     cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
    3535       19552 :     cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
    3536       19552 :     cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
    3537       19552 :     if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
    3538             :     {
    3539           0 :         if (cnfa->stflags != NULL)
    3540           0 :             FREE(cnfa->stflags);
    3541           0 :         if (cnfa->states != NULL)
    3542           0 :             FREE(cnfa->states);
    3543           0 :         if (cnfa->arcs != NULL)
    3544           0 :             FREE(cnfa->arcs);
    3545           0 :         NERR(REG_ESPACE);
    3546           0 :         return;
    3547             :     }
    3548       19552 :     cnfa->nstates = nstates;
    3549       19552 :     cnfa->pre = nfa->pre->no;
    3550       19552 :     cnfa->post = nfa->post->no;
    3551       19552 :     cnfa->bos[0] = nfa->bos[0];
    3552       19552 :     cnfa->bos[1] = nfa->bos[1];
    3553       19552 :     cnfa->eos[0] = nfa->eos[0];
    3554       19552 :     cnfa->eos[1] = nfa->eos[1];
    3555       19552 :     cnfa->ncolors = maxcolor(nfa->cm) + 1;
    3556       19552 :     cnfa->flags = nfa->flags;
    3557       19552 :     cnfa->minmatchall = nfa->minmatchall;
    3558       19552 :     cnfa->maxmatchall = nfa->maxmatchall;
    3559             : 
    3560       19552 :     ca = cnfa->arcs;
    3561      257686 :     for (s = nfa->states; s != NULL; s = s->next)
    3562             :     {
    3563             :         assert((size_t) s->no < nstates);
    3564      238134 :         cnfa->stflags[s->no] = 0;
    3565      238134 :         cnfa->states[s->no] = ca;
    3566      238134 :         first = ca;
    3567     1783778 :         for (a = s->outs; a != NULL; a = a->outchain)
    3568     1545644 :             switch (a->type)
    3569             :             {
    3570     1545446 :                 case PLAIN:
    3571     1545446 :                     ca->co = a->co;
    3572     1545446 :                     ca->to = a->to->no;
    3573     1545446 :                     ca++;
    3574     1545446 :                     break;
    3575         198 :                 case LACON:
    3576             :                     assert(s->no != cnfa->pre);
    3577             :                     assert(a->co >= 0);
    3578         198 :                     ca->co = (color) (cnfa->ncolors + a->co);
    3579         198 :                     ca->to = a->to->no;
    3580         198 :                     ca++;
    3581         198 :                     cnfa->flags |= HASLACONS;
    3582         198 :                     break;
    3583           0 :                 default:
    3584           0 :                     NERR(REG_ASSERT);
    3585           0 :                     return;
    3586             :             }
    3587      238134 :         carcsort(first, ca - first);
    3588      238134 :         ca->co = COLORLESS;
    3589      238134 :         ca->to = 0;
    3590      238134 :         ca++;
    3591             :     }
    3592             :     assert(ca == &cnfa->arcs[narcs]);
    3593             :     assert(cnfa->nstates != 0);
    3594             : 
    3595             :     /* mark no-progress states */
    3596       81742 :     for (a = nfa->pre->outs; a != NULL; a = a->outchain)
    3597       62190 :         cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
    3598       19552 :     cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
    3599             : }
    3600             : 
    3601             : /*
    3602             :  * carcsort - sort compacted-NFA arcs by color
    3603             :  */
    3604             : static void
    3605      238134 : carcsort(struct carc *first, size_t n)
    3606             : {
    3607      238134 :     if (n > 1)
    3608       46106 :         qsort(first, n, sizeof(struct carc), carc_cmp);
    3609      238134 : }
    3610             : 
    3611             : static int
    3612    15451670 : carc_cmp(const void *a, const void *b)
    3613             : {
    3614    15451670 :     const struct carc *aa = (const struct carc *) a;
    3615    15451670 :     const struct carc *bb = (const struct carc *) b;
    3616             : 
    3617    15451670 :     if (aa->co < bb->co)
    3618      138834 :         return -1;
    3619    15312836 :     if (aa->co > bb->co)
    3620      202252 :         return +1;
    3621    15110584 :     if (aa->to < bb->to)
    3622    10363382 :         return -1;
    3623     4747202 :     if (aa->to > bb->to)
    3624     4747202 :         return +1;
    3625             :     /* This is unreached, since there should be no duplicate arcs now: */
    3626           0 :     return 0;
    3627             : }
    3628             : 
    3629             : /*
    3630             :  * freecnfa - free a compacted NFA
    3631             :  */
    3632             : static void
    3633        4296 : freecnfa(struct cnfa *cnfa)
    3634             : {
    3635             :     assert(!NULLCNFA(*cnfa));   /* not empty already */
    3636        4296 :     FREE(cnfa->stflags);
    3637        4296 :     FREE(cnfa->states);
    3638        4296 :     FREE(cnfa->arcs);
    3639        4296 :     ZAPCNFA(*cnfa);
    3640        4296 : }
    3641             : 
    3642             : /*
    3643             :  * dumpnfa - dump an NFA in human-readable form
    3644             :  */
    3645             : static void
    3646           0 : dumpnfa(struct nfa *nfa,
    3647             :         FILE *f)
    3648             : {
    3649             : #ifdef REG_DEBUG
    3650             :     struct state *s;
    3651             :     int         nstates = 0;
    3652             :     int         narcs = 0;
    3653             : 
    3654             :     fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
    3655             :     if (nfa->bos[0] != COLORLESS)
    3656             :         fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
    3657             :     if (nfa->bos[1] != COLORLESS)
    3658             :         fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
    3659             :     if (nfa->eos[0] != COLORLESS)
    3660             :         fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
    3661             :     if (nfa->eos[1] != COLORLESS)
    3662             :         fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
    3663             :     if (nfa->flags & HASLACONS)
    3664             :         fprintf(f, ", haslacons");
    3665             :     if (nfa->flags & HASCANTMATCH)
    3666             :         fprintf(f, ", hascantmatch");
    3667             :     if (nfa->flags & MATCHALL)
    3668             :     {
    3669             :         fprintf(f, ", minmatchall %d", nfa->minmatchall);
    3670             :         if (nfa->maxmatchall == DUPINF)
    3671             :             fprintf(f, ", maxmatchall inf");
    3672             :         else
    3673             :             fprintf(f, ", maxmatchall %d", nfa->maxmatchall);
    3674             :     }
    3675             :     fprintf(f, "\n");
    3676             :     for (s = nfa->states; s != NULL; s = s->next)
    3677             :     {
    3678             :         dumpstate(s, f);
    3679             :         nstates++;
    3680             :         narcs += s->nouts;
    3681             :     }
    3682             :     fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
    3683             :     if (nfa->parent == NULL)
    3684             :         dumpcolors(nfa->cm, f);
    3685             :     fflush(f);
    3686             : #endif
    3687           0 : }
    3688             : 
    3689             : #ifdef REG_DEBUG                /* subordinates of dumpnfa */
    3690             : 
    3691             : /*
    3692             :  * dumpstate - dump an NFA state in human-readable form
    3693             :  */
    3694             : static void
    3695             : dumpstate(struct state *s,
    3696             :           FILE *f)
    3697             : {
    3698             :     struct arc *a;
    3699             : 
    3700             :     fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
    3701             :             (s->flag) ? s->flag : '.');
    3702             :     if (s->prev != NULL && s->prev->next != s)
    3703             :         fprintf(f, "\tstate chain bad\n");
    3704             :     if (s->nouts == 0)
    3705             :         fprintf(f, "\tno out arcs\n");
    3706             :     else
    3707             :         dumparcs(s, f);
    3708             :     for (a = s->ins; a != NULL; a = a->inchain)
    3709             :     {
    3710             :         if (a->to != s)
    3711             :             fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
    3712             :                     a->from->no, a->to->no, s->no);
    3713             :     }
    3714             :     fflush(f);
    3715             : }
    3716             : 
    3717             : /*
    3718             :  * dumparcs - dump out-arcs in human-readable form
    3719             :  */
    3720             : static void
    3721             : dumparcs(struct state *s,
    3722             :          FILE *f)
    3723             : {
    3724             :     int         pos;
    3725             :     struct arc *a;
    3726             : 
    3727             :     /* printing oldest arcs first is usually clearer */
    3728             :     a = s->outs;
    3729             :     assert(a != NULL);
    3730             :     while (a->outchain != NULL)
    3731             :         a = a->outchain;
    3732             :     pos = 1;
    3733             :     do
    3734             :     {
    3735             :         dumparc(a, s, f);
    3736             :         if (pos == 5)
    3737             :         {
    3738             :             fprintf(f, "\n");
    3739             :             pos = 1;
    3740             :         }
    3741             :         else
    3742             :             pos++;
    3743             :         a = a->outchainRev;
    3744             :     } while (a != NULL);
    3745             :     if (pos != 1)
    3746             :         fprintf(f, "\n");
    3747             : }
    3748             : 
    3749             : /*
    3750             :  * dumparc - dump one outarc in readable form, including prefixing tab
    3751             :  */
    3752             : static void
    3753             : dumparc(struct arc *a,
    3754             :         struct state *s,
    3755             :         FILE *f)
    3756             : {
    3757             :     struct arc *aa;
    3758             : 
    3759             :     fprintf(f, "\t");
    3760             :     switch (a->type)
    3761             :     {
    3762             :         case PLAIN:
    3763             :             if (a->co == RAINBOW)
    3764             :                 fprintf(f, "[*]");
    3765             :             else
    3766             :                 fprintf(f, "[%ld]", (long) a->co);
    3767             :             break;
    3768             :         case AHEAD:
    3769             :             if (a->co == RAINBOW)
    3770             :                 fprintf(f, ">*>");
    3771             :             else
    3772             :                 fprintf(f, ">%ld>", (long) a->co);
    3773             :             break;
    3774             :         case BEHIND:
    3775             :             if (a->co == RAINBOW)
    3776             :                 fprintf(f, "<*<");
    3777             :             else
    3778             :                 fprintf(f, "<%ld<", (long) a->co);
    3779             :             break;
    3780             :         case LACON:
    3781             :             fprintf(f, ":%ld:", (long) a->co);
    3782             :             break;
    3783             :         case '^':
    3784             :         case '$':
    3785             :             fprintf(f, "%c%d", a->type, (int) a->co);
    3786             :             break;
    3787             :         case EMPTY:
    3788             :             break;
    3789             :         case CANTMATCH:
    3790             :             fprintf(f, "X");
    3791             :             break;
    3792             :         default:
    3793             :             fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
    3794             :             break;
    3795             :     }
    3796             :     if (a->from != s)
    3797             :         fprintf(f, "?%d?", a->from->no);
    3798             :     for (aa = a->from->outs; aa != NULL; aa = aa->outchain)
    3799             :         if (aa == a)
    3800             :             break;              /* NOTE BREAK OUT */
    3801             :     if (aa == NULL)
    3802             :         fprintf(f, "?!?");        /* missing from out-chain */
    3803             :     fprintf(f, "->");
    3804             :     if (a->to == NULL)
    3805             :     {
    3806             :         fprintf(f, "NULL");
    3807             :         return;
    3808             :     }
    3809             :     fprintf(f, "%d", a->to->no);
    3810             :     for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
    3811             :         if (aa == a)
    3812             :             break;              /* NOTE BREAK OUT */
    3813             :     if (aa == NULL)
    3814             :         fprintf(f, "?!?");        /* missing from in-chain */
    3815             : }
    3816             : #endif                          /* REG_DEBUG */
    3817             : 
    3818             : /*
    3819             :  * dumpcnfa - dump a compacted NFA in human-readable form
    3820             :  */
    3821             : #ifdef REG_DEBUG
    3822             : static void
    3823             : dumpcnfa(struct cnfa *cnfa,
    3824             :          FILE *f)
    3825             : {
    3826             :     int         st;
    3827             : 
    3828             :     fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
    3829             :     if (cnfa->bos[0] != COLORLESS)
    3830             :         fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
    3831             :     if (cnfa->bos[1] != COLORLESS)
    3832             :         fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
    3833             :     if (cnfa->eos[0] != COLORLESS)
    3834             :         fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
    3835             :     if (cnfa->eos[1] != COLORLESS)
    3836             :         fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
    3837             :     if (cnfa->flags & HASLACONS)
    3838             :         fprintf(f, ", haslacons");
    3839             :     if (cnfa->flags & MATCHALL)
    3840             :     {
    3841             :         fprintf(f, ", minmatchall %d", cnfa->minmatchall);
    3842             :         if (cnfa->maxmatchall == DUPINF)
    3843             :             fprintf(f, ", maxmatchall inf");
    3844             :         else
    3845             :             fprintf(f, ", maxmatchall %d", cnfa->maxmatchall);
    3846             :     }
    3847             :     fprintf(f, "\n");
    3848             :     for (st = 0; st < cnfa->nstates; st++)
    3849             :         dumpcstate(st, cnfa, f);
    3850             :     fflush(f);
    3851             : }
    3852             : #endif
    3853             : 
    3854             : #ifdef REG_DEBUG                /* subordinates of dumpcnfa */
    3855             : 
    3856             : /*
    3857             :  * dumpcstate - dump a compacted-NFA state in human-readable form
    3858             :  */
    3859             : static void
    3860             : dumpcstate(int st,
    3861             :            struct cnfa *cnfa,
    3862             :            FILE *f)
    3863             : {
    3864             :     struct carc *ca;
    3865             :     int         pos;
    3866             : 
    3867             :     fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
    3868             :     pos = 1;
    3869             :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
    3870             :     {
    3871             :         if (ca->co == RAINBOW)
    3872             :             fprintf(f, "\t[*]->%d", ca->to);
    3873             :         else if (ca->co < cnfa->ncolors)
    3874             :             fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
    3875             :         else
    3876             :             fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
    3877             :         if (pos == 5)
    3878             :         {
    3879             :             fprintf(f, "\n");
    3880             :             pos = 1;
    3881             :         }
    3882             :         else
    3883             :             pos++;
    3884             :     }
    3885             :     if (ca == cnfa->states[st] || pos != 1)
    3886             :         fprintf(f, "\n");
    3887             :     fflush(f);
    3888             : }
    3889             : 
    3890             : #endif                          /* REG_DEBUG */

Generated by: LCOV version 1.16