Line data Source code
1 : /*
2 : * DFA routines
3 : * This file is #included by regexec.c.
4 : *
5 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 : *
7 : * Development of this software was funded, in part, by Cray Research Inc.,
8 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 : * Corporation, none of whom are responsible for the results. The author
10 : * thanks all of them.
11 : *
12 : * Redistribution and use in source and binary forms -- with or without
13 : * modification -- are permitted for any purpose, provided that
14 : * redistributions in source form retain this entire copyright notice and
15 : * indicate the origin and nature of any modifications.
16 : *
17 : * I'd appreciate being given credit for this package in the documentation
18 : * of software which uses it, but that is not a requirement.
19 : *
20 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 : *
31 : * src/backend/regex/rege_dfa.c
32 : *
33 : */
34 :
35 : /*
36 : * longest - longest-preferred matching engine
37 : *
38 : * On success, returns match endpoint address. Returns NULL on no match.
39 : * Internal errors also return NULL, with v->err set.
40 : */
41 : static chr *
42 493503 : longest(struct vars *v,
43 : struct dfa *d,
44 : chr *start, /* where the match should start */
45 : chr *stop, /* match must end at or before here */
46 : int *hitstopp) /* record whether hit v->stop, if non-NULL */
47 : {
48 : chr *cp;
49 493503 : chr *realstop = (stop == v->stop) ? stop : stop + 1;
50 : color co;
51 : struct sset *css;
52 : struct sset *ss;
53 : chr *post;
54 : int i;
55 493503 : struct colormap *cm = d->cm;
56 :
57 : /* prevent "uninitialized variable" warnings */
58 493503 : if (hitstopp != NULL)
59 471277 : *hitstopp = 0;
60 :
61 : /* if this is a backref to a known string, just match against that */
62 493503 : if (d->backno >= 0)
63 : {
64 : assert((size_t) d->backno < v->nmatch);
65 1120 : if (v->pmatch[d->backno].rm_so >= 0)
66 : {
67 861 : cp = dfa_backref(v, d, start, start, stop, false);
68 861 : if (cp == v->stop && stop == v->stop && hitstopp != NULL)
69 0 : *hitstopp = 1;
70 861 : return cp;
71 : }
72 : }
73 :
74 : /* fast path for matchall NFAs */
75 492642 : if (d->cnfa->flags & MATCHALL)
76 : {
77 2995 : size_t nchr = stop - start;
78 2995 : size_t maxmatchall = d->cnfa->maxmatchall;
79 :
80 2995 : if (nchr < d->cnfa->minmatchall)
81 246 : return NULL;
82 2749 : if (maxmatchall == DUPINF)
83 : {
84 1495 : if (stop == v->stop && hitstopp != NULL)
85 7 : *hitstopp = 1;
86 : }
87 : else
88 : {
89 1254 : if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
90 128 : *hitstopp = 1;
91 1254 : if (nchr > maxmatchall)
92 855 : return start + maxmatchall;
93 : }
94 1894 : return stop;
95 : }
96 :
97 : /* initialize */
98 489647 : css = initialize(v, d, start);
99 489647 : if (css == NULL)
100 0 : return NULL;
101 489647 : cp = start;
102 :
103 : /* startup */
104 : FDEBUG(("+++ startup +++\n"));
105 489647 : if (cp == v->start)
106 : {
107 2191 : co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
108 : FDEBUG(("color %ld\n", (long) co));
109 : }
110 : else
111 : {
112 487456 : co = GETCOLOR(cm, *(cp - 1));
113 : FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
114 : }
115 489647 : css = miss(v, d, css, co, cp, start);
116 489647 : if (css == NULL)
117 324 : return NULL;
118 489323 : css->lastseen = cp;
119 :
120 : /*
121 : * This is the main text-scanning loop. It seems worth having two copies
122 : * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
123 : * builds, when you're not actively tracing.
124 : */
125 : #ifdef REG_DEBUG
126 : if (v->eflags & REG_FTRACE)
127 : {
128 : while (cp < realstop)
129 : {
130 : FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
131 : co = GETCOLOR(cm, *cp);
132 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
133 : ss = css->outs[co];
134 : if (ss == NULL)
135 : {
136 : ss = miss(v, d, css, co, cp + 1, start);
137 : if (ss == NULL)
138 : break; /* NOTE BREAK OUT */
139 : }
140 : cp++;
141 : ss->lastseen = cp;
142 : css = ss;
143 : }
144 : }
145 : else
146 : #endif
147 : {
148 6124125 : while (cp < realstop)
149 : {
150 6106695 : co = GETCOLOR(cm, *cp);
151 6106695 : ss = css->outs[co];
152 6106695 : if (ss == NULL)
153 : {
154 1596280 : ss = miss(v, d, css, co, cp + 1, start);
155 1596280 : if (ss == NULL)
156 471893 : break; /* NOTE BREAK OUT */
157 : }
158 5634802 : cp++;
159 5634802 : ss->lastseen = cp;
160 5634802 : css = ss;
161 : }
162 : }
163 :
164 489323 : if (ISERR())
165 0 : return NULL;
166 :
167 : /* shutdown */
168 : FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
169 489323 : if (cp == v->stop && stop == v->stop)
170 : {
171 9214 : if (hitstopp != NULL)
172 4809 : *hitstopp = 1;
173 9214 : co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
174 : FDEBUG(("color %ld\n", (long) co));
175 9214 : ss = miss(v, d, css, co, cp, start);
176 9214 : if (ISERR())
177 0 : return NULL;
178 : /* special case: match ended at eol? */
179 9214 : if (ss != NULL && (ss->flags & POSTSTATE))
180 5073 : return cp;
181 4141 : else if (ss != NULL)
182 0 : ss->lastseen = cp; /* to be tidy */
183 : }
184 :
185 : /* find last match, if any */
186 484250 : post = d->lastpost;
187 2529827 : for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
188 2045577 : if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
189 9611 : (post == NULL || post < ss->lastseen))
190 491014 : post = ss->lastseen;
191 484250 : if (post != NULL) /* found one */
192 481949 : return post - 1;
193 :
194 2301 : return NULL;
195 : }
196 :
197 : /*
198 : * shortest - shortest-preferred matching engine
199 : *
200 : * On success, returns match endpoint address. Returns NULL on no match.
201 : * Internal errors also return NULL, with v->err set.
202 : */
203 : static chr *
204 5762128 : shortest(struct vars *v,
205 : struct dfa *d,
206 : chr *start, /* where the match should start */
207 : chr *min, /* match must end at or after here */
208 : chr *max, /* match must end at or before here */
209 : chr **coldp, /* store coldstart pointer here, if non-NULL */
210 : int *hitstopp) /* record whether hit v->stop, if non-NULL */
211 : {
212 : chr *cp;
213 5762128 : chr *realmin = (min == v->stop) ? min : min + 1;
214 5762128 : chr *realmax = (max == v->stop) ? max : max + 1;
215 : color co;
216 : struct sset *css;
217 : struct sset *ss;
218 5762128 : struct colormap *cm = d->cm;
219 :
220 : /* prevent "uninitialized variable" warnings */
221 5762128 : if (coldp != NULL)
222 5760957 : *coldp = NULL;
223 5762128 : if (hitstopp != NULL)
224 227 : *hitstopp = 0;
225 :
226 : /* if this is a backref to a known string, just match against that */
227 5762128 : if (d->backno >= 0)
228 : {
229 : assert((size_t) d->backno < v->nmatch);
230 0 : if (v->pmatch[d->backno].rm_so >= 0)
231 : {
232 0 : cp = dfa_backref(v, d, start, min, max, true);
233 0 : if (cp != NULL && coldp != NULL)
234 0 : *coldp = start;
235 : /* there is no case where we should set *hitstopp */
236 0 : return cp;
237 : }
238 : }
239 :
240 : /* fast path for matchall NFAs */
241 5762128 : if (d->cnfa->flags & MATCHALL)
242 : {
243 1440 : size_t nchr = min - start;
244 :
245 1440 : if (d->cnfa->maxmatchall != DUPINF &&
246 16 : nchr > d->cnfa->maxmatchall)
247 0 : return NULL;
248 1440 : if ((max - start) < d->cnfa->minmatchall)
249 15 : return NULL;
250 1425 : if (nchr < d->cnfa->minmatchall)
251 99 : min = start + d->cnfa->minmatchall;
252 1425 : if (coldp != NULL)
253 676 : *coldp = start;
254 : /* there is no case where we should set *hitstopp */
255 1425 : return min;
256 : }
257 :
258 : /* initialize */
259 5760688 : css = initialize(v, d, start);
260 5760688 : if (css == NULL)
261 0 : return NULL;
262 5760688 : cp = start;
263 :
264 : /* startup */
265 : FDEBUG(("--- startup ---\n"));
266 5760688 : if (cp == v->start)
267 : {
268 5295457 : co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
269 : FDEBUG(("color %ld\n", (long) co));
270 : }
271 : else
272 : {
273 465231 : co = GETCOLOR(cm, *(cp - 1));
274 : FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
275 : }
276 5760688 : css = miss(v, d, css, co, cp, start);
277 5760688 : if (css == NULL)
278 8 : return NULL;
279 5760680 : css->lastseen = cp;
280 5760680 : ss = css;
281 :
282 : /*
283 : * This is the main text-scanning loop. It seems worth having two copies
284 : * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
285 : * builds, when you're not actively tracing.
286 : */
287 : #ifdef REG_DEBUG
288 : if (v->eflags & REG_FTRACE)
289 : {
290 : while (cp < realmax)
291 : {
292 : FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
293 : co = GETCOLOR(cm, *cp);
294 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
295 : ss = css->outs[co];
296 : if (ss == NULL)
297 : {
298 : ss = miss(v, d, css, co, cp + 1, start);
299 : if (ss == NULL)
300 : break; /* NOTE BREAK OUT */
301 : }
302 : cp++;
303 : ss->lastseen = cp;
304 : css = ss;
305 : if ((ss->flags & POSTSTATE) && cp >= realmin)
306 : break; /* NOTE BREAK OUT */
307 : }
308 : }
309 : else
310 : #endif
311 : {
312 25638094 : while (cp < realmax)
313 : {
314 25335073 : co = GETCOLOR(cm, *cp);
315 25335073 : ss = css->outs[co];
316 25335073 : if (ss == NULL)
317 : {
318 9026880 : ss = miss(v, d, css, co, cp + 1, start);
319 9026880 : if (ss == NULL)
320 4870365 : break; /* NOTE BREAK OUT */
321 : }
322 20464708 : cp++;
323 20464708 : ss->lastseen = cp;
324 20464708 : css = ss;
325 20464708 : if ((ss->flags & POSTSTATE) && cp >= realmin)
326 587294 : break; /* NOTE BREAK OUT */
327 : }
328 : }
329 :
330 5760680 : if (ss == NULL)
331 4870365 : return NULL;
332 :
333 890315 : if (coldp != NULL) /* report last no-progress state set, if any */
334 889937 : *coldp = lastcold(v, d);
335 :
336 890315 : if ((ss->flags & POSTSTATE) && cp > min)
337 : {
338 : assert(cp >= realmin);
339 587273 : cp--;
340 : }
341 303042 : else if (cp == v->stop && max == v->stop)
342 : {
343 303042 : co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
344 : FDEBUG(("color %ld\n", (long) co));
345 303042 : ss = miss(v, d, css, co, cp, start);
346 : /* match might have ended at eol */
347 303042 : if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
348 8 : *hitstopp = 1;
349 : }
350 :
351 890315 : if (ss == NULL || !(ss->flags & POSTSTATE))
352 290068 : return NULL;
353 :
354 600247 : return cp;
355 : }
356 :
357 : /*
358 : * matchuntil - incremental matching engine
359 : *
360 : * This is meant for use with a search-style NFA (that is, the pattern is
361 : * known to act as though it had a leading .*). We determine whether a
362 : * match exists starting at v->start and ending at probe. Multiple calls
363 : * require only O(N) time not O(N^2) so long as the probe values are
364 : * nondecreasing. *lastcss and *lastcp must be initialized to NULL before
365 : * starting a series of calls.
366 : *
367 : * Returns 1 if a match exists, 0 if not.
368 : * Internal errors also return 0, with v->err set.
369 : */
370 : static int
371 74 : matchuntil(struct vars *v,
372 : struct dfa *d,
373 : chr *probe, /* we want to know if a match ends here */
374 : struct sset **lastcss, /* state storage across calls */
375 : chr **lastcp) /* state storage across calls */
376 : {
377 74 : chr *cp = *lastcp;
378 : color co;
379 74 : struct sset *css = *lastcss;
380 : struct sset *ss;
381 74 : struct colormap *cm = d->cm;
382 :
383 : /* fast path for matchall NFAs */
384 74 : if (d->cnfa->flags & MATCHALL)
385 : {
386 18 : size_t nchr = probe - v->start;
387 :
388 18 : if (nchr < d->cnfa->minmatchall)
389 9 : return 0;
390 : /* maxmatchall will always be infinity, cf. makesearch() */
391 : assert(d->cnfa->maxmatchall == DUPINF);
392 9 : return 1;
393 : }
394 :
395 : /* initialize and startup, or restart, if necessary */
396 56 : if (cp == NULL || cp > probe)
397 : {
398 16 : cp = v->start;
399 16 : css = initialize(v, d, cp);
400 16 : if (css == NULL)
401 0 : return 0;
402 :
403 : FDEBUG((">>> startup >>>\n"));
404 16 : co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
405 : FDEBUG(("color %ld\n", (long) co));
406 :
407 16 : css = miss(v, d, css, co, cp, v->start);
408 16 : if (css == NULL)
409 0 : return 0;
410 16 : css->lastseen = cp;
411 : }
412 40 : else if (css == NULL)
413 : {
414 : /* we previously found that no match is possible beyond *lastcp */
415 0 : return 0;
416 : }
417 56 : ss = css;
418 :
419 : /*
420 : * This is the main text-scanning loop. It seems worth having two copies
421 : * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
422 : * builds, when you're not actively tracing.
423 : */
424 : #ifdef REG_DEBUG
425 : if (v->eflags & REG_FTRACE)
426 : {
427 : while (cp < probe)
428 : {
429 : FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
430 : co = GETCOLOR(cm, *cp);
431 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
432 : ss = css->outs[co];
433 : if (ss == NULL)
434 : {
435 : ss = miss(v, d, css, co, cp + 1, v->start);
436 : if (ss == NULL)
437 : break; /* NOTE BREAK OUT */
438 : }
439 : cp++;
440 : ss->lastseen = cp;
441 : css = ss;
442 : }
443 : }
444 : else
445 : #endif
446 : {
447 120 : while (cp < probe)
448 : {
449 64 : co = GETCOLOR(cm, *cp);
450 64 : ss = css->outs[co];
451 64 : if (ss == NULL)
452 : {
453 8 : ss = miss(v, d, css, co, cp + 1, v->start);
454 8 : if (ss == NULL)
455 0 : break; /* NOTE BREAK OUT */
456 : }
457 64 : cp++;
458 64 : ss->lastseen = cp;
459 64 : css = ss;
460 : }
461 : }
462 :
463 56 : *lastcss = ss;
464 56 : *lastcp = cp;
465 :
466 56 : if (ss == NULL)
467 0 : return 0; /* impossible match, or internal error */
468 :
469 : /* We need to process one more chr, or the EOS symbol, to check match */
470 56 : if (cp < v->stop)
471 : {
472 : FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
473 56 : co = GETCOLOR(cm, *cp);
474 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
475 56 : ss = css->outs[co];
476 56 : if (ss == NULL)
477 36 : ss = miss(v, d, css, co, cp + 1, v->start);
478 : }
479 : else
480 : {
481 : assert(cp == v->stop);
482 0 : co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
483 : FDEBUG(("color %ld\n", (long) co));
484 0 : ss = miss(v, d, css, co, cp, v->start);
485 : }
486 :
487 56 : if (ss == NULL || !(ss->flags & POSTSTATE))
488 40 : return 0;
489 :
490 16 : return 1;
491 : }
492 :
493 : /*
494 : * dfa_backref - find best match length for a known backref string
495 : *
496 : * When the backref's referent is already available, we can deliver an exact
497 : * answer with considerably less work than running the backref node's NFA.
498 : *
499 : * Return match endpoint for longest or shortest valid repeated match,
500 : * or NULL if there is no valid match.
501 : *
502 : * Should be in sync with cbrdissect(), although that has the different task
503 : * of checking a match to a predetermined section of the string.
504 : */
505 : static chr *
506 861 : dfa_backref(struct vars *v,
507 : struct dfa *d,
508 : chr *start, /* where the match should start */
509 : chr *min, /* match must end at or after here */
510 : chr *max, /* match must end at or before here */
511 : bool shortest)
512 : {
513 861 : int n = d->backno;
514 861 : int backmin = d->backmin;
515 861 : int backmax = d->backmax;
516 : size_t numreps;
517 : size_t minreps;
518 : size_t maxreps;
519 : size_t brlen;
520 : chr *brstring;
521 : chr *p;
522 :
523 : /* get the backreferenced string (caller should have checked this) */
524 861 : if (v->pmatch[n].rm_so == -1)
525 0 : return NULL;
526 861 : brstring = v->start + v->pmatch[n].rm_so;
527 861 : brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
528 :
529 : /* special-case zero-length backreference to avoid divide by zero */
530 861 : if (brlen == 0)
531 : {
532 : /*
533 : * matches only a zero-length string, but any number of repetitions
534 : * can be considered to be present
535 : */
536 1 : if (min == start && backmin <= backmax)
537 1 : return start;
538 0 : return NULL;
539 : }
540 :
541 : /*
542 : * convert min and max into numbers of possible repetitions of the backref
543 : * string, rounding appropriately
544 : */
545 860 : if (min <= start)
546 860 : minreps = 0;
547 : else
548 0 : minreps = (min - start - 1) / brlen + 1;
549 860 : maxreps = (max - start) / brlen;
550 :
551 : /* apply bounds, then see if there is any allowed match length */
552 860 : if (minreps < backmin)
553 832 : minreps = backmin;
554 860 : if (backmax != DUPINF && maxreps > backmax)
555 426 : maxreps = backmax;
556 860 : if (maxreps < minreps)
557 178 : return NULL;
558 :
559 : /* quick exit if zero-repetitions match is valid and preferred */
560 682 : if (shortest && minreps == 0)
561 0 : return start;
562 :
563 : /* okay, compare the actual string contents */
564 682 : p = start;
565 682 : numreps = 0;
566 805 : while (numreps < maxreps)
567 : {
568 699 : if ((*v->g->compare) (brstring, p, brlen) != 0)
569 576 : break;
570 123 : p += brlen;
571 123 : numreps++;
572 123 : if (shortest && numreps >= minreps)
573 0 : break;
574 : }
575 :
576 682 : if (numreps >= minreps)
577 112 : return p;
578 570 : return NULL;
579 : }
580 :
581 : /*
582 : * lastcold - determine last point at which no progress had been made
583 : */
584 : static chr * /* endpoint, or NULL */
585 889937 : lastcold(struct vars *v,
586 : struct dfa *d)
587 : {
588 : struct sset *ss;
589 : chr *nopr;
590 : int i;
591 :
592 889937 : nopr = d->lastnopr;
593 889937 : if (nopr == NULL)
594 889935 : nopr = v->start;
595 4747887 : for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
596 3857950 : if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
597 1230775 : nopr = ss->lastseen;
598 889937 : return nopr;
599 : }
600 :
601 : /*
602 : * newdfa - set up a fresh DFA
603 : *
604 : * Returns NULL (and sets v->err) on failure.
605 : */
606 : static struct dfa *
607 6248623 : newdfa(struct vars *v,
608 : struct cnfa *cnfa,
609 : struct colormap *cm,
610 : struct smalldfa *sml) /* preallocated space, may be NULL */
611 : {
612 : struct dfa *d;
613 6248623 : size_t nss = cnfa->nstates * 2;
614 6248623 : int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
615 6248623 : bool ismalloced = false;
616 :
617 : assert(cnfa != NULL && cnfa->nstates != 0);
618 :
619 6248623 : if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
620 : {
621 : assert(wordsper == 1);
622 2878386 : if (sml == NULL)
623 : {
624 15908 : sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
625 15908 : if (sml == NULL)
626 : {
627 0 : ERR(REG_ESPACE);
628 0 : return NULL;
629 : }
630 15908 : ismalloced = true;
631 : }
632 2878386 : d = &sml->dfa;
633 2878386 : d->ssets = sml->ssets;
634 2878386 : d->statesarea = sml->statesarea;
635 2878386 : d->work = &d->statesarea[nss];
636 2878386 : d->outsarea = sml->outsarea;
637 2878386 : d->incarea = sml->incarea;
638 2878386 : d->ismalloced = ismalloced;
639 2878386 : d->arraysmalloced = false; /* not separately allocated, anyway */
640 : }
641 : else
642 : {
643 : /*
644 : * Restrict the ranges of nstates and ncolors enough that the arrays
645 : * we allocate here have no more than INT_MAX members. This protects
646 : * not only the allocation calculations just below, but later indexing
647 : * into these arrays.
648 : */
649 3370237 : if (wordsper >= INT_MAX / (nss + WORK) ||
650 3370237 : cnfa->ncolors >= INT_MAX / nss)
651 : {
652 0 : ERR(REG_ETOOBIG);
653 0 : return NULL;
654 : }
655 3370237 : d = (struct dfa *) MALLOC(sizeof(struct dfa));
656 3370237 : if (d == NULL)
657 : {
658 0 : ERR(REG_ESPACE);
659 0 : return NULL;
660 : }
661 3370237 : d->ssets = MALLOC_ARRAY(struct sset, nss);
662 3370237 : d->statesarea = MALLOC_ARRAY(unsigned, (nss + WORK) * wordsper);
663 3370237 : d->work = &d->statesarea[nss * wordsper];
664 3370237 : d->outsarea = MALLOC_ARRAY(struct sset *, nss * cnfa->ncolors);
665 3370237 : d->incarea = MALLOC_ARRAY(struct arcp, nss * cnfa->ncolors);
666 3370237 : d->ismalloced = true;
667 3370237 : d->arraysmalloced = true;
668 : /* now freedfa() will behave sanely */
669 3370237 : if (d->ssets == NULL || d->statesarea == NULL ||
670 3370237 : d->outsarea == NULL || d->incarea == NULL)
671 : {
672 0 : freedfa(d);
673 0 : ERR(REG_ESPACE);
674 0 : return NULL;
675 : }
676 : }
677 :
678 6248623 : d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
679 6248623 : d->nssused = 0;
680 6248623 : d->nstates = cnfa->nstates;
681 6248623 : d->ncolors = cnfa->ncolors;
682 6248623 : d->wordsper = wordsper;
683 6248623 : d->cnfa = cnfa;
684 6248623 : d->cm = cm;
685 6248623 : d->lastpost = NULL;
686 6248623 : d->lastnopr = NULL;
687 6248623 : d->search = d->ssets;
688 6248623 : d->backno = -1; /* may be set by caller */
689 6248623 : d->backmin = d->backmax = 0;
690 :
691 : /* initialization of sset fields is done as needed */
692 :
693 6248623 : return d;
694 : }
695 :
696 : /*
697 : * freedfa - free a DFA
698 : */
699 : static void
700 6248623 : freedfa(struct dfa *d)
701 : {
702 6248623 : if (d->arraysmalloced)
703 : {
704 3370237 : if (d->ssets != NULL)
705 3370237 : FREE(d->ssets);
706 3370237 : if (d->statesarea != NULL)
707 3370237 : FREE(d->statesarea);
708 3370237 : if (d->outsarea != NULL)
709 3370237 : FREE(d->outsarea);
710 3370237 : if (d->incarea != NULL)
711 3370237 : FREE(d->incarea);
712 : }
713 :
714 6248623 : if (d->ismalloced)
715 3386145 : FREE(d);
716 6248623 : }
717 :
718 : /*
719 : * hash - construct a hash code for a bitvector
720 : *
721 : * There are probably better ways, but they're more expensive.
722 : */
723 : static unsigned
724 22274 : hash(unsigned *uv,
725 : int n)
726 : {
727 : int i;
728 : unsigned h;
729 :
730 22274 : h = 0;
731 95107 : for (i = 0; i < n; i++)
732 72833 : h ^= uv[i];
733 22274 : return h;
734 : }
735 :
736 : /*
737 : * initialize - hand-craft a cache entry for startup, otherwise get ready
738 : */
739 : static struct sset *
740 6250351 : initialize(struct vars *v,
741 : struct dfa *d,
742 : chr *start)
743 : {
744 : struct sset *ss;
745 : int i;
746 :
747 : /* is previous one still there? */
748 6250351 : if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
749 3627 : ss = &d->ssets[0];
750 : else
751 : { /* no, must (re)build it */
752 6246724 : ss = getvacant(v, d, start, start);
753 6246724 : if (ss == NULL)
754 0 : return NULL;
755 12495247 : for (i = 0; i < d->wordsper; i++)
756 6248523 : ss->states[i] = 0;
757 6246724 : BSET(ss->states, d->cnfa->pre);
758 6246724 : ss->hash = HASH(ss->states, d->wordsper);
759 : assert(d->cnfa->pre != d->cnfa->post);
760 6246724 : ss->flags = STARTER | LOCKED | NOPROGRESS;
761 : /* lastseen dealt with below */
762 : }
763 :
764 12541285 : for (i = 0; i < d->nssused; i++)
765 6290934 : d->ssets[i].lastseen = NULL;
766 6250351 : ss->lastseen = start; /* maybe untrue, but harmless */
767 6250351 : d->lastpost = NULL;
768 6250351 : d->lastnopr = NULL;
769 6250351 : return ss;
770 : }
771 :
772 : /*
773 : * miss - handle a stateset cache miss
774 : *
775 : * css is the current stateset, co is the color of the current input character,
776 : * cp points to the character after that (which is where we may need to test
777 : * LACONs). start does not affect matching behavior but is needed for pickss'
778 : * heuristics about which stateset cache entry to replace.
779 : *
780 : * Ordinarily, returns the address of the next stateset (the one that is
781 : * valid after consuming the input character). Returns NULL if no valid
782 : * NFA states remain, ie we have a certain match failure.
783 : * Internal errors also return NULL, with v->err set.
784 : */
785 : static struct sset *
786 17185811 : miss(struct vars *v,
787 : struct dfa *d,
788 : struct sset *css,
789 : color co,
790 : chr *cp, /* next chr */
791 : chr *start) /* where the attempt got started */
792 : {
793 17185811 : struct cnfa *cnfa = d->cnfa;
794 : int i;
795 : unsigned h;
796 : struct carc *ca;
797 : struct sset *p;
798 : int ispseudocolor;
799 : int ispost;
800 : int noprogress;
801 : int gotstate;
802 : int dolacons;
803 : int sawlacons;
804 :
805 : /* for convenience, we can be called even if it might not be a miss */
806 17185811 : if (css->outs[co] != NULL)
807 : {
808 : FDEBUG(("hit\n"));
809 2927 : return css->outs[co];
810 : }
811 : FDEBUG(("miss\n"));
812 :
813 : /*
814 : * Checking for operation cancel in the inner text search loop seems
815 : * unduly expensive. As a compromise, check during cache misses.
816 : */
817 17182884 : INTERRUPT(v->re);
818 :
819 : /*
820 : * What set of states would we end up in after consuming the co character?
821 : * We first consider PLAIN arcs that consume the character, and then look
822 : * to see what LACON arcs could be traversed after consuming it.
823 : */
824 34417793 : for (i = 0; i < d->wordsper; i++)
825 17234909 : d->work[i] = 0; /* build new stateset bitmap in d->work */
826 17182884 : ispseudocolor = d->cm->cd[co].flags & PSEUDO;
827 17182884 : ispost = 0;
828 17182884 : noprogress = 1;
829 17182884 : gotstate = 0;
830 212478750 : for (i = 0; i < d->nstates; i++)
831 195295866 : if (ISBSET(css->states, i))
832 68870947 : for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
833 46585863 : if (ca->co == co ||
834 37959337 : (ca->co == RAINBOW && !ispseudocolor))
835 : {
836 17993966 : BSET(d->work, ca->to);
837 17993966 : gotstate = 1;
838 17993966 : if (ca->to == cnfa->post)
839 1105832 : ispost = 1;
840 17993966 : if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
841 4947188 : noprogress = 0;
842 : FDEBUG(("%d -> %d\n", i, ca->to));
843 : }
844 17182884 : if (!gotstate)
845 5636799 : return NULL; /* character cannot reach any new state */
846 11546085 : dolacons = (cnfa->flags & HASLACONS);
847 11546085 : sawlacons = 0;
848 : /* outer loop handles transitive closure of reachable-by-LACON states */
849 11547350 : while (dolacons)
850 : {
851 1265 : dolacons = 0;
852 11822 : for (i = 0; i < d->nstates; i++)
853 10557 : if (ISBSET(d->work, i))
854 3718 : for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
855 : {
856 2163 : if (ca->co < cnfa->ncolors)
857 1921 : continue; /* not a LACON arc */
858 242 : if (ISBSET(d->work, ca->to))
859 84 : continue; /* arc would be a no-op anyway */
860 158 : sawlacons = 1; /* this LACON affects our result */
861 158 : if (!lacon(v, cnfa, cp, ca->co))
862 : {
863 85 : if (ISERR())
864 0 : return NULL;
865 85 : continue; /* LACON arc cannot be traversed */
866 : }
867 73 : if (ISERR())
868 0 : return NULL;
869 73 : BSET(d->work, ca->to);
870 73 : dolacons = 1;
871 73 : if (ca->to == cnfa->post)
872 0 : ispost = 1;
873 73 : if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
874 73 : noprogress = 0;
875 : FDEBUG(("%d :> %d\n", i, ca->to));
876 : }
877 : }
878 11546085 : h = HASH(d->work, d->wordsper);
879 :
880 : /* Is this stateset already in the cache? */
881 34257215 : for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
882 24147865 : if (HIT(h, d->work, p, d->wordsper))
883 : {
884 : FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
885 : break; /* NOTE BREAK OUT */
886 : }
887 11546085 : if (i == 0)
888 : { /* nope, need a new cache entry */
889 10109350 : p = getvacant(v, d, cp, start);
890 10109350 : if (p == NULL)
891 0 : return NULL;
892 : assert(p != css);
893 20261282 : for (i = 0; i < d->wordsper; i++)
894 10151932 : p->states[i] = d->work[i];
895 10109350 : p->hash = h;
896 10109350 : p->flags = (ispost) ? POSTSTATE : 0;
897 10109350 : if (noprogress)
898 6249655 : p->flags |= NOPROGRESS;
899 : /* lastseen to be dealt with by caller */
900 : }
901 :
902 : /*
903 : * Link new stateset to old, unless a LACON affected the result, in which
904 : * case we don't create the link. That forces future transitions across
905 : * this same arc (same prior stateset and character color) to come through
906 : * miss() again, so that we can recheck the LACON(s), which might or might
907 : * not pass since context will be different.
908 : */
909 11546085 : if (!sawlacons)
910 : {
911 : FDEBUG(("c%d[%d]->c%d\n",
912 : (int) (css - d->ssets), co, (int) (p - d->ssets)));
913 11545973 : css->outs[co] = p;
914 11545973 : css->inchain[co] = p->ins;
915 11545973 : p->ins.ss = css;
916 11545973 : p->ins.co = co;
917 : }
918 11546085 : return p;
919 : }
920 :
921 : /*
922 : * lacon - lookaround-constraint checker for miss()
923 : */
924 : static int /* predicate: constraint satisfied? */
925 158 : lacon(struct vars *v,
926 : struct cnfa *pcnfa, /* parent cnfa */
927 : chr *cp,
928 : color co) /* "color" of the lookaround constraint */
929 : {
930 : int n;
931 : struct subre *sub;
932 : struct dfa *d;
933 : chr *end;
934 : int satisfied;
935 :
936 : /* Since this is recursive, it could be driven to stack overflow */
937 158 : if (STACK_TOO_DEEP(v->re))
938 : {
939 0 : ERR(REG_ETOOBIG);
940 0 : return 0;
941 : }
942 :
943 158 : n = co - pcnfa->ncolors;
944 : assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
945 : FDEBUG(("=== testing lacon %d\n", n));
946 158 : sub = &v->g->lacons[n];
947 158 : d = getladfa(v, n);
948 158 : if (d == NULL)
949 0 : return 0;
950 158 : if (LATYPE_IS_AHEAD(sub->latype))
951 : {
952 : /* used to use longest() here, but shortest() could be much cheaper */
953 84 : end = shortest(v, d, cp, cp, v->stop,
954 : (chr **) NULL, (int *) NULL);
955 84 : satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
956 : }
957 : else
958 : {
959 : /*
960 : * To avoid doing O(N^2) work when repeatedly testing a lookbehind
961 : * constraint in an N-character string, we use matchuntil() which can
962 : * cache the DFA state across calls. We only need to restart if the
963 : * probe point decreases, which is not common. The NFA we're using is
964 : * a search NFA, so it doesn't mind scanning over stuff before the
965 : * nominal match.
966 : */
967 74 : satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
968 74 : if (!LATYPE_IS_POS(sub->latype))
969 0 : satisfied = !satisfied;
970 : }
971 : FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
972 158 : return satisfied;
973 : }
974 :
975 : /*
976 : * getvacant - get a vacant state set
977 : *
978 : * This routine clears out the inarcs and outarcs, but does not otherwise
979 : * clear the innards of the state set -- that's up to the caller.
980 : */
981 : static struct sset *
982 16356074 : getvacant(struct vars *v,
983 : struct dfa *d,
984 : chr *cp,
985 : chr *start)
986 : {
987 : int i;
988 : struct sset *ss;
989 : struct sset *p;
990 : struct arcp ap;
991 : color co;
992 :
993 16356074 : ss = pickss(v, d, cp, start);
994 16356074 : if (ss == NULL)
995 0 : return NULL;
996 : assert(!(ss->flags & LOCKED));
997 :
998 : /* clear out its inarcs, including self-referential ones */
999 16356074 : ap = ss->ins;
1000 16356086 : while ((p = ap.ss) != NULL)
1001 : {
1002 12 : co = ap.co;
1003 : FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
1004 12 : p->outs[co] = NULL;
1005 12 : ap = p->inchain[co];
1006 12 : p->inchain[co].ss = NULL; /* paranoia */
1007 : }
1008 16356074 : ss->ins.ss = NULL;
1009 :
1010 : /* take it off the inarc chains of the ssets reached by its outarcs */
1011 199784750 : for (i = 0; i < d->ncolors; i++)
1012 : {
1013 183428676 : p = ss->outs[i];
1014 : assert(p != ss); /* not self-referential */
1015 183428676 : if (p == NULL)
1016 183428610 : continue; /* NOTE CONTINUE */
1017 : FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
1018 66 : if (p->ins.ss == ss && p->ins.co == i)
1019 60 : p->ins = ss->inchain[i];
1020 : else
1021 : {
1022 6 : struct arcp lastap = {NULL, 0};
1023 :
1024 : assert(p->ins.ss != NULL);
1025 12 : for (ap = p->ins; ap.ss != NULL &&
1026 12 : !(ap.ss == ss && ap.co == i);
1027 6 : ap = ap.ss->inchain[ap.co])
1028 6 : lastap = ap;
1029 : assert(ap.ss != NULL);
1030 6 : lastap.ss->inchain[lastap.co] = ss->inchain[i];
1031 : }
1032 66 : ss->outs[i] = NULL;
1033 66 : ss->inchain[i].ss = NULL;
1034 : }
1035 :
1036 : /* if ss was a success state, may need to remember location */
1037 16356074 : if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
1038 18 : (d->lastpost == NULL || d->lastpost < ss->lastseen))
1039 18 : d->lastpost = ss->lastseen;
1040 :
1041 : /* likewise for a no-progress state */
1042 16356074 : if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
1043 6 : (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
1044 6 : d->lastnopr = ss->lastseen;
1045 :
1046 16356074 : return ss;
1047 : }
1048 :
1049 : /*
1050 : * pickss - pick the next stateset to be used
1051 : */
1052 : static struct sset *
1053 16356074 : pickss(struct vars *v,
1054 : struct dfa *d,
1055 : chr *cp,
1056 : chr *start)
1057 : {
1058 : int i;
1059 : struct sset *ss;
1060 : struct sset *end;
1061 : chr *ancient;
1062 :
1063 : /* shortcut for cases where cache isn't full */
1064 16356074 : if (d->nssused < d->nssets)
1065 : {
1066 16356008 : i = d->nssused;
1067 16356008 : d->nssused++;
1068 16356008 : ss = &d->ssets[i];
1069 : FDEBUG(("new c%d\n", i));
1070 : /* set up innards */
1071 16356008 : ss->states = &d->statesarea[i * d->wordsper];
1072 16356008 : ss->flags = 0;
1073 16356008 : ss->ins.ss = NULL;
1074 16356008 : ss->ins.co = WHITE; /* give it some value */
1075 16356008 : ss->outs = &d->outsarea[i * d->ncolors];
1076 16356008 : ss->inchain = &d->incarea[i * d->ncolors];
1077 199783148 : for (i = 0; i < d->ncolors; i++)
1078 : {
1079 183427140 : ss->outs[i] = NULL;
1080 183427140 : ss->inchain[i].ss = NULL;
1081 : }
1082 16356008 : return ss;
1083 : }
1084 :
1085 : /* look for oldest, or old enough anyway */
1086 66 : if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
1087 66 : ancient = cp - d->nssets * 2 / 3;
1088 : else
1089 0 : ancient = start;
1090 72 : for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
1091 66 : if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
1092 66 : !(ss->flags & LOCKED))
1093 : {
1094 60 : d->search = ss + 1;
1095 : FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
1096 60 : return ss;
1097 : }
1098 12 : for (ss = d->ssets, end = d->search; ss < end; ss++)
1099 12 : if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
1100 12 : !(ss->flags & LOCKED))
1101 : {
1102 6 : d->search = ss + 1;
1103 : FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
1104 6 : return ss;
1105 : }
1106 :
1107 : /* nobody's old enough?!? -- something's really wrong */
1108 : FDEBUG(("cannot find victim to replace!\n"));
1109 0 : ERR(REG_ASSERT);
1110 0 : return NULL;
1111 : }
|