Line data Source code
1 : /*
2 : * re_*exec and friends - match REs
3 : *
4 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
5 : *
6 : * Development of this software was funded, in part, by Cray Research Inc.,
7 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 : * Corporation, none of whom are responsible for the results. The author
9 : * thanks all of them.
10 : *
11 : * Redistribution and use in source and binary forms -- with or without
12 : * modification -- are permitted for any purpose, provided that
13 : * redistributions in source form retain this entire copyright notice and
14 : * indicate the origin and nature of any modifications.
15 : *
16 : * I'd appreciate being given credit for this package in the documentation
17 : * of software which uses it, but that is not a requirement.
18 : *
19 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 : *
30 : * src/backend/regex/regexec.c
31 : *
32 : */
33 :
34 : #include "regex/regguts.h"
35 :
36 :
37 :
38 : /* lazy-DFA representation */
39 : struct arcp
40 : { /* "pointer" to an outarc */
41 : struct sset *ss;
42 : color co;
43 : };
44 :
45 : struct sset
46 : { /* state set */
47 : unsigned *states; /* pointer to bitvector */
48 : unsigned hash; /* hash of bitvector */
49 : #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
50 : #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
51 : memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
52 : int flags;
53 : #define STARTER 01 /* the initial state set */
54 : #define POSTSTATE 02 /* includes the goal state */
55 : #define LOCKED 04 /* locked in cache */
56 : #define NOPROGRESS 010 /* zero-progress state set */
57 : struct arcp ins; /* chain of inarcs pointing here */
58 : chr *lastseen; /* last entered on arrival here */
59 : struct sset **outs; /* outarc vector indexed by color */
60 : struct arcp *inchain; /* chain-pointer vector for outarcs */
61 : };
62 :
63 : struct dfa
64 : {
65 : int nssets; /* size of cache */
66 : int nssused; /* how many entries occupied yet */
67 : int nstates; /* number of states */
68 : int ncolors; /* length of outarc and inchain vectors */
69 : int wordsper; /* length of state-set bitvectors */
70 : struct sset *ssets; /* state-set cache */
71 : unsigned *statesarea; /* bitvector storage */
72 : unsigned *work; /* pointer to work area within statesarea */
73 : struct sset **outsarea; /* outarc-vector storage */
74 : struct arcp *incarea; /* inchain storage */
75 : struct cnfa *cnfa;
76 : struct colormap *cm;
77 : chr *lastpost; /* location of last cache-flushed success */
78 : chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
79 : struct sset *search; /* replacement-search-pointer memory */
80 : int backno; /* if DFA for a backref, subno it refers to */
81 : short backmin; /* min repetitions for backref */
82 : short backmax; /* max repetitions for backref */
83 : bool ismalloced; /* should this struct dfa be freed? */
84 : bool arraysmalloced; /* should its subsidiary arrays be freed? */
85 : };
86 :
87 : #define WORK 1 /* number of work bitvectors needed */
88 :
89 : /* setup for non-malloc allocation for small cases */
90 : #define FEWSTATES 20 /* must be less than UBITS */
91 : #define FEWCOLORS 15
92 : struct smalldfa
93 : {
94 : struct dfa dfa; /* must be first */
95 : struct sset ssets[FEWSTATES * 2];
96 : unsigned statesarea[FEWSTATES * 2 + WORK];
97 : struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
98 : struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
99 : };
100 :
101 : #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */
102 :
103 :
104 :
105 : /* internal variables, bundled for easy passing around */
106 : struct vars
107 : {
108 : regex_t *re;
109 : struct guts *g;
110 : int eflags; /* copies of arguments */
111 : size_t nmatch;
112 : regmatch_t *pmatch;
113 : rm_detail_t *details;
114 : chr *start; /* start of string */
115 : chr *search_start; /* search start of string */
116 : chr *stop; /* just past end of string */
117 : int err; /* error code if any (0 none) */
118 : struct dfa **subdfas; /* per-tree-subre DFAs */
119 : struct dfa **ladfas; /* per-lacon-subre DFAs */
120 : struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
121 : chr **lblastcp; /* per-lacon-subre lookbehind restart data */
122 : struct smalldfa dfa1;
123 : struct smalldfa dfa2;
124 : };
125 :
126 : #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
127 : #define ISERR() VISERR(v)
128 : #define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e)))
129 : #define ERR(e) VERR(v, e) /* record an error */
130 : #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
131 : #define OFF(p) ((p) - v->start)
132 : #define LOFF(p) ((long)OFF(p))
133 :
134 :
135 :
136 : /*
137 : * forward declarations
138 : */
139 : /* === regexec.c === */
140 : static struct dfa *getsubdfa(struct vars *v, struct subre *t);
141 : static struct dfa *getladfa(struct vars *v, int n);
142 : static int find(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
143 : static int cfind(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
144 : static int cfindloop(struct vars *v, struct cnfa *cnfa, struct colormap *cm,
145 : struct dfa *d, struct dfa *s, chr **coldp);
146 : static void zapallsubs(regmatch_t *p, size_t n);
147 : static void zaptreesubs(struct vars *v, struct subre *t);
148 : static void subset(struct vars *v, struct subre *sub, chr *begin, chr *end);
149 : static int cdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
150 : static int ccondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
151 : static int crevcondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
152 : static int cbrdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
153 : static int caltdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
154 : static int citerdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
155 : static int creviterdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
156 :
157 : /* === rege_dfa.c === */
158 : static chr *longest(struct vars *v, struct dfa *d,
159 : chr *start, chr *stop, int *hitstopp);
160 : static chr *shortest(struct vars *v, struct dfa *d, chr *start, chr *min,
161 : chr *max, chr **coldp, int *hitstopp);
162 : static int matchuntil(struct vars *v, struct dfa *d, chr *probe,
163 : struct sset **lastcss, chr **lastcp);
164 : static chr *dfa_backref(struct vars *v, struct dfa *d, chr *start,
165 : chr *min, chr *max, bool shortest);
166 : static chr *lastcold(struct vars *v, struct dfa *d);
167 : static struct dfa *newdfa(struct vars *v, struct cnfa *cnfa,
168 : struct colormap *cm, struct smalldfa *sml);
169 : static void freedfa(struct dfa *d);
170 : static unsigned hash(unsigned *uv, int n);
171 : static struct sset *initialize(struct vars *v, struct dfa *d, chr *start);
172 : static struct sset *miss(struct vars *v, struct dfa *d, struct sset *css,
173 : color co, chr *cp, chr *start);
174 : static int lacon(struct vars *v, struct cnfa *pcnfa, chr *cp, color co);
175 : static struct sset *getvacant(struct vars *v, struct dfa *d, chr *cp,
176 : chr *start);
177 : static struct sset *pickss(struct vars *v, struct dfa *d, chr *cp,
178 : chr *start);
179 :
180 :
181 : /*
182 : * pg_regexec - match regular expression
183 : */
184 : int
185 5762030 : pg_regexec(regex_t *re,
186 : const chr *string,
187 : size_t len,
188 : size_t search_start,
189 : rm_detail_t *details,
190 : size_t nmatch,
191 : regmatch_t pmatch[],
192 : int flags)
193 : {
194 : struct vars var;
195 5762030 : struct vars *v = &var;
196 : int st;
197 : size_t n;
198 : size_t i;
199 : int backref;
200 :
201 : #define LOCALMAT 20
202 : regmatch_t mat[LOCALMAT];
203 :
204 : #define LOCALDFAS 40
205 : struct dfa *subdfas[LOCALDFAS];
206 :
207 : /* sanity checks */
208 5762030 : if (re == NULL || string == NULL || re->re_magic != REMAGIC)
209 0 : return REG_INVARG;
210 5762030 : if (re->re_csize != sizeof(chr))
211 0 : return REG_MIXED;
212 5762030 : if (search_start > len)
213 5 : return REG_NOMATCH;
214 :
215 : /* Initialize locale-dependent support */
216 5762025 : pg_set_regex_collation(re->re_collation);
217 :
218 : /* setup */
219 5762025 : v->re = re;
220 5762025 : v->g = (struct guts *) re->re_guts;
221 5762025 : if ((v->g->cflags & REG_EXPECT) && details == NULL)
222 0 : return REG_INVARG;
223 5762025 : if (v->g->info & REG_UIMPOSSIBLE)
224 721 : return REG_NOMATCH;
225 5761304 : backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
226 5761304 : v->eflags = flags;
227 5761304 : if (backref && nmatch <= v->g->nsub)
228 : {
229 : /* need larger work area */
230 149 : v->nmatch = v->g->nsub + 1;
231 149 : if (v->nmatch <= LOCALMAT)
232 148 : v->pmatch = mat;
233 : else
234 1 : v->pmatch = MALLOC_ARRAY(regmatch_t, v->nmatch);
235 149 : if (v->pmatch == NULL)
236 0 : return REG_ESPACE;
237 149 : zapallsubs(v->pmatch, v->nmatch);
238 : }
239 : else
240 : {
241 : /* we can store results directly in caller's array */
242 5761155 : v->pmatch = pmatch;
243 : /* ensure any extra entries in caller's array are filled with -1 */
244 5761155 : if (nmatch > 0)
245 583052 : zapallsubs(pmatch, nmatch);
246 : /* then forget about extra entries, to avoid useless work in find() */
247 5761155 : if (nmatch > v->g->nsub + 1)
248 670 : nmatch = v->g->nsub + 1;
249 5761155 : v->nmatch = nmatch;
250 : }
251 5761304 : v->details = details;
252 5761304 : v->start = (chr *) string;
253 5761304 : v->search_start = (chr *) string + search_start;
254 5761304 : v->stop = (chr *) string + len;
255 5761304 : v->err = 0;
256 5761304 : v->subdfas = NULL;
257 5761304 : v->ladfas = NULL;
258 5761304 : v->lblastcss = NULL;
259 5761304 : v->lblastcp = NULL;
260 : /* below this point, "goto cleanup" will behave sanely */
261 :
262 : assert(v->g->ntree >= 0);
263 5761304 : n = (size_t) v->g->ntree;
264 5761304 : if (n <= LOCALDFAS)
265 5761302 : v->subdfas = subdfas;
266 : else
267 : {
268 : /* ntree is surely less than the number of states, so this is safe: */
269 2 : v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
270 2 : if (v->subdfas == NULL)
271 : {
272 0 : st = REG_ESPACE;
273 0 : goto cleanup;
274 : }
275 : }
276 17305646 : for (i = 0; i < n; i++)
277 11544342 : v->subdfas[i] = NULL;
278 :
279 : assert(v->g->nlacons >= 0);
280 5761304 : n = (size_t) v->g->nlacons;
281 5761304 : if (n > 0)
282 : {
283 : /* nlacons is surely less than the number of arcs, so this is safe: */
284 1040 : v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
285 1040 : if (v->ladfas == NULL)
286 : {
287 0 : st = REG_ESPACE;
288 0 : goto cleanup;
289 : }
290 3168 : for (i = 0; i < n; i++)
291 2128 : v->ladfas[i] = NULL;
292 1040 : v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
293 1040 : v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
294 1040 : if (v->lblastcss == NULL || v->lblastcp == NULL)
295 : {
296 0 : st = REG_ESPACE;
297 0 : goto cleanup;
298 : }
299 3168 : for (i = 0; i < n; i++)
300 : {
301 2128 : v->lblastcss[i] = NULL;
302 2128 : v->lblastcp[i] = NULL;
303 : }
304 : }
305 :
306 : /* do it */
307 : assert(v->g->tree != NULL);
308 5761304 : if (backref)
309 208 : st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
310 : else
311 5761096 : st = find(v, &v->g->tree->cnfa, &v->g->cmap);
312 :
313 : /* on success, ensure caller's match vector is filled correctly */
314 5761304 : if (st == REG_OKAY && nmatch > 0)
315 : {
316 470605 : if (v->pmatch != pmatch)
317 : {
318 : /* copy portion of match vector over from (larger) work area */
319 : assert(nmatch <= v->nmatch);
320 33 : memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
321 : }
322 470605 : if (v->g->cflags & REG_NOSUB)
323 : {
324 : /* don't expose possibly-partial sub-match results to caller */
325 466736 : zapallsubs(pmatch, nmatch);
326 : }
327 : }
328 :
329 : /* clean up */
330 5294568 : cleanup:
331 5761304 : if (v->pmatch != pmatch && v->pmatch != mat)
332 1 : FREE(v->pmatch);
333 5761304 : if (v->subdfas != NULL)
334 : {
335 5761304 : n = (size_t) v->g->ntree;
336 17305646 : for (i = 0; i < n; i++)
337 : {
338 11544342 : if (v->subdfas[i] != NULL)
339 16880 : freedfa(v->subdfas[i]);
340 : }
341 5761304 : if (v->subdfas != subdfas)
342 2 : FREE(v->subdfas);
343 : }
344 5761304 : if (v->ladfas != NULL)
345 : {
346 1040 : n = (size_t) v->g->nlacons;
347 3168 : for (i = 0; i < n; i++)
348 : {
349 2128 : if (v->ladfas[i] != NULL)
350 75 : freedfa(v->ladfas[i]);
351 : }
352 1040 : FREE(v->ladfas);
353 : }
354 5761304 : if (v->lblastcss != NULL)
355 1040 : FREE(v->lblastcss);
356 5761304 : if (v->lblastcp != NULL)
357 1040 : FREE(v->lblastcp);
358 :
359 : #ifdef REG_DEBUG
360 : if (v->eflags & (REG_FTRACE | REG_MTRACE))
361 : fflush(stdout);
362 : #endif
363 :
364 5761304 : return st;
365 : }
366 :
367 : /*
368 : * getsubdfa - create or re-fetch the DFA for a tree subre node
369 : *
370 : * We only need to create the DFA once per overall regex execution.
371 : * The DFA will be freed by the cleanup step in pg_regexec().
372 : */
373 : static struct dfa *
374 17864 : getsubdfa(struct vars *v,
375 : struct subre *t)
376 : {
377 17864 : struct dfa *d = v->subdfas[t->id];
378 :
379 17864 : if (d == NULL)
380 : {
381 16880 : d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
382 16880 : if (d == NULL)
383 0 : return NULL;
384 : /* set up additional info if this is a backref node */
385 16880 : if (t->op == 'b')
386 : {
387 199 : d->backno = t->backno;
388 199 : d->backmin = t->min;
389 199 : d->backmax = t->max;
390 : }
391 16880 : v->subdfas[t->id] = d;
392 : }
393 17864 : return d;
394 : }
395 :
396 : /*
397 : * getladfa - create or re-fetch the DFA for a LACON subre node
398 : *
399 : * Same as above, but for LACONs.
400 : */
401 : static struct dfa *
402 158 : getladfa(struct vars *v,
403 : int n)
404 : {
405 : assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
406 :
407 158 : if (v->ladfas[n] == NULL)
408 : {
409 75 : struct subre *sub = &v->g->lacons[n];
410 :
411 75 : v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
412 : /* a LACON can't contain a backref, so nothing else to do */
413 : }
414 158 : return v->ladfas[n];
415 : }
416 :
417 : /*
418 : * find - find a match for the main NFA (no-complications case)
419 : */
420 : static int
421 5761096 : find(struct vars *v,
422 : struct cnfa *cnfa,
423 : struct colormap *cm)
424 : {
425 : struct dfa *s;
426 : struct dfa *d;
427 : chr *begin;
428 5761096 : chr *end = NULL;
429 : chr *cold;
430 : chr *open; /* open and close of range of possible starts */
431 : chr *close;
432 : int hitend;
433 5761096 : int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
434 :
435 : /* first, a shot with the search RE */
436 5761096 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
437 5761096 : if (s == NULL)
438 0 : return v->err;
439 : MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
440 5761096 : cold = NULL;
441 5761096 : close = shortest(v, s, v->search_start, v->search_start, v->stop,
442 : &cold, (int *) NULL);
443 5761096 : freedfa(s);
444 5761096 : NOERR();
445 5761096 : if (v->g->cflags & REG_EXPECT)
446 : {
447 : assert(v->details != NULL);
448 27 : if (cold != NULL)
449 27 : v->details->rm_extend.rm_so = OFF(cold);
450 : else
451 0 : v->details->rm_extend.rm_so = OFF(v->stop);
452 27 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
453 : }
454 5761096 : if (close == NULL) /* not found */
455 5160786 : return REG_NOMATCH;
456 600310 : if (v->nmatch == 0) /* found, don't need exact location */
457 129779 : return REG_OKAY;
458 :
459 : /* find starting point and match */
460 : assert(cold != NULL);
461 470531 : open = cold;
462 470531 : cold = NULL;
463 : MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
464 470531 : d = newdfa(v, cnfa, cm, &v->dfa1);
465 470531 : if (d == NULL)
466 0 : return v->err;
467 470562 : for (begin = open; begin <= close; begin++)
468 : {
469 : MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
470 470562 : if (shorter)
471 86 : end = shortest(v, d, begin, begin, v->stop,
472 : (chr **) NULL, &hitend);
473 : else
474 470476 : end = longest(v, d, begin, v->stop, &hitend);
475 470562 : if (ISERR())
476 : {
477 0 : freedfa(d);
478 0 : return v->err;
479 : }
480 470562 : if (hitend && cold == NULL)
481 4775 : cold = begin;
482 470562 : if (end != NULL)
483 470531 : break; /* NOTE BREAK OUT */
484 : }
485 : assert(end != NULL); /* search RE succeeded so loop should */
486 470531 : freedfa(d);
487 :
488 : /* and pin down details */
489 : assert(v->nmatch > 0);
490 470531 : v->pmatch[0].rm_so = OFF(begin);
491 470531 : v->pmatch[0].rm_eo = OFF(end);
492 470531 : if (v->g->cflags & REG_EXPECT)
493 : {
494 10 : if (cold != NULL)
495 5 : v->details->rm_extend.rm_so = OFF(cold);
496 : else
497 5 : v->details->rm_extend.rm_so = OFF(v->stop);
498 10 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
499 : }
500 470531 : if (v->nmatch == 1) /* no need for submatches */
501 467537 : return REG_OKAY;
502 :
503 : /* find submatches */
504 2994 : return cdissect(v, v->g->tree, begin, end);
505 : }
506 :
507 : /*
508 : * cfind - find a match for the main NFA (with complications)
509 : */
510 : static int
511 208 : cfind(struct vars *v,
512 : struct cnfa *cnfa,
513 : struct colormap *cm)
514 : {
515 : struct dfa *s;
516 : struct dfa *d;
517 : chr *cold;
518 : int ret;
519 :
520 208 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
521 208 : if (s == NULL)
522 0 : return v->err;
523 208 : d = newdfa(v, cnfa, cm, &v->dfa2);
524 208 : if (d == NULL)
525 : {
526 0 : freedfa(s);
527 0 : return v->err;
528 : }
529 :
530 208 : ret = cfindloop(v, cnfa, cm, d, s, &cold);
531 :
532 208 : freedfa(d);
533 208 : freedfa(s);
534 208 : NOERR();
535 208 : if (v->g->cflags & REG_EXPECT)
536 : {
537 : assert(v->details != NULL);
538 0 : if (cold != NULL)
539 0 : v->details->rm_extend.rm_so = OFF(cold);
540 : else
541 0 : v->details->rm_extend.rm_so = OFF(v->stop);
542 0 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
543 : }
544 208 : return ret;
545 : }
546 :
547 : /*
548 : * cfindloop - the heart of cfind
549 : */
550 : static int
551 208 : cfindloop(struct vars *v,
552 : struct cnfa *cnfa,
553 : struct colormap *cm,
554 : struct dfa *d,
555 : struct dfa *s,
556 : chr **coldp) /* where to put coldstart pointer */
557 : {
558 : chr *begin;
559 : chr *end;
560 : chr *cold;
561 : chr *open; /* open and close of range of possible starts */
562 : chr *close;
563 : chr *estart;
564 : chr *estop;
565 : int er;
566 208 : int shorter = v->g->tree->flags & SHORTER;
567 : int hitend;
568 :
569 : assert(d != NULL && s != NULL);
570 208 : cold = NULL;
571 208 : close = v->search_start;
572 : do
573 : {
574 : /* Search with the search RE for match range at/beyond "close" */
575 : MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
576 232 : close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
577 232 : if (ISERR())
578 : {
579 0 : *coldp = cold;
580 0 : return v->err;
581 : }
582 232 : if (close == NULL)
583 23 : break; /* no more possible match anywhere */
584 : assert(cold != NULL);
585 209 : open = cold;
586 209 : cold = NULL;
587 : /* Search for matches starting between "open" and "close" inclusive */
588 : MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
589 749 : for (begin = open; begin <= close; begin++)
590 : {
591 : MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
592 659 : estart = begin;
593 659 : estop = v->stop;
594 : for (;;)
595 : {
596 : /* Here we use the top node's detailed RE */
597 942 : if (shorter)
598 141 : end = shortest(v, d, begin, estart,
599 : estop, (chr **) NULL, &hitend);
600 : else
601 801 : end = longest(v, d, begin, estop,
602 : &hitend);
603 942 : if (ISERR())
604 : {
605 0 : *coldp = cold;
606 0 : return v->err;
607 : }
608 942 : if (hitend && cold == NULL)
609 147 : cold = begin;
610 942 : if (end == NULL)
611 513 : break; /* no match with this begin point, try next */
612 : MDEBUG(("tentative end %ld\n", LOFF(end)));
613 : /* Dissect the potential match to see if it really matches */
614 429 : er = cdissect(v, v->g->tree, begin, end);
615 429 : if (er == REG_OKAY)
616 : {
617 119 : if (v->nmatch > 0)
618 : {
619 119 : v->pmatch[0].rm_so = OFF(begin);
620 119 : v->pmatch[0].rm_eo = OFF(end);
621 : }
622 119 : *coldp = cold;
623 119 : return REG_OKAY;
624 : }
625 310 : if (er != REG_NOMATCH)
626 : {
627 0 : ERR(er);
628 0 : *coldp = cold;
629 0 : return er;
630 : }
631 : /* Try next longer/shorter match with same begin point */
632 310 : if (shorter)
633 : {
634 119 : if (end == estop)
635 17 : break; /* no more, so try next begin point */
636 102 : estart = end + 1;
637 : }
638 : else
639 : {
640 191 : if (end == begin)
641 10 : break; /* no more, so try next begin point */
642 181 : estop = end - 1;
643 : }
644 : } /* end loop over endpoint positions */
645 : } /* end loop over beginning positions */
646 :
647 : /*
648 : * If we get here, there is no possible match starting at or before
649 : * "close", so consider matches beyond that. We'll do a fresh search
650 : * with the search RE to find a new promising match range.
651 : */
652 90 : close++;
653 90 : } while (close < v->stop);
654 :
655 89 : *coldp = cold;
656 89 : return REG_NOMATCH;
657 : }
658 :
659 : /*
660 : * zapallsubs - initialize all subexpression matches to "no match"
661 : *
662 : * Note that p[0], the overall-match location, is not touched.
663 : */
664 : static void
665 1049937 : zapallsubs(regmatch_t *p,
666 : size_t n)
667 : {
668 : size_t i;
669 :
670 1060993 : for (i = n - 1; i > 0; i--)
671 : {
672 11056 : p[i].rm_so = -1;
673 11056 : p[i].rm_eo = -1;
674 : }
675 1049937 : }
676 :
677 : /*
678 : * zaptreesubs - initialize subexpressions within subtree to "no match"
679 : */
680 : static void
681 739 : zaptreesubs(struct vars *v,
682 : struct subre *t)
683 : {
684 739 : int n = t->capno;
685 : struct subre *t2;
686 :
687 739 : if (n > 0)
688 : {
689 328 : if ((size_t) n < v->nmatch)
690 : {
691 328 : v->pmatch[n].rm_so = -1;
692 328 : v->pmatch[n].rm_eo = -1;
693 : }
694 : }
695 :
696 988 : for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
697 249 : zaptreesubs(v, t2);
698 739 : }
699 :
700 : /*
701 : * subset - set subexpression match data for a successful subre
702 : */
703 : static void
704 5539 : subset(struct vars *v,
705 : struct subre *sub,
706 : chr *begin,
707 : chr *end)
708 : {
709 5539 : int n = sub->capno;
710 :
711 : assert(n > 0);
712 5539 : if ((size_t) n >= v->nmatch)
713 15 : return;
714 :
715 : MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, n, LOFF(begin), LOFF(end)));
716 5524 : v->pmatch[n].rm_so = OFF(begin);
717 5524 : v->pmatch[n].rm_eo = OFF(end);
718 : }
719 :
720 : /*
721 : * cdissect - check backrefs and determine subexpression matches
722 : *
723 : * cdissect recursively processes a subre tree to check matching of backrefs
724 : * and/or identify submatch boundaries for capture nodes. The proposed match
725 : * runs from "begin" to "end" (not including "end"), and we are basically
726 : * "dissecting" it to see where the submatches are.
727 : *
728 : * Before calling any level of cdissect, the caller must have run the node's
729 : * DFA and found that the proposed substring satisfies the DFA. (We make
730 : * the caller do that because in concatenation and iteration nodes, it's
731 : * much faster to check all the substrings against the child DFAs before we
732 : * recurse.)
733 : *
734 : * A side-effect of a successful match is to save match locations for
735 : * capturing subexpressions in v->pmatch[]. This is a little bit tricky,
736 : * so we make the following rules:
737 : * 1. Before initial entry to cdissect, all match data must have been
738 : * cleared (this is seen to by zapallsubs).
739 : * 2. Before any recursive entry to cdissect, the match data for that
740 : * subexpression tree must be guaranteed clear (see zaptreesubs).
741 : * 3. When returning REG_OKAY, each level of cdissect will have saved
742 : * any relevant match locations.
743 : * 4. When returning REG_NOMATCH, each level of cdissect will guarantee
744 : * that its subexpression match locations are again clear.
745 : * 5. No guarantees are made for error cases (i.e., other result codes).
746 : * 6. When a level of cdissect abandons a successful sub-match, it will
747 : * clear that subtree's match locations with zaptreesubs before trying
748 : * any new DFA match or cdissect call for that subtree or any subtree
749 : * to its right (that is, any subtree that could have a backref into the
750 : * abandoned match).
751 : * This may seem overly complicated, but it's difficult to simplify it
752 : * because of the provision that match locations must be reset before
753 : * any fresh DFA match (a rule that is needed to make dfa_backref safe).
754 : * That means it won't work to just reset relevant match locations at the
755 : * start of each cdissect level.
756 : */
757 : static int /* regexec return code */
758 20816 : cdissect(struct vars *v,
759 : struct subre *t,
760 : chr *begin, /* beginning of relevant substring */
761 : chr *end) /* end of same */
762 : {
763 : int er;
764 :
765 : assert(t != NULL);
766 : MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));
767 :
768 : /* handy place to check for operation cancel */
769 20816 : INTERRUPT(v->re);
770 : /* ... and stack overrun */
771 20816 : if (STACK_TOO_DEEP(v->re))
772 0 : return REG_ETOOBIG;
773 :
774 20816 : switch (t->op)
775 : {
776 11432 : case '=': /* terminal node */
777 : assert(t->child == NULL);
778 11432 : er = REG_OKAY; /* no action, parent did the work */
779 11432 : break;
780 303 : case 'b': /* back reference */
781 : assert(t->child == NULL);
782 303 : er = cbrdissect(v, t, begin, end);
783 303 : break;
784 8769 : case '.': /* concatenation */
785 : assert(t->child != NULL);
786 8769 : if (t->child->flags & SHORTER) /* reverse scan */
787 237 : er = crevcondissect(v, t, begin, end);
788 : else
789 8532 : er = ccondissect(v, t, begin, end);
790 8769 : break;
791 113 : case '|': /* alternation */
792 : assert(t->child != NULL);
793 113 : er = caltdissect(v, t, begin, end);
794 113 : break;
795 167 : case '*': /* iteration */
796 : assert(t->child != NULL);
797 167 : if (t->child->flags & SHORTER) /* reverse scan */
798 8 : er = creviterdissect(v, t, begin, end);
799 : else
800 159 : er = citerdissect(v, t, begin, end);
801 167 : break;
802 32 : case '(': /* no-op capture node */
803 : assert(t->child != NULL);
804 32 : er = cdissect(v, t->child, begin, end);
805 32 : break;
806 0 : default:
807 0 : er = REG_ASSERT;
808 0 : break;
809 : }
810 :
811 : /*
812 : * We should never have a match failure unless backrefs lurk below;
813 : * otherwise, either caller failed to check the DFA, or there's some
814 : * inconsistency between the DFA and the node's innards.
815 : */
816 : assert(er != REG_NOMATCH || (t->flags & BACKR));
817 :
818 : /*
819 : * If this node is marked as capturing, save successful match's location.
820 : */
821 20816 : if (t->capno > 0 && er == REG_OKAY)
822 5539 : subset(v, t, begin, end);
823 :
824 20816 : return er;
825 : }
826 :
827 : /*
828 : * ccondissect - dissect match for concatenation node
829 : */
830 : static int /* regexec return code */
831 8532 : ccondissect(struct vars *v,
832 : struct subre *t,
833 : chr *begin, /* beginning of relevant substring */
834 : chr *end) /* end of same */
835 : {
836 8532 : struct subre *left = t->child;
837 8532 : struct subre *right = left->sibling;
838 : struct dfa *d;
839 : struct dfa *d2;
840 : chr *mid;
841 : int er;
842 :
843 : assert(t->op == '.');
844 : assert(left != NULL && left->cnfa.nstates > 0);
845 : assert(right != NULL && right->cnfa.nstates > 0);
846 : assert(right->sibling == NULL);
847 : assert(!(left->flags & SHORTER));
848 :
849 8532 : d = getsubdfa(v, left);
850 8532 : NOERR();
851 8532 : d2 = getsubdfa(v, right);
852 8532 : NOERR();
853 : MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
854 :
855 : /* pick a tentative midpoint */
856 8532 : mid = longest(v, d, begin, end, (int *) NULL);
857 8532 : NOERR();
858 8532 : if (mid == NULL)
859 8 : return REG_NOMATCH;
860 : MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
861 :
862 : /* iterate until satisfaction or failure */
863 : for (;;)
864 : {
865 : /* try this midpoint on for size */
866 10116 : if (longest(v, d2, mid, end, (int *) NULL) == end)
867 : {
868 8467 : er = cdissect(v, left, begin, mid);
869 8467 : if (er == REG_OKAY)
870 : {
871 8393 : er = cdissect(v, right, mid, end);
872 8393 : if (er == REG_OKAY)
873 : {
874 : /* satisfaction */
875 : MDEBUG(("%d: successful\n", t->id));
876 8035 : return REG_OKAY;
877 : }
878 : /* Reset left's matches (right should have done so itself) */
879 358 : zaptreesubs(v, left);
880 : }
881 432 : if (er != REG_NOMATCH)
882 0 : return er;
883 : }
884 2081 : NOERR();
885 :
886 : /* that midpoint didn't work, find a new one */
887 2081 : if (mid == begin)
888 : {
889 : /* all possibilities exhausted */
890 : MDEBUG(("%d: no midpoint\n", t->id));
891 105 : return REG_NOMATCH;
892 : }
893 1976 : mid = longest(v, d, begin, mid - 1, (int *) NULL);
894 1976 : NOERR();
895 1976 : if (mid == NULL)
896 : {
897 : /* failed to find a new one */
898 : MDEBUG(("%d: failed midpoint\n", t->id));
899 384 : return REG_NOMATCH;
900 : }
901 : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
902 : }
903 :
904 : /* can't get here */
905 : return REG_ASSERT;
906 : }
907 :
908 : /*
909 : * crevcondissect - dissect match for concatenation node, shortest-first
910 : */
911 : static int /* regexec return code */
912 237 : crevcondissect(struct vars *v,
913 : struct subre *t,
914 : chr *begin, /* beginning of relevant substring */
915 : chr *end) /* end of same */
916 : {
917 237 : struct subre *left = t->child;
918 237 : struct subre *right = left->sibling;
919 : struct dfa *d;
920 : struct dfa *d2;
921 : chr *mid;
922 : int er;
923 :
924 : assert(t->op == '.');
925 : assert(left != NULL && left->cnfa.nstates > 0);
926 : assert(right != NULL && right->cnfa.nstates > 0);
927 : assert(right->sibling == NULL);
928 : assert(left->flags & SHORTER);
929 :
930 237 : d = getsubdfa(v, left);
931 237 : NOERR();
932 237 : d2 = getsubdfa(v, right);
933 237 : NOERR();
934 : MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
935 :
936 : /* pick a tentative midpoint */
937 237 : mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
938 237 : NOERR();
939 237 : if (mid == NULL)
940 0 : return REG_NOMATCH;
941 : MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
942 :
943 : /* iterate until satisfaction or failure */
944 : for (;;)
945 : {
946 : /* try this midpoint on for size */
947 853 : if (longest(v, d2, mid, end, (int *) NULL) == end)
948 : {
949 128 : er = cdissect(v, left, begin, mid);
950 128 : if (er == REG_OKAY)
951 : {
952 128 : er = cdissect(v, right, mid, end);
953 128 : if (er == REG_OKAY)
954 : {
955 : /* satisfaction */
956 : MDEBUG(("%d: successful\n", t->id));
957 116 : return REG_OKAY;
958 : }
959 : /* Reset left's matches (right should have done so itself) */
960 12 : zaptreesubs(v, left);
961 : }
962 12 : if (er != REG_NOMATCH)
963 0 : return er;
964 : }
965 737 : NOERR();
966 :
967 : /* that midpoint didn't work, find a new one */
968 737 : if (mid == end)
969 : {
970 : /* all possibilities exhausted */
971 : MDEBUG(("%d: no midpoint\n", t->id));
972 121 : return REG_NOMATCH;
973 : }
974 616 : mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
975 616 : NOERR();
976 616 : if (mid == NULL)
977 : {
978 : /* failed to find a new one */
979 : MDEBUG(("%d: failed midpoint\n", t->id));
980 0 : return REG_NOMATCH;
981 : }
982 : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
983 : }
984 :
985 : /* can't get here */
986 : return REG_ASSERT;
987 : }
988 :
989 : /*
990 : * cbrdissect - dissect match for backref node
991 : *
992 : * The backref match might already have been verified by dfa_backref(),
993 : * but we don't know that for sure so must check it here.
994 : */
995 : static int /* regexec return code */
996 303 : cbrdissect(struct vars *v,
997 : struct subre *t,
998 : chr *begin, /* beginning of relevant substring */
999 : chr *end) /* end of same */
1000 : {
1001 303 : int n = t->backno;
1002 : size_t numreps;
1003 : size_t tlen;
1004 : size_t brlen;
1005 : chr *brstring;
1006 : chr *p;
1007 303 : int min = t->min;
1008 303 : int max = t->max;
1009 :
1010 : assert(t != NULL);
1011 : assert(t->op == 'b');
1012 : assert(n >= 0);
1013 : assert((size_t) n < v->nmatch);
1014 :
1015 : MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, n, min, max,
1016 : LOFF(begin), LOFF(end)));
1017 :
1018 : /* get the backreferenced string */
1019 303 : if (v->pmatch[n].rm_so == -1)
1020 83 : return REG_NOMATCH;
1021 220 : brstring = v->start + v->pmatch[n].rm_so;
1022 220 : brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
1023 :
1024 : /* special cases for zero-length strings */
1025 220 : if (brlen == 0)
1026 : {
1027 : /*
1028 : * matches only if target is zero length, but any number of
1029 : * repetitions can be considered to be present
1030 : */
1031 11 : if (begin == end && min <= max)
1032 : {
1033 : MDEBUG(("%d: backref matched trivially\n", t->id));
1034 11 : return REG_OKAY;
1035 : }
1036 0 : return REG_NOMATCH;
1037 : }
1038 209 : if (begin == end)
1039 : {
1040 : /* matches only if zero repetitions are okay */
1041 8 : if (min == 0)
1042 : {
1043 : MDEBUG(("%d: backref matched trivially\n", t->id));
1044 7 : return REG_OKAY;
1045 : }
1046 1 : return REG_NOMATCH;
1047 : }
1048 :
1049 : /*
1050 : * check target length to see if it could possibly be an allowed number of
1051 : * repetitions of brstring
1052 : */
1053 : assert(end > begin);
1054 201 : tlen = end - begin;
1055 201 : if (tlen % brlen != 0)
1056 5 : return REG_NOMATCH;
1057 196 : numreps = tlen / brlen;
1058 196 : if (numreps < min || (numreps > max && max != DUPINF))
1059 3 : return REG_NOMATCH;
1060 :
1061 : /* okay, compare the actual string contents */
1062 193 : p = begin;
1063 338 : while (numreps-- > 0)
1064 : {
1065 219 : if ((*v->g->compare) (brstring, p, brlen) != 0)
1066 74 : return REG_NOMATCH;
1067 145 : p += brlen;
1068 : }
1069 :
1070 : MDEBUG(("%d: backref matched\n", t->id));
1071 119 : return REG_OKAY;
1072 : }
1073 :
1074 : /*
1075 : * caltdissect - dissect match for alternation node
1076 : */
1077 : static int /* regexec return code */
1078 113 : caltdissect(struct vars *v,
1079 : struct subre *t,
1080 : chr *begin, /* beginning of relevant substring */
1081 : chr *end) /* end of same */
1082 : {
1083 : struct dfa *d;
1084 : int er;
1085 :
1086 : assert(t->op == '|');
1087 :
1088 113 : t = t->child;
1089 : /* there should be at least 2 alternatives */
1090 : assert(t != NULL && t->sibling != NULL);
1091 :
1092 200 : while (t != NULL)
1093 : {
1094 : assert(t->cnfa.nstates > 0);
1095 :
1096 : MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1097 :
1098 160 : d = getsubdfa(v, t);
1099 160 : NOERR();
1100 160 : if (longest(v, d, begin, end, (int *) NULL) == end)
1101 : {
1102 : MDEBUG(("%d: caltdissect matched\n", t->id));
1103 125 : er = cdissect(v, t, begin, end);
1104 125 : if (er != REG_NOMATCH)
1105 73 : return er;
1106 : }
1107 87 : NOERR();
1108 :
1109 87 : t = t->sibling;
1110 : }
1111 :
1112 40 : return REG_NOMATCH;
1113 : }
1114 :
1115 : /*
1116 : * citerdissect - dissect match for iteration node
1117 : */
1118 : static int /* regexec return code */
1119 159 : citerdissect(struct vars *v,
1120 : struct subre *t,
1121 : chr *begin, /* beginning of relevant substring */
1122 : chr *end) /* end of same */
1123 : {
1124 : struct dfa *d;
1125 : chr **endpts;
1126 : chr *limit;
1127 : int min_matches;
1128 : size_t max_matches;
1129 : int nverified;
1130 : int k;
1131 : int i;
1132 : int er;
1133 :
1134 : assert(t->op == '*');
1135 : assert(t->child != NULL && t->child->cnfa.nstates > 0);
1136 : assert(!(t->child->flags & SHORTER));
1137 : assert(begin <= end);
1138 :
1139 : MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1140 :
1141 : /*
1142 : * For the moment, assume the minimum number of matches is 1. If zero
1143 : * matches are allowed, and the target string is empty, we are allowed to
1144 : * match regardless of the contents of the iter node --- but we would
1145 : * prefer to match once, so that capturing parens get set. (An example of
1146 : * the concern here is a pattern like "()*\1", which historically this
1147 : * code has allowed to succeed.) Therefore, we deal with the zero-matches
1148 : * case at the bottom, after failing to find any other way to match.
1149 : */
1150 159 : min_matches = t->min;
1151 159 : if (min_matches <= 0)
1152 105 : min_matches = 1;
1153 :
1154 : /*
1155 : * We need workspace to track the endpoints of each sub-match. Normally
1156 : * we consider only nonzero-length sub-matches, so there can be at most
1157 : * end-begin of them. However, if min is larger than that, we will also
1158 : * consider zero-length sub-matches in order to find enough matches.
1159 : *
1160 : * For convenience, endpts[0] contains the "begin" pointer and we store
1161 : * sub-match endpoints in endpts[1..max_matches].
1162 : */
1163 159 : max_matches = end - begin;
1164 159 : if (max_matches > t->max && t->max != DUPINF)
1165 0 : max_matches = t->max;
1166 159 : if (max_matches < min_matches)
1167 97 : max_matches = min_matches;
1168 159 : endpts = MALLOC_ARRAY(chr *, max_matches + 1);
1169 159 : if (endpts == NULL)
1170 0 : return REG_ESPACE;
1171 159 : endpts[0] = begin;
1172 :
1173 159 : d = getsubdfa(v, t->child);
1174 159 : if (ISERR())
1175 : {
1176 0 : FREE(endpts);
1177 0 : return v->err;
1178 : }
1179 :
1180 : /*
1181 : * Our strategy is to first find a set of sub-match endpoints that are
1182 : * valid according to the child node's DFA, and then recursively dissect
1183 : * each sub-match to confirm validity. If any validity check fails,
1184 : * backtrack that sub-match and try again. And, when we next try for a
1185 : * validity check, we need not recheck any successfully verified
1186 : * sub-matches that we didn't move the endpoints of. nverified remembers
1187 : * how many sub-matches are currently known okay.
1188 : */
1189 :
1190 : /* initialize to consider first sub-match */
1191 159 : nverified = 0;
1192 159 : k = 1;
1193 159 : limit = end;
1194 :
1195 : /* iterate until satisfaction or failure */
1196 725 : while (k > 0)
1197 : {
1198 : /* try to find an endpoint for the k'th sub-match */
1199 593 : endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
1200 593 : if (ISERR())
1201 : {
1202 0 : FREE(endpts);
1203 0 : return v->err;
1204 : }
1205 593 : if (endpts[k] == NULL)
1206 : {
1207 : /* no match possible, so see if we can shorten previous one */
1208 306 : k--;
1209 306 : goto backtrack;
1210 : }
1211 : MDEBUG(("%d: working endpoint %d: %ld\n",
1212 : t->id, k, LOFF(endpts[k])));
1213 :
1214 : /* k'th sub-match can no longer be considered verified */
1215 287 : if (nverified >= k)
1216 12 : nverified = k - 1;
1217 :
1218 287 : if (endpts[k] != end)
1219 : {
1220 : /* haven't reached end yet, try another iteration if allowed */
1221 200 : if (k >= max_matches)
1222 : {
1223 : /* must try to shorten some previous match */
1224 0 : k--;
1225 0 : goto backtrack;
1226 : }
1227 :
1228 : /* reject zero-length match unless necessary to achieve min */
1229 200 : if (endpts[k] == endpts[k - 1] &&
1230 0 : (k >= min_matches || min_matches - k < end - endpts[k]))
1231 0 : goto backtrack;
1232 :
1233 200 : k++;
1234 200 : limit = end;
1235 200 : continue;
1236 : }
1237 :
1238 : /*
1239 : * We've identified a way to divide the string into k sub-matches that
1240 : * works so far as the child DFA can tell. If k is an allowed number
1241 : * of matches, start the slow part: recurse to verify each sub-match.
1242 : * We always have k <= max_matches, needn't check that.
1243 : */
1244 87 : if (k < min_matches)
1245 0 : goto backtrack;
1246 :
1247 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1248 :
1249 140 : for (i = nverified + 1; i <= k; i++)
1250 : {
1251 : /* zap any match data from a non-last iteration */
1252 113 : zaptreesubs(v, t->child);
1253 113 : er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
1254 113 : if (er == REG_OKAY)
1255 : {
1256 53 : nverified = i;
1257 53 : continue;
1258 : }
1259 60 : if (er == REG_NOMATCH)
1260 60 : break;
1261 : /* oops, something failed */
1262 0 : FREE(endpts);
1263 0 : return er;
1264 : }
1265 :
1266 87 : if (i > k)
1267 : {
1268 : /* satisfaction */
1269 : MDEBUG(("%d: successful\n", t->id));
1270 27 : FREE(endpts);
1271 27 : return REG_OKAY;
1272 : }
1273 :
1274 : /* i'th match failed to verify, so backtrack it */
1275 60 : k = i;
1276 :
1277 366 : backtrack:
1278 :
1279 : /*
1280 : * Must consider shorter versions of the k'th sub-match. However,
1281 : * we'll only ask for a zero-length match if necessary.
1282 : */
1283 366 : while (k > 0)
1284 : {
1285 234 : chr *prev_end = endpts[k - 1];
1286 :
1287 234 : if (endpts[k] > prev_end)
1288 : {
1289 234 : limit = endpts[k] - 1;
1290 234 : if (limit > prev_end ||
1291 0 : (k < min_matches && min_matches - k >= end - prev_end))
1292 : {
1293 : /* break out of backtrack loop, continue the outer one */
1294 : break;
1295 : }
1296 : }
1297 : /* can't shorten k'th sub-match any more, consider previous one */
1298 0 : k--;
1299 : }
1300 : }
1301 :
1302 : /* all possibilities exhausted */
1303 132 : FREE(endpts);
1304 :
1305 : /*
1306 : * Now consider the possibility that we can match to a zero-length string
1307 : * by using zero repetitions.
1308 : */
1309 132 : if (t->min == 0 && begin == end)
1310 : {
1311 : MDEBUG(("%d: allowing zero matches\n", t->id));
1312 90 : return REG_OKAY;
1313 : }
1314 :
1315 : MDEBUG(("%d: failed\n", t->id));
1316 42 : return REG_NOMATCH;
1317 : }
1318 :
1319 : /*
1320 : * creviterdissect - dissect match for iteration node, shortest-first
1321 : */
1322 : static int /* regexec return code */
1323 8 : creviterdissect(struct vars *v,
1324 : struct subre *t,
1325 : chr *begin, /* beginning of relevant substring */
1326 : chr *end) /* end of same */
1327 : {
1328 : struct dfa *d;
1329 : chr **endpts;
1330 : chr *limit;
1331 : int min_matches;
1332 : size_t max_matches;
1333 : int nverified;
1334 : int k;
1335 : int i;
1336 : int er;
1337 :
1338 : assert(t->op == '*');
1339 : assert(t->child != NULL && t->child->cnfa.nstates > 0);
1340 : assert(t->child->flags & SHORTER);
1341 : assert(begin <= end);
1342 :
1343 : MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1344 :
1345 : /*
1346 : * If zero matches are allowed, and target string is empty, just declare
1347 : * victory. OTOH, if target string isn't empty, zero matches can't work
1348 : * so we pretend the min is 1.
1349 : */
1350 8 : min_matches = t->min;
1351 8 : if (min_matches <= 0)
1352 : {
1353 8 : if (begin == end)
1354 : {
1355 : MDEBUG(("%d: allowing zero matches\n", t->id));
1356 1 : return REG_OKAY;
1357 : }
1358 7 : min_matches = 1;
1359 : }
1360 :
1361 : /*
1362 : * We need workspace to track the endpoints of each sub-match. Normally
1363 : * we consider only nonzero-length sub-matches, so there can be at most
1364 : * end-begin of them. However, if min is larger than that, we will also
1365 : * consider zero-length sub-matches in order to find enough matches.
1366 : *
1367 : * For convenience, endpts[0] contains the "begin" pointer and we store
1368 : * sub-match endpoints in endpts[1..max_matches].
1369 : */
1370 7 : max_matches = end - begin;
1371 7 : if (max_matches > t->max && t->max != DUPINF)
1372 7 : max_matches = t->max;
1373 7 : if (max_matches < min_matches)
1374 0 : max_matches = min_matches;
1375 7 : endpts = MALLOC_ARRAY(chr *, max_matches + 1);
1376 7 : if (endpts == NULL)
1377 0 : return REG_ESPACE;
1378 7 : endpts[0] = begin;
1379 :
1380 7 : d = getsubdfa(v, t->child);
1381 7 : if (ISERR())
1382 : {
1383 0 : FREE(endpts);
1384 0 : return v->err;
1385 : }
1386 :
1387 : /*
1388 : * Our strategy is to first find a set of sub-match endpoints that are
1389 : * valid according to the child node's DFA, and then recursively dissect
1390 : * each sub-match to confirm validity. If any validity check fails,
1391 : * backtrack that sub-match and try again. And, when we next try for a
1392 : * validity check, we need not recheck any successfully verified
1393 : * sub-matches that we didn't move the endpoints of. nverified remembers
1394 : * how many sub-matches are currently known okay.
1395 : */
1396 :
1397 : /* initialize to consider first sub-match */
1398 7 : nverified = 0;
1399 7 : k = 1;
1400 7 : limit = begin;
1401 :
1402 : /* iterate until satisfaction or failure */
1403 9 : while (k > 0)
1404 : {
1405 : /* disallow zero-length match unless necessary to achieve min */
1406 7 : if (limit == endpts[k - 1] &&
1407 7 : limit != end &&
1408 0 : (k >= min_matches || min_matches - k < end - limit))
1409 7 : limit++;
1410 :
1411 : /* if this is the last allowed sub-match, it must reach to the end */
1412 7 : if (k >= max_matches)
1413 7 : limit = end;
1414 :
1415 : /* try to find an endpoint for the k'th sub-match */
1416 7 : endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
1417 : (chr **) NULL, (int *) NULL);
1418 7 : if (ISERR())
1419 : {
1420 0 : FREE(endpts);
1421 0 : return v->err;
1422 : }
1423 7 : if (endpts[k] == NULL)
1424 : {
1425 : /* no match possible, so see if we can lengthen previous one */
1426 0 : k--;
1427 0 : goto backtrack;
1428 : }
1429 : MDEBUG(("%d: working endpoint %d: %ld\n",
1430 : t->id, k, LOFF(endpts[k])));
1431 :
1432 : /* k'th sub-match can no longer be considered verified */
1433 7 : if (nverified >= k)
1434 0 : nverified = k - 1;
1435 :
1436 7 : if (endpts[k] != end)
1437 : {
1438 : /* haven't reached end yet, try another iteration if allowed */
1439 0 : if (k >= max_matches)
1440 : {
1441 : /* must try to lengthen some previous match */
1442 0 : k--;
1443 0 : goto backtrack;
1444 : }
1445 :
1446 0 : k++;
1447 0 : limit = endpts[k - 1];
1448 0 : continue;
1449 : }
1450 :
1451 : /*
1452 : * We've identified a way to divide the string into k sub-matches that
1453 : * works so far as the child DFA can tell. If k is an allowed number
1454 : * of matches, start the slow part: recurse to verify each sub-match.
1455 : * We always have k <= max_matches, needn't check that.
1456 : */
1457 7 : if (k < min_matches)
1458 0 : goto backtrack;
1459 :
1460 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1461 :
1462 12 : for (i = nverified + 1; i <= k; i++)
1463 : {
1464 : /* zap any match data from a non-last iteration */
1465 7 : zaptreesubs(v, t->child);
1466 7 : er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
1467 7 : if (er == REG_OKAY)
1468 : {
1469 5 : nverified = i;
1470 5 : continue;
1471 : }
1472 2 : if (er == REG_NOMATCH)
1473 2 : break;
1474 : /* oops, something failed */
1475 0 : FREE(endpts);
1476 0 : return er;
1477 : }
1478 :
1479 7 : if (i > k)
1480 : {
1481 : /* satisfaction */
1482 : MDEBUG(("%d: successful\n", t->id));
1483 5 : FREE(endpts);
1484 5 : return REG_OKAY;
1485 : }
1486 :
1487 : /* i'th match failed to verify, so backtrack it */
1488 2 : k = i;
1489 :
1490 2 : backtrack:
1491 :
1492 : /*
1493 : * Must consider longer versions of the k'th sub-match.
1494 : */
1495 4 : while (k > 0)
1496 : {
1497 2 : if (endpts[k] < end)
1498 : {
1499 0 : limit = endpts[k] + 1;
1500 : /* break out of backtrack loop, continue the outer one */
1501 0 : break;
1502 : }
1503 : /* can't lengthen k'th sub-match any more, consider previous one */
1504 2 : k--;
1505 : }
1506 : }
1507 :
1508 : /* all possibilities exhausted */
1509 : MDEBUG(("%d: failed\n", t->id));
1510 2 : FREE(endpts);
1511 2 : return REG_NOMATCH;
1512 : }
1513 :
1514 :
1515 :
1516 : #include "rege_dfa.c"
|