Line data Source code
1 : /*
2 : * re_*exec and friends - match REs
3 : *
4 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
5 : *
6 : * Development of this software was funded, in part, by Cray Research Inc.,
7 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 : * Corporation, none of whom are responsible for the results. The author
9 : * thanks all of them.
10 : *
11 : * Redistribution and use in source and binary forms -- with or without
12 : * modification -- are permitted for any purpose, provided that
13 : * redistributions in source form retain this entire copyright notice and
14 : * indicate the origin and nature of any modifications.
15 : *
16 : * I'd appreciate being given credit for this package in the documentation
17 : * of software which uses it, but that is not a requirement.
18 : *
19 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 : *
30 : * src/backend/regex/regexec.c
31 : *
32 : */
33 :
34 : #include "regex/regguts.h"
35 :
36 :
37 :
38 : /* lazy-DFA representation */
39 : struct arcp
40 : { /* "pointer" to an outarc */
41 : struct sset *ss;
42 : color co;
43 : };
44 :
45 : struct sset
46 : { /* state set */
47 : unsigned *states; /* pointer to bitvector */
48 : unsigned hash; /* hash of bitvector */
49 : #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
50 : #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
51 : memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
52 : int flags;
53 : #define STARTER 01 /* the initial state set */
54 : #define POSTSTATE 02 /* includes the goal state */
55 : #define LOCKED 04 /* locked in cache */
56 : #define NOPROGRESS 010 /* zero-progress state set */
57 : struct arcp ins; /* chain of inarcs pointing here */
58 : chr *lastseen; /* last entered on arrival here */
59 : struct sset **outs; /* outarc vector indexed by color */
60 : struct arcp *inchain; /* chain-pointer vector for outarcs */
61 : };
62 :
63 : struct dfa
64 : {
65 : int nssets; /* size of cache */
66 : int nssused; /* how many entries occupied yet */
67 : int nstates; /* number of states */
68 : int ncolors; /* length of outarc and inchain vectors */
69 : int wordsper; /* length of state-set bitvectors */
70 : struct sset *ssets; /* state-set cache */
71 : unsigned *statesarea; /* bitvector storage */
72 : unsigned *work; /* pointer to work area within statesarea */
73 : struct sset **outsarea; /* outarc-vector storage */
74 : struct arcp *incarea; /* inchain storage */
75 : struct cnfa *cnfa;
76 : struct colormap *cm;
77 : chr *lastpost; /* location of last cache-flushed success */
78 : chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
79 : struct sset *search; /* replacement-search-pointer memory */
80 : int backno; /* if DFA for a backref, subno it refers to */
81 : short backmin; /* min repetitions for backref */
82 : short backmax; /* max repetitions for backref */
83 : bool ismalloced; /* should this struct dfa be freed? */
84 : bool arraysmalloced; /* should its subsidiary arrays be freed? */
85 : };
86 :
87 : #define WORK 1 /* number of work bitvectors needed */
88 :
89 : /* setup for non-malloc allocation for small cases */
90 : #define FEWSTATES 20 /* must be less than UBITS */
91 : #define FEWCOLORS 15
92 : struct smalldfa
93 : {
94 : struct dfa dfa; /* must be first */
95 : struct sset ssets[FEWSTATES * 2];
96 : unsigned statesarea[FEWSTATES * 2 + WORK];
97 : struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
98 : struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
99 : };
100 :
101 : #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */
102 :
103 :
104 :
105 : /* internal variables, bundled for easy passing around */
106 : struct vars
107 : {
108 : regex_t *re;
109 : struct guts *g;
110 : int eflags; /* copies of arguments */
111 : size_t nmatch;
112 : regmatch_t *pmatch;
113 : rm_detail_t *details;
114 : chr *start; /* start of string */
115 : chr *search_start; /* search start of string */
116 : chr *stop; /* just past end of string */
117 : int err; /* error code if any (0 none) */
118 : struct dfa **subdfas; /* per-tree-subre DFAs */
119 : struct dfa **ladfas; /* per-lacon-subre DFAs */
120 : struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
121 : chr **lblastcp; /* per-lacon-subre lookbehind restart data */
122 : struct smalldfa dfa1;
123 : struct smalldfa dfa2;
124 : };
125 :
126 : #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
127 : #define ISERR() VISERR(v)
128 : #define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e)))
129 : #define ERR(e) VERR(v, e) /* record an error */
130 : #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
131 : #define OFF(p) ((p) - v->start)
132 : #define LOFF(p) ((long)OFF(p))
133 :
134 :
135 :
136 : /*
137 : * forward declarations
138 : */
139 : /* === regexec.c === */
140 : static struct dfa *getsubdfa(struct vars *v, struct subre *t);
141 : static struct dfa *getladfa(struct vars *v, int n);
142 : static int find(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
143 : static int cfind(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
144 : static int cfindloop(struct vars *v, struct cnfa *cnfa, struct colormap *cm,
145 : struct dfa *d, struct dfa *s, chr **coldp);
146 : static void zapallsubs(regmatch_t *p, size_t n);
147 : static void zaptreesubs(struct vars *v, struct subre *t);
148 : static void subset(struct vars *v, struct subre *sub, chr *begin, chr *end);
149 : static int cdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
150 : static int ccondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
151 : static int crevcondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
152 : static int cbrdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
153 : static int caltdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
154 : static int citerdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
155 : static int creviterdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
156 :
157 : /* === rege_dfa.c === */
158 : static chr *longest(struct vars *v, struct dfa *d,
159 : chr *start, chr *stop, int *hitstopp);
160 : static chr *shortest(struct vars *v, struct dfa *d, chr *start, chr *min,
161 : chr *max, chr **coldp, int *hitstopp);
162 : static int matchuntil(struct vars *v, struct dfa *d, chr *probe,
163 : struct sset **lastcss, chr **lastcp);
164 : static chr *dfa_backref(struct vars *v, struct dfa *d, chr *start,
165 : chr *min, chr *max, bool shortest);
166 : static chr *lastcold(struct vars *v, struct dfa *d);
167 : static struct dfa *newdfa(struct vars *v, struct cnfa *cnfa,
168 : struct colormap *cm, struct smalldfa *sml);
169 : static void freedfa(struct dfa *d);
170 : static unsigned hash(unsigned *uv, int n);
171 : static struct sset *initialize(struct vars *v, struct dfa *d, chr *start);
172 : static struct sset *miss(struct vars *v, struct dfa *d, struct sset *css,
173 : color co, chr *cp, chr *start);
174 : static int lacon(struct vars *v, struct cnfa *pcnfa, chr *cp, color co);
175 : static struct sset *getvacant(struct vars *v, struct dfa *d, chr *cp,
176 : chr *start);
177 : static struct sset *pickss(struct vars *v, struct dfa *d, chr *cp,
178 : chr *start);
179 :
180 :
181 : /*
182 : * pg_regexec - match regular expression
183 : */
184 : int
185 2354312 : pg_regexec(regex_t *re,
186 : const chr *string,
187 : size_t len,
188 : size_t search_start,
189 : rm_detail_t *details,
190 : size_t nmatch,
191 : regmatch_t pmatch[],
192 : int flags)
193 : {
194 : struct vars var;
195 2354312 : struct vars *v = &var;
196 : int st;
197 : size_t n;
198 : size_t i;
199 : int backref;
200 :
201 : #define LOCALMAT 20
202 : regmatch_t mat[LOCALMAT];
203 :
204 : #define LOCALDFAS 40
205 : struct dfa *subdfas[LOCALDFAS];
206 :
207 : /* sanity checks */
208 2354312 : if (re == NULL || string == NULL || re->re_magic != REMAGIC)
209 0 : return REG_INVARG;
210 2354312 : if (re->re_csize != sizeof(chr))
211 0 : return REG_MIXED;
212 2354312 : if (search_start > len)
213 6 : return REG_NOMATCH;
214 :
215 : /* Initialize locale-dependent support */
216 2354306 : pg_set_regex_collation(re->re_collation);
217 :
218 : /* setup */
219 2354306 : v->re = re;
220 2354306 : v->g = (struct guts *) re->re_guts;
221 2354306 : if ((v->g->cflags & REG_EXPECT) && details == NULL)
222 0 : return REG_INVARG;
223 2354306 : if (v->g->info & REG_UIMPOSSIBLE)
224 1432 : return REG_NOMATCH;
225 2352874 : backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
226 2352874 : v->eflags = flags;
227 2352874 : if (backref && nmatch <= v->g->nsub)
228 : {
229 : /* need larger work area */
230 182 : v->nmatch = v->g->nsub + 1;
231 182 : if (v->nmatch <= LOCALMAT)
232 180 : v->pmatch = mat;
233 : else
234 2 : v->pmatch = (regmatch_t *) MALLOC(v->nmatch * sizeof(regmatch_t));
235 182 : if (v->pmatch == NULL)
236 0 : return REG_ESPACE;
237 182 : zapallsubs(v->pmatch, v->nmatch);
238 : }
239 : else
240 : {
241 : /* we can store results directly in caller's array */
242 2352692 : v->pmatch = pmatch;
243 : /* ensure any extra entries in caller's array are filled with -1 */
244 2352692 : if (nmatch > 0)
245 1116416 : zapallsubs(pmatch, nmatch);
246 : /* then forget about extra entries, to avoid useless work in find() */
247 2352692 : if (nmatch > v->g->nsub + 1)
248 1002 : nmatch = v->g->nsub + 1;
249 2352692 : v->nmatch = nmatch;
250 : }
251 2352874 : v->details = details;
252 2352874 : v->start = (chr *) string;
253 2352874 : v->search_start = (chr *) string + search_start;
254 2352874 : v->stop = (chr *) string + len;
255 2352874 : v->err = 0;
256 2352874 : v->subdfas = NULL;
257 2352874 : v->ladfas = NULL;
258 2352874 : v->lblastcss = NULL;
259 2352874 : v->lblastcp = NULL;
260 : /* below this point, "goto cleanup" will behave sanely */
261 :
262 : assert(v->g->ntree >= 0);
263 2352874 : n = (size_t) v->g->ntree;
264 2352874 : if (n <= LOCALDFAS)
265 2352870 : v->subdfas = subdfas;
266 : else
267 : {
268 4 : v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
269 4 : if (v->subdfas == NULL)
270 : {
271 0 : st = REG_ESPACE;
272 0 : goto cleanup;
273 : }
274 : }
275 7090818 : for (i = 0; i < n; i++)
276 4737944 : v->subdfas[i] = NULL;
277 :
278 : assert(v->g->nlacons >= 0);
279 2352874 : n = (size_t) v->g->nlacons;
280 2352874 : if (n > 0)
281 : {
282 1260 : v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
283 1260 : if (v->ladfas == NULL)
284 : {
285 0 : st = REG_ESPACE;
286 0 : goto cleanup;
287 : }
288 3844 : for (i = 0; i < n; i++)
289 2584 : v->ladfas[i] = NULL;
290 1260 : v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
291 1260 : v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
292 1260 : if (v->lblastcss == NULL || v->lblastcp == NULL)
293 : {
294 0 : st = REG_ESPACE;
295 0 : goto cleanup;
296 : }
297 3844 : for (i = 0; i < n; i++)
298 : {
299 2584 : v->lblastcss[i] = NULL;
300 2584 : v->lblastcp[i] = NULL;
301 : }
302 : }
303 :
304 : /* do it */
305 : assert(v->g->tree != NULL);
306 2352874 : if (backref)
307 290 : st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
308 : else
309 2352584 : st = find(v, &v->g->tree->cnfa, &v->g->cmap);
310 :
311 : /* on success, ensure caller's match vector is filled correctly */
312 2352874 : if (st == REG_OKAY && nmatch > 0)
313 : {
314 904154 : if (v->pmatch != pmatch)
315 : {
316 : /* copy portion of match vector over from (larger) work area */
317 : assert(nmatch <= v->nmatch);
318 42 : memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
319 : }
320 904154 : if (v->g->cflags & REG_NOSUB)
321 : {
322 : /* don't expose possibly-partial sub-match results to caller */
323 898520 : zapallsubs(pmatch, nmatch);
324 : }
325 : }
326 :
327 : /* clean up */
328 1454354 : cleanup:
329 2352874 : if (v->pmatch != pmatch && v->pmatch != mat)
330 2 : FREE(v->pmatch);
331 2352874 : if (v->subdfas != NULL)
332 : {
333 2352874 : n = (size_t) v->g->ntree;
334 7090818 : for (i = 0; i < n; i++)
335 : {
336 4737944 : if (v->subdfas[i] != NULL)
337 24752 : freedfa(v->subdfas[i]);
338 : }
339 2352874 : if (v->subdfas != subdfas)
340 4 : FREE(v->subdfas);
341 : }
342 2352874 : if (v->ladfas != NULL)
343 : {
344 1260 : n = (size_t) v->g->nlacons;
345 3844 : for (i = 0; i < n; i++)
346 : {
347 2584 : if (v->ladfas[i] != NULL)
348 106 : freedfa(v->ladfas[i]);
349 : }
350 1260 : FREE(v->ladfas);
351 : }
352 2352874 : if (v->lblastcss != NULL)
353 1260 : FREE(v->lblastcss);
354 2352874 : if (v->lblastcp != NULL)
355 1260 : FREE(v->lblastcp);
356 :
357 : #ifdef REG_DEBUG
358 : if (v->eflags & (REG_FTRACE | REG_MTRACE))
359 : fflush(stdout);
360 : #endif
361 :
362 2352874 : return st;
363 : }
364 :
365 : /*
366 : * getsubdfa - create or re-fetch the DFA for a tree subre node
367 : *
368 : * We only need to create the DFA once per overall regex execution.
369 : * The DFA will be freed by the cleanup step in pg_regexec().
370 : */
371 : static struct dfa *
372 26124 : getsubdfa(struct vars *v,
373 : struct subre *t)
374 : {
375 26124 : struct dfa *d = v->subdfas[t->id];
376 :
377 26124 : if (d == NULL)
378 : {
379 24752 : d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
380 24752 : if (d == NULL)
381 0 : return NULL;
382 : /* set up additional info if this is a backref node */
383 24752 : if (t->op == 'b')
384 : {
385 280 : d->backno = t->backno;
386 280 : d->backmin = t->min;
387 280 : d->backmax = t->max;
388 : }
389 24752 : v->subdfas[t->id] = d;
390 : }
391 26124 : return d;
392 : }
393 :
394 : /*
395 : * getladfa - create or re-fetch the DFA for a LACON subre node
396 : *
397 : * Same as above, but for LACONs.
398 : */
399 : static struct dfa *
400 240 : getladfa(struct vars *v,
401 : int n)
402 : {
403 : assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
404 :
405 240 : if (v->ladfas[n] == NULL)
406 : {
407 106 : struct subre *sub = &v->g->lacons[n];
408 :
409 106 : v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
410 : /* a LACON can't contain a backref, so nothing else to do */
411 : }
412 240 : return v->ladfas[n];
413 : }
414 :
415 : /*
416 : * find - find a match for the main NFA (no-complications case)
417 : */
418 : static int
419 2352584 : find(struct vars *v,
420 : struct cnfa *cnfa,
421 : struct colormap *cm)
422 : {
423 : struct dfa *s;
424 : struct dfa *d;
425 : chr *begin;
426 2352584 : chr *end = NULL;
427 : chr *cold;
428 : chr *open; /* open and close of range of possible starts */
429 : chr *close;
430 : int hitend;
431 2352584 : int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
432 :
433 : /* first, a shot with the search RE */
434 2352584 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
435 2352584 : if (s == NULL)
436 0 : return v->err;
437 : MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
438 2352584 : cold = NULL;
439 2352584 : close = shortest(v, s, v->search_start, v->search_start, v->stop,
440 : &cold, (int *) NULL);
441 2352584 : freedfa(s);
442 2352584 : NOERR();
443 2352584 : if (v->g->cflags & REG_EXPECT)
444 : {
445 : assert(v->details != NULL);
446 54 : if (cold != NULL)
447 54 : v->details->rm_extend.rm_so = OFF(cold);
448 : else
449 0 : v->details->rm_extend.rm_so = OFF(v->stop);
450 54 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
451 : }
452 2352584 : if (close == NULL) /* not found */
453 1205048 : return REG_NOMATCH;
454 1147536 : if (v->nmatch == 0) /* found, don't need exact location */
455 243498 : return REG_OKAY;
456 :
457 : /* find starting point and match */
458 : assert(cold != NULL);
459 904038 : open = cold;
460 904038 : cold = NULL;
461 : MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
462 904038 : d = newdfa(v, cnfa, cm, &v->dfa1);
463 904038 : if (d == NULL)
464 0 : return v->err;
465 904096 : for (begin = open; begin <= close; begin++)
466 : {
467 : MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
468 904096 : if (shorter)
469 124 : end = shortest(v, d, begin, begin, v->stop,
470 : (chr **) NULL, &hitend);
471 : else
472 903972 : end = longest(v, d, begin, v->stop, &hitend);
473 904096 : if (ISERR())
474 : {
475 0 : freedfa(d);
476 0 : return v->err;
477 : }
478 904096 : if (hitend && cold == NULL)
479 6426 : cold = begin;
480 904096 : if (end != NULL)
481 904038 : break; /* NOTE BREAK OUT */
482 : }
483 : assert(end != NULL); /* search RE succeeded so loop should */
484 904038 : freedfa(d);
485 :
486 : /* and pin down details */
487 : assert(v->nmatch > 0);
488 904038 : v->pmatch[0].rm_so = OFF(begin);
489 904038 : v->pmatch[0].rm_eo = OFF(end);
490 904038 : if (v->g->cflags & REG_EXPECT)
491 : {
492 20 : if (cold != NULL)
493 10 : v->details->rm_extend.rm_so = OFF(cold);
494 : else
495 10 : v->details->rm_extend.rm_so = OFF(v->stop);
496 20 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
497 : }
498 904038 : if (v->nmatch == 1) /* no need for submatches */
499 899634 : return REG_OKAY;
500 :
501 : /* find submatches */
502 4404 : return cdissect(v, v->g->tree, begin, end);
503 : }
504 :
505 : /*
506 : * cfind - find a match for the main NFA (with complications)
507 : */
508 : static int
509 290 : cfind(struct vars *v,
510 : struct cnfa *cnfa,
511 : struct colormap *cm)
512 : {
513 : struct dfa *s;
514 : struct dfa *d;
515 : chr *cold;
516 : int ret;
517 :
518 290 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
519 290 : if (s == NULL)
520 0 : return v->err;
521 290 : d = newdfa(v, cnfa, cm, &v->dfa2);
522 290 : if (d == NULL)
523 : {
524 0 : freedfa(s);
525 0 : return v->err;
526 : }
527 :
528 290 : ret = cfindloop(v, cnfa, cm, d, s, &cold);
529 :
530 290 : freedfa(d);
531 290 : freedfa(s);
532 290 : NOERR();
533 290 : if (v->g->cflags & REG_EXPECT)
534 : {
535 : assert(v->details != NULL);
536 0 : if (cold != NULL)
537 0 : v->details->rm_extend.rm_so = OFF(cold);
538 : else
539 0 : v->details->rm_extend.rm_so = OFF(v->stop);
540 0 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
541 : }
542 290 : return ret;
543 : }
544 :
545 : /*
546 : * cfindloop - the heart of cfind
547 : */
548 : static int
549 290 : cfindloop(struct vars *v,
550 : struct cnfa *cnfa,
551 : struct colormap *cm,
552 : struct dfa *d,
553 : struct dfa *s,
554 : chr **coldp) /* where to put coldstart pointer */
555 : {
556 : chr *begin;
557 : chr *end;
558 : chr *cold;
559 : chr *open; /* open and close of range of possible starts */
560 : chr *close;
561 : chr *estart;
562 : chr *estop;
563 : int er;
564 290 : int shorter = v->g->tree->flags & SHORTER;
565 : int hitend;
566 :
567 : assert(d != NULL && s != NULL);
568 290 : cold = NULL;
569 290 : close = v->search_start;
570 : do
571 : {
572 : /* Search with the search RE for match range at/beyond "close" */
573 : MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
574 322 : close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
575 322 : if (ISERR())
576 : {
577 0 : *coldp = cold;
578 0 : return v->err;
579 : }
580 322 : if (close == NULL)
581 30 : break; /* no more possible match anywhere */
582 : assert(cold != NULL);
583 292 : open = cold;
584 292 : cold = NULL;
585 : /* Search for matches starting between "open" and "close" inclusive */
586 : MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
587 1026 : for (begin = open; begin <= close; begin++)
588 : {
589 : MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
590 904 : estart = begin;
591 904 : estop = v->stop;
592 : for (;;)
593 : {
594 : /* Here we use the top node's detailed RE */
595 1292 : if (shorter)
596 194 : end = shortest(v, d, begin, estart,
597 : estop, (chr **) NULL, &hitend);
598 : else
599 1098 : end = longest(v, d, begin, estop,
600 : &hitend);
601 1292 : if (ISERR())
602 : {
603 0 : *coldp = cold;
604 0 : return v->err;
605 : }
606 1292 : if (hitend && cold == NULL)
607 208 : cold = begin;
608 1292 : if (end == NULL)
609 698 : break; /* no match with this begin point, try next */
610 : MDEBUG(("tentative end %ld\n", LOFF(end)));
611 : /* Dissect the potential match to see if it really matches */
612 594 : er = cdissect(v, v->g->tree, begin, end);
613 594 : if (er == REG_OKAY)
614 : {
615 170 : if (v->nmatch > 0)
616 : {
617 170 : v->pmatch[0].rm_so = OFF(begin);
618 170 : v->pmatch[0].rm_eo = OFF(end);
619 : }
620 170 : *coldp = cold;
621 170 : return REG_OKAY;
622 : }
623 424 : if (er != REG_NOMATCH)
624 : {
625 0 : ERR(er);
626 0 : *coldp = cold;
627 0 : return er;
628 : }
629 : /* Try next longer/shorter match with same begin point */
630 424 : if (shorter)
631 : {
632 162 : if (end == estop)
633 24 : break; /* no more, so try next begin point */
634 138 : estart = end + 1;
635 : }
636 : else
637 : {
638 262 : if (end == begin)
639 12 : break; /* no more, so try next begin point */
640 250 : estop = end - 1;
641 : }
642 : } /* end loop over endpoint positions */
643 : } /* end loop over beginning positions */
644 :
645 : /*
646 : * If we get here, there is no possible match starting at or before
647 : * "close", so consider matches beyond that. We'll do a fresh search
648 : * with the search RE to find a new promising match range.
649 : */
650 122 : close++;
651 122 : } while (close < v->stop);
652 :
653 120 : *coldp = cold;
654 120 : return REG_NOMATCH;
655 : }
656 :
657 : /*
658 : * zapallsubs - initialize all subexpression matches to "no match"
659 : *
660 : * Note that p[0], the overall-match location, is not touched.
661 : */
662 : static void
663 2015118 : zapallsubs(regmatch_t *p,
664 : size_t n)
665 : {
666 : size_t i;
667 :
668 2031526 : for (i = n - 1; i > 0; i--)
669 : {
670 16408 : p[i].rm_so = -1;
671 16408 : p[i].rm_eo = -1;
672 : }
673 2015118 : }
674 :
675 : /*
676 : * zaptreesubs - initialize subexpressions within subtree to "no match"
677 : */
678 : static void
679 1040 : zaptreesubs(struct vars *v,
680 : struct subre *t)
681 : {
682 1040 : int n = t->capno;
683 : struct subre *t2;
684 :
685 1040 : if (n > 0)
686 : {
687 486 : if ((size_t) n < v->nmatch)
688 : {
689 486 : v->pmatch[n].rm_so = -1;
690 486 : v->pmatch[n].rm_eo = -1;
691 : }
692 : }
693 :
694 1390 : for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
695 350 : zaptreesubs(v, t2);
696 1040 : }
697 :
698 : /*
699 : * subset - set subexpression match data for a successful subre
700 : */
701 : static void
702 8110 : subset(struct vars *v,
703 : struct subre *sub,
704 : chr *begin,
705 : chr *end)
706 : {
707 8110 : int n = sub->capno;
708 :
709 : assert(n > 0);
710 8110 : if ((size_t) n >= v->nmatch)
711 18 : return;
712 :
713 : MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, n, LOFF(begin), LOFF(end)));
714 8092 : v->pmatch[n].rm_so = OFF(begin);
715 8092 : v->pmatch[n].rm_eo = OFF(end);
716 : }
717 :
718 : /*
719 : * cdissect - check backrefs and determine subexpression matches
720 : *
721 : * cdissect recursively processes a subre tree to check matching of backrefs
722 : * and/or identify submatch boundaries for capture nodes. The proposed match
723 : * runs from "begin" to "end" (not including "end"), and we are basically
724 : * "dissecting" it to see where the submatches are.
725 : *
726 : * Before calling any level of cdissect, the caller must have run the node's
727 : * DFA and found that the proposed substring satisfies the DFA. (We make
728 : * the caller do that because in concatenation and iteration nodes, it's
729 : * much faster to check all the substrings against the child DFAs before we
730 : * recurse.)
731 : *
732 : * A side-effect of a successful match is to save match locations for
733 : * capturing subexpressions in v->pmatch[]. This is a little bit tricky,
734 : * so we make the following rules:
735 : * 1. Before initial entry to cdissect, all match data must have been
736 : * cleared (this is seen to by zapallsubs).
737 : * 2. Before any recursive entry to cdissect, the match data for that
738 : * subexpression tree must be guaranteed clear (see zaptreesubs).
739 : * 3. When returning REG_OKAY, each level of cdissect will have saved
740 : * any relevant match locations.
741 : * 4. When returning REG_NOMATCH, each level of cdissect will guarantee
742 : * that its subexpression match locations are again clear.
743 : * 5. No guarantees are made for error cases (i.e., other result codes).
744 : * 6. When a level of cdissect abandons a successful sub-match, it will
745 : * clear that subtree's match locations with zaptreesubs before trying
746 : * any new DFA match or cdissect call for that subtree or any subtree
747 : * to its right (that is, any subtree that could have a backref into the
748 : * abandoned match).
749 : * This may seem overly complicated, but it's difficult to simplify it
750 : * because of the provision that match locations must be reset before
751 : * any fresh DFA match (a rule that is needed to make dfa_backref safe).
752 : * That means it won't work to just reset relevant match locations at the
753 : * start of each cdissect level.
754 : */
755 : static int /* regexec return code */
756 30480 : cdissect(struct vars *v,
757 : struct subre *t,
758 : chr *begin, /* beginning of relevant substring */
759 : chr *end) /* end of same */
760 : {
761 : int er;
762 :
763 : assert(t != NULL);
764 : MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));
765 :
766 : /* handy place to check for operation cancel */
767 30480 : INTERRUPT(v->re);
768 : /* ... and stack overrun */
769 30480 : if (STACK_TOO_DEEP(v->re))
770 0 : return REG_ETOOBIG;
771 :
772 30480 : switch (t->op)
773 : {
774 16766 : case '=': /* terminal node */
775 : assert(t->child == NULL);
776 16766 : er = REG_OKAY; /* no action, parent did the work */
777 16766 : break;
778 418 : case 'b': /* back reference */
779 : assert(t->child == NULL);
780 418 : er = cbrdissect(v, t, begin, end);
781 418 : break;
782 12828 : case '.': /* concatenation */
783 : assert(t->child != NULL);
784 12828 : if (t->child->flags & SHORTER) /* reverse scan */
785 328 : er = crevcondissect(v, t, begin, end);
786 : else
787 12500 : er = ccondissect(v, t, begin, end);
788 12828 : break;
789 160 : case '|': /* alternation */
790 : assert(t->child != NULL);
791 160 : er = caltdissect(v, t, begin, end);
792 160 : break;
793 248 : case '*': /* iteration */
794 : assert(t->child != NULL);
795 248 : if (t->child->flags & SHORTER) /* reverse scan */
796 14 : er = creviterdissect(v, t, begin, end);
797 : else
798 234 : er = citerdissect(v, t, begin, end);
799 248 : break;
800 60 : case '(': /* no-op capture node */
801 : assert(t->child != NULL);
802 60 : er = cdissect(v, t->child, begin, end);
803 60 : break;
804 0 : default:
805 0 : er = REG_ASSERT;
806 0 : break;
807 : }
808 :
809 : /*
810 : * We should never have a match failure unless backrefs lurk below;
811 : * otherwise, either caller failed to check the DFA, or there's some
812 : * inconsistency between the DFA and the node's innards.
813 : */
814 : assert(er != REG_NOMATCH || (t->flags & BACKR));
815 :
816 : /*
817 : * If this node is marked as capturing, save successful match's location.
818 : */
819 30480 : if (t->capno > 0 && er == REG_OKAY)
820 8110 : subset(v, t, begin, end);
821 :
822 30480 : return er;
823 : }
824 :
825 : /*
826 : * ccondissect - dissect match for concatenation node
827 : */
828 : static int /* regexec return code */
829 12500 : ccondissect(struct vars *v,
830 : struct subre *t,
831 : chr *begin, /* beginning of relevant substring */
832 : chr *end) /* end of same */
833 : {
834 12500 : struct subre *left = t->child;
835 12500 : struct subre *right = left->sibling;
836 : struct dfa *d;
837 : struct dfa *d2;
838 : chr *mid;
839 : int er;
840 :
841 : assert(t->op == '.');
842 : assert(left != NULL && left->cnfa.nstates > 0);
843 : assert(right != NULL && right->cnfa.nstates > 0);
844 : assert(right->sibling == NULL);
845 : assert(!(left->flags & SHORTER));
846 :
847 12500 : d = getsubdfa(v, left);
848 12500 : NOERR();
849 12500 : d2 = getsubdfa(v, right);
850 12500 : NOERR();
851 : MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
852 :
853 : /* pick a tentative midpoint */
854 12500 : mid = longest(v, d, begin, end, (int *) NULL);
855 12500 : NOERR();
856 12500 : if (mid == NULL)
857 16 : return REG_NOMATCH;
858 : MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
859 :
860 : /* iterate until satisfaction or failure */
861 : for (;;)
862 : {
863 : /* try this midpoint on for size */
864 15144 : if (longest(v, d2, mid, end, (int *) NULL) == end)
865 : {
866 12402 : er = cdissect(v, left, begin, mid);
867 12402 : if (er == REG_OKAY)
868 : {
869 12302 : er = cdissect(v, right, mid, end);
870 12302 : if (er == REG_OKAY)
871 : {
872 : /* satisfaction */
873 : MDEBUG(("%d: successful\n", t->id));
874 11806 : return REG_OKAY;
875 : }
876 : /* Reset left's matches (right should have done so itself) */
877 496 : zaptreesubs(v, left);
878 : }
879 596 : if (er != REG_NOMATCH)
880 0 : return er;
881 : }
882 3338 : NOERR();
883 :
884 : /* that midpoint didn't work, find a new one */
885 3338 : if (mid == begin)
886 : {
887 : /* all possibilities exhausted */
888 : MDEBUG(("%d: no midpoint\n", t->id));
889 150 : return REG_NOMATCH;
890 : }
891 3188 : mid = longest(v, d, begin, mid - 1, (int *) NULL);
892 3188 : NOERR();
893 3188 : if (mid == NULL)
894 : {
895 : /* failed to find a new one */
896 : MDEBUG(("%d: failed midpoint\n", t->id));
897 528 : return REG_NOMATCH;
898 : }
899 : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
900 : }
901 :
902 : /* can't get here */
903 : return REG_ASSERT;
904 : }
905 :
906 : /*
907 : * crevcondissect - dissect match for concatenation node, shortest-first
908 : */
909 : static int /* regexec return code */
910 328 : crevcondissect(struct vars *v,
911 : struct subre *t,
912 : chr *begin, /* beginning of relevant substring */
913 : chr *end) /* end of same */
914 : {
915 328 : struct subre *left = t->child;
916 328 : struct subre *right = left->sibling;
917 : struct dfa *d;
918 : struct dfa *d2;
919 : chr *mid;
920 : int er;
921 :
922 : assert(t->op == '.');
923 : assert(left != NULL && left->cnfa.nstates > 0);
924 : assert(right != NULL && right->cnfa.nstates > 0);
925 : assert(right->sibling == NULL);
926 : assert(left->flags & SHORTER);
927 :
928 328 : d = getsubdfa(v, left);
929 328 : NOERR();
930 328 : d2 = getsubdfa(v, right);
931 328 : NOERR();
932 : MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
933 :
934 : /* pick a tentative midpoint */
935 328 : mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
936 328 : NOERR();
937 328 : if (mid == NULL)
938 0 : return REG_NOMATCH;
939 : MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
940 :
941 : /* iterate until satisfaction or failure */
942 : for (;;)
943 : {
944 : /* try this midpoint on for size */
945 1210 : if (longest(v, d2, mid, end, (int *) NULL) == end)
946 : {
947 186 : er = cdissect(v, left, begin, mid);
948 186 : if (er == REG_OKAY)
949 : {
950 186 : er = cdissect(v, right, mid, end);
951 186 : if (er == REG_OKAY)
952 : {
953 : /* satisfaction */
954 : MDEBUG(("%d: successful\n", t->id));
955 162 : return REG_OKAY;
956 : }
957 : /* Reset left's matches (right should have done so itself) */
958 24 : zaptreesubs(v, left);
959 : }
960 24 : if (er != REG_NOMATCH)
961 0 : return er;
962 : }
963 1048 : NOERR();
964 :
965 : /* that midpoint didn't work, find a new one */
966 1048 : if (mid == end)
967 : {
968 : /* all possibilities exhausted */
969 : MDEBUG(("%d: no midpoint\n", t->id));
970 166 : return REG_NOMATCH;
971 : }
972 882 : mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
973 882 : NOERR();
974 882 : if (mid == NULL)
975 : {
976 : /* failed to find a new one */
977 : MDEBUG(("%d: failed midpoint\n", t->id));
978 0 : return REG_NOMATCH;
979 : }
980 : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
981 : }
982 :
983 : /* can't get here */
984 : return REG_ASSERT;
985 : }
986 :
987 : /*
988 : * cbrdissect - dissect match for backref node
989 : *
990 : * The backref match might already have been verified by dfa_backref(),
991 : * but we don't know that for sure so must check it here.
992 : */
993 : static int /* regexec return code */
994 418 : cbrdissect(struct vars *v,
995 : struct subre *t,
996 : chr *begin, /* beginning of relevant substring */
997 : chr *end) /* end of same */
998 : {
999 418 : int n = t->backno;
1000 : size_t numreps;
1001 : size_t tlen;
1002 : size_t brlen;
1003 : chr *brstring;
1004 : chr *p;
1005 418 : int min = t->min;
1006 418 : int max = t->max;
1007 :
1008 : assert(t != NULL);
1009 : assert(t->op == 'b');
1010 : assert(n >= 0);
1011 : assert((size_t) n < v->nmatch);
1012 :
1013 : MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, n, min, max,
1014 : LOFF(begin), LOFF(end)));
1015 :
1016 : /* get the backreferenced string */
1017 418 : if (v->pmatch[n].rm_so == -1)
1018 110 : return REG_NOMATCH;
1019 308 : brstring = v->start + v->pmatch[n].rm_so;
1020 308 : brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
1021 :
1022 : /* special cases for zero-length strings */
1023 308 : if (brlen == 0)
1024 : {
1025 : /*
1026 : * matches only if target is zero length, but any number of
1027 : * repetitions can be considered to be present
1028 : */
1029 14 : if (begin == end && min <= max)
1030 : {
1031 : MDEBUG(("%d: backref matched trivially\n", t->id));
1032 14 : return REG_OKAY;
1033 : }
1034 0 : return REG_NOMATCH;
1035 : }
1036 294 : if (begin == end)
1037 : {
1038 : /* matches only if zero repetitions are okay */
1039 12 : if (min == 0)
1040 : {
1041 : MDEBUG(("%d: backref matched trivially\n", t->id));
1042 10 : return REG_OKAY;
1043 : }
1044 2 : return REG_NOMATCH;
1045 : }
1046 :
1047 : /*
1048 : * check target length to see if it could possibly be an allowed number of
1049 : * repetitions of brstring
1050 : */
1051 : assert(end > begin);
1052 282 : tlen = end - begin;
1053 282 : if (tlen % brlen != 0)
1054 10 : return REG_NOMATCH;
1055 272 : numreps = tlen / brlen;
1056 272 : if (numreps < min || (numreps > max && max != DUPINF))
1057 6 : return REG_NOMATCH;
1058 :
1059 : /* okay, compare the actual string contents */
1060 266 : p = begin;
1061 472 : while (numreps-- > 0)
1062 : {
1063 302 : if ((*v->g->compare) (brstring, p, brlen) != 0)
1064 96 : return REG_NOMATCH;
1065 206 : p += brlen;
1066 : }
1067 :
1068 : MDEBUG(("%d: backref matched\n", t->id));
1069 170 : return REG_OKAY;
1070 : }
1071 :
1072 : /*
1073 : * caltdissect - dissect match for alternation node
1074 : */
1075 : static int /* regexec return code */
1076 160 : caltdissect(struct vars *v,
1077 : struct subre *t,
1078 : chr *begin, /* beginning of relevant substring */
1079 : chr *end) /* end of same */
1080 : {
1081 : struct dfa *d;
1082 : int er;
1083 :
1084 : assert(t->op == '|');
1085 :
1086 160 : t = t->child;
1087 : /* there should be at least 2 alternatives */
1088 : assert(t != NULL && t->sibling != NULL);
1089 :
1090 274 : while (t != NULL)
1091 : {
1092 : assert(t->cnfa.nstates > 0);
1093 :
1094 : MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1095 :
1096 222 : d = getsubdfa(v, t);
1097 222 : NOERR();
1098 222 : if (longest(v, d, begin, end, (int *) NULL) == end)
1099 : {
1100 : MDEBUG(("%d: caltdissect matched\n", t->id));
1101 176 : er = cdissect(v, t, begin, end);
1102 176 : if (er != REG_NOMATCH)
1103 108 : return er;
1104 : }
1105 114 : NOERR();
1106 :
1107 114 : t = t->sibling;
1108 : }
1109 :
1110 52 : return REG_NOMATCH;
1111 : }
1112 :
1113 : /*
1114 : * citerdissect - dissect match for iteration node
1115 : */
1116 : static int /* regexec return code */
1117 234 : citerdissect(struct vars *v,
1118 : struct subre *t,
1119 : chr *begin, /* beginning of relevant substring */
1120 : chr *end) /* end of same */
1121 : {
1122 : struct dfa *d;
1123 : chr **endpts;
1124 : chr *limit;
1125 : int min_matches;
1126 : size_t max_matches;
1127 : int nverified;
1128 : int k;
1129 : int i;
1130 : int er;
1131 :
1132 : assert(t->op == '*');
1133 : assert(t->child != NULL && t->child->cnfa.nstates > 0);
1134 : assert(!(t->child->flags & SHORTER));
1135 : assert(begin <= end);
1136 :
1137 : MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1138 :
1139 : /*
1140 : * For the moment, assume the minimum number of matches is 1. If zero
1141 : * matches are allowed, and the target string is empty, we are allowed to
1142 : * match regardless of the contents of the iter node --- but we would
1143 : * prefer to match once, so that capturing parens get set. (An example of
1144 : * the concern here is a pattern like "()*\1", which historically this
1145 : * code has allowed to succeed.) Therefore, we deal with the zero-matches
1146 : * case at the bottom, after failing to find any other way to match.
1147 : */
1148 234 : min_matches = t->min;
1149 234 : if (min_matches <= 0)
1150 162 : min_matches = 1;
1151 :
1152 : /*
1153 : * We need workspace to track the endpoints of each sub-match. Normally
1154 : * we consider only nonzero-length sub-matches, so there can be at most
1155 : * end-begin of them. However, if min is larger than that, we will also
1156 : * consider zero-length sub-matches in order to find enough matches.
1157 : *
1158 : * For convenience, endpts[0] contains the "begin" pointer and we store
1159 : * sub-match endpoints in endpts[1..max_matches].
1160 : */
1161 234 : max_matches = end - begin;
1162 234 : if (max_matches > t->max && t->max != DUPINF)
1163 0 : max_matches = t->max;
1164 234 : if (max_matches < min_matches)
1165 146 : max_matches = min_matches;
1166 234 : endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1167 234 : if (endpts == NULL)
1168 0 : return REG_ESPACE;
1169 234 : endpts[0] = begin;
1170 :
1171 234 : d = getsubdfa(v, t->child);
1172 234 : if (ISERR())
1173 : {
1174 0 : FREE(endpts);
1175 0 : return v->err;
1176 : }
1177 :
1178 : /*
1179 : * Our strategy is to first find a set of sub-match endpoints that are
1180 : * valid according to the child node's DFA, and then recursively dissect
1181 : * each sub-match to confirm validity. If any validity check fails,
1182 : * backtrack that sub-match and try again. And, when we next try for a
1183 : * validity check, we need not recheck any successfully verified
1184 : * sub-matches that we didn't move the endpoints of. nverified remembers
1185 : * how many sub-matches are currently known okay.
1186 : */
1187 :
1188 : /* initialize to consider first sub-match */
1189 234 : nverified = 0;
1190 234 : k = 1;
1191 234 : limit = end;
1192 :
1193 : /* iterate until satisfaction or failure */
1194 1006 : while (k > 0)
1195 : {
1196 : /* try to find an endpoint for the k'th sub-match */
1197 814 : endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
1198 814 : if (ISERR())
1199 : {
1200 0 : FREE(endpts);
1201 0 : return v->err;
1202 : }
1203 814 : if (endpts[k] == NULL)
1204 : {
1205 : /* no match possible, so see if we can shorten previous one */
1206 424 : k--;
1207 424 : goto backtrack;
1208 : }
1209 : MDEBUG(("%d: working endpoint %d: %ld\n",
1210 : t->id, k, LOFF(endpts[k])));
1211 :
1212 : /* k'th sub-match can no longer be considered verified */
1213 390 : if (nverified >= k)
1214 16 : nverified = k - 1;
1215 :
1216 390 : if (endpts[k] != end)
1217 : {
1218 : /* haven't reached end yet, try another iteration if allowed */
1219 268 : if (k >= max_matches)
1220 : {
1221 : /* must try to shorten some previous match */
1222 0 : k--;
1223 0 : goto backtrack;
1224 : }
1225 :
1226 : /* reject zero-length match unless necessary to achieve min */
1227 268 : if (endpts[k] == endpts[k - 1] &&
1228 0 : (k >= min_matches || min_matches - k < end - endpts[k]))
1229 0 : goto backtrack;
1230 :
1231 268 : k++;
1232 268 : limit = end;
1233 268 : continue;
1234 : }
1235 :
1236 : /*
1237 : * We've identified a way to divide the string into k sub-matches that
1238 : * works so far as the child DFA can tell. If k is an allowed number
1239 : * of matches, start the slow part: recurse to verify each sub-match.
1240 : * We always have k <= max_matches, needn't check that.
1241 : */
1242 122 : if (k < min_matches)
1243 0 : goto backtrack;
1244 :
1245 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1246 :
1247 200 : for (i = nverified + 1; i <= k; i++)
1248 : {
1249 : /* zap any match data from a non-last iteration */
1250 158 : zaptreesubs(v, t->child);
1251 158 : er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
1252 158 : if (er == REG_OKAY)
1253 : {
1254 78 : nverified = i;
1255 78 : continue;
1256 : }
1257 80 : if (er == REG_NOMATCH)
1258 80 : break;
1259 : /* oops, something failed */
1260 0 : FREE(endpts);
1261 0 : return er;
1262 : }
1263 :
1264 122 : if (i > k)
1265 : {
1266 : /* satisfaction */
1267 : MDEBUG(("%d: successful\n", t->id));
1268 42 : FREE(endpts);
1269 42 : return REG_OKAY;
1270 : }
1271 :
1272 : /* i'th match failed to verify, so backtrack it */
1273 80 : k = i;
1274 :
1275 504 : backtrack:
1276 :
1277 : /*
1278 : * Must consider shorter versions of the k'th sub-match. However,
1279 : * we'll only ask for a zero-length match if necessary.
1280 : */
1281 504 : while (k > 0)
1282 : {
1283 312 : chr *prev_end = endpts[k - 1];
1284 :
1285 312 : if (endpts[k] > prev_end)
1286 : {
1287 312 : limit = endpts[k] - 1;
1288 312 : if (limit > prev_end ||
1289 0 : (k < min_matches && min_matches - k >= end - prev_end))
1290 : {
1291 : /* break out of backtrack loop, continue the outer one */
1292 : break;
1293 : }
1294 : }
1295 : /* can't shorten k'th sub-match any more, consider previous one */
1296 0 : k--;
1297 : }
1298 : }
1299 :
1300 : /* all possibilities exhausted */
1301 192 : FREE(endpts);
1302 :
1303 : /*
1304 : * Now consider the possibility that we can match to a zero-length string
1305 : * by using zero repetitions.
1306 : */
1307 192 : if (t->min == 0 && begin == end)
1308 : {
1309 : MDEBUG(("%d: allowing zero matches\n", t->id));
1310 136 : return REG_OKAY;
1311 : }
1312 :
1313 : MDEBUG(("%d: failed\n", t->id));
1314 56 : return REG_NOMATCH;
1315 : }
1316 :
1317 : /*
1318 : * creviterdissect - dissect match for iteration node, shortest-first
1319 : */
1320 : static int /* regexec return code */
1321 14 : creviterdissect(struct vars *v,
1322 : struct subre *t,
1323 : chr *begin, /* beginning of relevant substring */
1324 : chr *end) /* end of same */
1325 : {
1326 : struct dfa *d;
1327 : chr **endpts;
1328 : chr *limit;
1329 : int min_matches;
1330 : size_t max_matches;
1331 : int nverified;
1332 : int k;
1333 : int i;
1334 : int er;
1335 :
1336 : assert(t->op == '*');
1337 : assert(t->child != NULL && t->child->cnfa.nstates > 0);
1338 : assert(t->child->flags & SHORTER);
1339 : assert(begin <= end);
1340 :
1341 : MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1342 :
1343 : /*
1344 : * If zero matches are allowed, and target string is empty, just declare
1345 : * victory. OTOH, if target string isn't empty, zero matches can't work
1346 : * so we pretend the min is 1.
1347 : */
1348 14 : min_matches = t->min;
1349 14 : if (min_matches <= 0)
1350 : {
1351 14 : if (begin == end)
1352 : {
1353 : MDEBUG(("%d: allowing zero matches\n", t->id));
1354 2 : return REG_OKAY;
1355 : }
1356 12 : min_matches = 1;
1357 : }
1358 :
1359 : /*
1360 : * We need workspace to track the endpoints of each sub-match. Normally
1361 : * we consider only nonzero-length sub-matches, so there can be at most
1362 : * end-begin of them. However, if min is larger than that, we will also
1363 : * consider zero-length sub-matches in order to find enough matches.
1364 : *
1365 : * For convenience, endpts[0] contains the "begin" pointer and we store
1366 : * sub-match endpoints in endpts[1..max_matches].
1367 : */
1368 12 : max_matches = end - begin;
1369 12 : if (max_matches > t->max && t->max != DUPINF)
1370 12 : max_matches = t->max;
1371 12 : if (max_matches < min_matches)
1372 0 : max_matches = min_matches;
1373 12 : endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1374 12 : if (endpts == NULL)
1375 0 : return REG_ESPACE;
1376 12 : endpts[0] = begin;
1377 :
1378 12 : d = getsubdfa(v, t->child);
1379 12 : if (ISERR())
1380 : {
1381 0 : FREE(endpts);
1382 0 : return v->err;
1383 : }
1384 :
1385 : /*
1386 : * Our strategy is to first find a set of sub-match endpoints that are
1387 : * valid according to the child node's DFA, and then recursively dissect
1388 : * each sub-match to confirm validity. If any validity check fails,
1389 : * backtrack that sub-match and try again. And, when we next try for a
1390 : * validity check, we need not recheck any successfully verified
1391 : * sub-matches that we didn't move the endpoints of. nverified remembers
1392 : * how many sub-matches are currently known okay.
1393 : */
1394 :
1395 : /* initialize to consider first sub-match */
1396 12 : nverified = 0;
1397 12 : k = 1;
1398 12 : limit = begin;
1399 :
1400 : /* iterate until satisfaction or failure */
1401 16 : while (k > 0)
1402 : {
1403 : /* disallow zero-length match unless necessary to achieve min */
1404 12 : if (limit == endpts[k - 1] &&
1405 12 : limit != end &&
1406 0 : (k >= min_matches || min_matches - k < end - limit))
1407 12 : limit++;
1408 :
1409 : /* if this is the last allowed sub-match, it must reach to the end */
1410 12 : if (k >= max_matches)
1411 12 : limit = end;
1412 :
1413 : /* try to find an endpoint for the k'th sub-match */
1414 12 : endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
1415 : (chr **) NULL, (int *) NULL);
1416 12 : if (ISERR())
1417 : {
1418 0 : FREE(endpts);
1419 0 : return v->err;
1420 : }
1421 12 : if (endpts[k] == NULL)
1422 : {
1423 : /* no match possible, so see if we can lengthen previous one */
1424 0 : k--;
1425 0 : goto backtrack;
1426 : }
1427 : MDEBUG(("%d: working endpoint %d: %ld\n",
1428 : t->id, k, LOFF(endpts[k])));
1429 :
1430 : /* k'th sub-match can no longer be considered verified */
1431 12 : if (nverified >= k)
1432 0 : nverified = k - 1;
1433 :
1434 12 : if (endpts[k] != end)
1435 : {
1436 : /* haven't reached end yet, try another iteration if allowed */
1437 0 : if (k >= max_matches)
1438 : {
1439 : /* must try to lengthen some previous match */
1440 0 : k--;
1441 0 : goto backtrack;
1442 : }
1443 :
1444 0 : k++;
1445 0 : limit = endpts[k - 1];
1446 0 : continue;
1447 : }
1448 :
1449 : /*
1450 : * We've identified a way to divide the string into k sub-matches that
1451 : * works so far as the child DFA can tell. If k is an allowed number
1452 : * of matches, start the slow part: recurse to verify each sub-match.
1453 : * We always have k <= max_matches, needn't check that.
1454 : */
1455 12 : if (k < min_matches)
1456 0 : goto backtrack;
1457 :
1458 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1459 :
1460 20 : for (i = nverified + 1; i <= k; i++)
1461 : {
1462 : /* zap any match data from a non-last iteration */
1463 12 : zaptreesubs(v, t->child);
1464 12 : er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
1465 12 : if (er == REG_OKAY)
1466 : {
1467 8 : nverified = i;
1468 8 : continue;
1469 : }
1470 4 : if (er == REG_NOMATCH)
1471 4 : break;
1472 : /* oops, something failed */
1473 0 : FREE(endpts);
1474 0 : return er;
1475 : }
1476 :
1477 12 : if (i > k)
1478 : {
1479 : /* satisfaction */
1480 : MDEBUG(("%d: successful\n", t->id));
1481 8 : FREE(endpts);
1482 8 : return REG_OKAY;
1483 : }
1484 :
1485 : /* i'th match failed to verify, so backtrack it */
1486 4 : k = i;
1487 :
1488 4 : backtrack:
1489 :
1490 : /*
1491 : * Must consider longer versions of the k'th sub-match.
1492 : */
1493 8 : while (k > 0)
1494 : {
1495 4 : if (endpts[k] < end)
1496 : {
1497 0 : limit = endpts[k] + 1;
1498 : /* break out of backtrack loop, continue the outer one */
1499 0 : break;
1500 : }
1501 : /* can't lengthen k'th sub-match any more, consider previous one */
1502 4 : k--;
1503 : }
1504 : }
1505 :
1506 : /* all possibilities exhausted */
1507 : MDEBUG(("%d: failed\n", t->id));
1508 4 : FREE(endpts);
1509 4 : return REG_NOMATCH;
1510 : }
1511 :
1512 :
1513 :
1514 : #include "rege_dfa.c"
|