Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wparser_def.c
4 : * Default text search parser
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/wparser_def.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include <limits.h>
18 :
19 : #include "catalog/pg_collation.h"
20 : #include "commands/defrem.h"
21 : #include "miscadmin.h"
22 : #include "tsearch/ts_locale.h"
23 : #include "tsearch/ts_public.h"
24 : #include "tsearch/ts_type.h"
25 : #include "tsearch/ts_utils.h"
26 : #include "utils/builtins.h"
27 :
28 :
29 : /* Define me to enable tracing of parser behavior */
30 : /* #define WPARSER_TRACE */
31 :
32 :
33 : /* Output token categories */
/*
 * The category numbers run consecutively from 1 up to LASTNUM.  The
 * tok_alias[] and lex_descr[] tables that follow list one string per
 * category in this same numeric order, with an empty string at index 0.
 */
34 :
35 : #define ASCIIWORD 1
36 : #define WORD_T 2
37 : #define NUMWORD 3
38 : #define EMAIL 4
39 : #define URL_T 5
40 : #define HOST 6
41 : #define SCIENTIFIC 7
42 : #define VERSIONNUMBER 8
43 : #define NUMPARTHWORD 9
44 : #define PARTHWORD 10
45 : #define ASCIIPARTHWORD 11
46 : #define SPACE 12
47 : #define TAG_T 13
48 : #define PROTOCOL 14
49 : #define NUMHWORD 15
50 : #define ASCIIHWORD 16
51 : #define HWORD 17
52 : #define URLPATH 18
53 : #define FILEPATH 19
54 : #define DECIMAL_T 20
55 : #define SIGNEDINT 21
56 : #define UNSIGNEDINT 22
57 : #define XMLENTITY 23
58 :
/* Highest category number defined above (currently equal to XMLENTITY). */
59 : #define LASTNUM 23
60 :
/*
 * Short alias string for each token category.  Entries appear in category
 * number order (ASCIIWORD = 1 ... XMLENTITY = 23); index 0 holds an empty
 * placeholder because no category has the value 0.
 */
61 : static const char *const tok_alias[] = {
62 : "",
63 : "asciiword",
64 : "word",
65 : "numword",
66 : "email",
67 : "url",
68 : "host",
69 : "sfloat",
70 : "version",
71 : "hword_numpart",
72 : "hword_part",
73 : "hword_asciipart",
74 : "blank",
75 : "tag",
76 : "protocol",
77 : "numhword",
78 : "asciihword",
79 : "hword",
80 : "url_path",
81 : "file",
82 : "float",
83 : "int",
84 : "uint",
85 : "entity"
86 : };
87 :
/*
 * Human-readable description of each token category.  Laid out exactly like
 * tok_alias[]: one entry per category number, with an empty placeholder at
 * index 0.
 */
88 : static const char *const lex_descr[] = {
89 : "",
90 : "Word, all ASCII",
91 : "Word, all letters",
92 : "Word, letters and digits",
93 : "Email address",
94 : "URL",
95 : "Host",
96 : "Scientific notation",
97 : "Version number",
98 : "Hyphenated word part, letters and digits",
99 : "Hyphenated word part, all letters",
100 : "Hyphenated word part, all ASCII",
101 : "Space symbols",
102 : "XML tag",
103 : "Protocol head",
104 : "Hyphenated word, letters and digits",
105 : "Hyphenated word, all ASCII",
106 : "Hyphenated word, all letters",
107 : "URL path",
108 : "File or path name",
109 : "Decimal notation",
110 : "Signed integer",
111 : "Unsigned integer",
112 : "XML entity"
113 : };
114 :
115 :
116 : /* Parser states */
/*
 * TPS_Base (value 0) is the state assigned to a freshly created parser by
 * TParserInit() and TParserCopyInit().  TPS_Null is a fake terminating
 * value, not a real state; the action tables also use it as a "tostate"
 * entry (presumably meaning "do not switch state" -- confirm in
 * TParserGet()).
 *
 * NOTE(review): the numeric order of these values looks significant (a
 * per-state lookup table is presumably indexed by it elsewhere in the
 * file), so do not reorder without checking.
 */
117 :
118 : typedef enum
119 : {
120 : TPS_Base = 0,
121 : TPS_InNumWord,
122 : TPS_InAsciiWord,
123 : TPS_InWord,
124 : TPS_InUnsignedInt,
125 : TPS_InSignedIntFirst,
126 : TPS_InSignedInt,
127 : TPS_InSpace,
128 : TPS_InUDecimalFirst,
129 : TPS_InUDecimal,
130 : TPS_InDecimalFirst,
131 : TPS_InDecimal,
132 : TPS_InVerVersion,
133 : TPS_InSVerVersion,
134 : TPS_InVersionFirst,
135 : TPS_InVersion,
136 : TPS_InMantissaFirst,
137 : TPS_InMantissaSign,
138 : TPS_InMantissa,
139 : TPS_InXMLEntityFirst,
140 : TPS_InXMLEntity,
141 : TPS_InXMLEntityNumFirst,
142 : TPS_InXMLEntityNum,
143 : TPS_InXMLEntityHexNumFirst,
144 : TPS_InXMLEntityHexNum,
145 : TPS_InXMLEntityEnd,
146 : TPS_InTagFirst,
147 : TPS_InXMLBegin,
148 : TPS_InTagCloseFirst,
149 : TPS_InTagName,
150 : TPS_InTagBeginEnd,
151 : TPS_InTag,
152 : TPS_InTagEscapeK,
153 : TPS_InTagEscapeKK,
154 : TPS_InTagBackSleshed,
155 : TPS_InTagEnd,
156 : TPS_InCommentFirst,
157 : TPS_InCommentLast,
158 : TPS_InComment,
159 : TPS_InCloseCommentFirst,
160 : TPS_InCloseCommentLast,
161 : TPS_InCommentEnd,
162 : TPS_InHostFirstDomain,
163 : TPS_InHostDomainSecond,
164 : TPS_InHostDomain,
165 : TPS_InPortFirst,
166 : TPS_InPort,
167 : TPS_InHostFirstAN,
168 : TPS_InHost,
169 : TPS_InEmail,
170 : TPS_InFileFirst,
171 : TPS_InFileTwiddle,
172 : TPS_InPathFirst,
173 : TPS_InPathFirstFirst,
174 : TPS_InPathSecond,
175 : TPS_InFile,
176 : TPS_InFileNext,
177 : TPS_InURLPathFirst,
178 : TPS_InURLPathStart,
179 : TPS_InURLPath,
180 : TPS_InFURL,
181 : TPS_InProtocolFirst,
182 : TPS_InProtocolSecond,
183 : TPS_InProtocolEnd,
184 : TPS_InHyphenAsciiWordFirst,
185 : TPS_InHyphenAsciiWord,
186 : TPS_InHyphenWordFirst,
187 : TPS_InHyphenWord,
188 : TPS_InHyphenNumWordFirst,
189 : TPS_InHyphenNumWord,
190 : TPS_InHyphenDigitLookahead,
191 : TPS_InParseHyphen,
192 : TPS_InParseHyphenHyphen,
193 : TPS_InHyphenWordPart,
194 : TPS_InHyphenAsciiWordPart,
195 : TPS_InHyphenNumWordPart,
196 : TPS_InHyphenUnsignedInt,
197 : TPS_Null /* last state (fake value) */
198 : } TParserState;
199 :
200 : /* forward declaration */
201 : struct TParser;
202 :
/*
 * TParserCharTest: predicate evaluated against the parser's current
 * position; returns an int used as a truth value.  All the single-argument
 * p_is* functions in this file have this signature (p_iseq takes an extra
 * char and therefore does not).
 */
203 : typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
204 : * except p_iseq */
/*
 * TParserSpecial: optional hook with side effects on the parser, e.g. the
 * Special* functions in this file.
 */
205 : typedef void (*TParserSpecial) (struct TParser *); /* special handler for
206 : * special cases... */
207 :
/*
 * One row of a per-state action table (see the actionTPS_* arrays).
 *
 *	isclass  - character test for this row; the last row of each table uses
 *			   NULL here
 *	c		 - character literal accompanying p_iseqC rows ('<', '-', ...);
 *			   0 for rows that do not compare against a fixed character.
 *			   Presumably copied into TParser.c before the test runs --
 *			   confirm in TParserGet().
 *	flags	 - OR of the A_* flag bits
 *	tostate  - target state, or TPS_Null
 *	type	 - token category number on A_BINGO rows, 0 elsewhere in the
 *			   tables visible here
 *	special  - optional TParserSpecial hook, or NULL
 */
208 : typedef struct
209 : {
210 : TParserCharTest isclass;
211 : char c;
212 : uint16 flags;
213 : TParserState tostate;
214 : int type;
215 : TParserSpecial special;
216 : } TParserStateActionItem;
217 :
218 : /* Flag bits in TParserStateActionItem.flags */
/*
 * A_NEXT is zero, i.e. it stands for "no flag bit set"; every other flag is
 * a distinct single bit, so flags can be combined with bitwise OR (the
 * action tables contain e.g. A_NEXT | A_CLEAR).  The meaning of each bit is
 * implemented by the code that interprets the action tables, which is not
 * part of this section of the file.
 */
219 : #define A_NEXT 0x0000
220 : #define A_BINGO 0x0001
221 : #define A_POP 0x0002
222 : #define A_PUSH 0x0004
223 : #define A_RERUN 0x0008
224 : #define A_CLEAR 0x0010
225 : #define A_MERGE 0x0020
226 : #define A_CLRALL 0x0040
227 :
/*
 * Snapshot of where the parser is within the input.  Positions form a
 * singly linked stack through "prev": newTParserPosition() links a new
 * entry to the one it was given, and TParserClose()/TParserCopyClose() walk
 * the chain to free it.
 */
228 : typedef struct TParserPosition
229 : {
230 : int posbyte; /* position of parser in bytes */
231 : int poschar; /* position of parser in characters */
232 : int charlen; /* length of current char */
233 : int lenbytetoken; /* length of token-so-far in bytes */
234 : int lenchartoken; /* and in chars */
/* parser state associated with this position */
235 : TParserState state;
/* next-older entry in the position stack, or NULL for the bottom entry */
236 : struct TParserPosition *prev;
/* always reset to NULL by newTParserPosition(); assigned elsewhere */
237 : const TParserStateActionItem *pushedAtAction;
238 : } TParserPosition;
239 :
/*
 * Complete state of one parser instance.
 *
 * A parser made by TParserInit() owns wstr/pgwstr and frees them in
 * TParserClose(); one made by TParserCopyInit() merely points into the
 * original's buffers and must be released with TParserCopyClose().
 */
240 : typedef struct TParser
241 : {
242 : /* string and position information */
243 : char *str; /* multibyte string */
244 : int lenstr; /* length of mbstring */
245 : wchar_t *wstr; /* wide character string */
246 : pg_wchar *pgwstr; /* wide character string for C-locale */
/* true when the encoding's max character length is > 1 (see TParserInit) */
247 : bool usewide;
248 :
249 : /* State of parse */
/* result of pg_database_encoding_max_length() */
250 : int charmaxlen;
/* top of the position stack */
251 : TParserPosition *state;
/* toggled by SpecialTags() for script/style tags; read by p_isignore() */
252 : bool ignore;
/* set by SpecialFURL() and p_ishost(); consumed by p_isstophost() */
253 : bool wanthost;
254 :
255 : /* silly char */
/* the character p_iseqC()/p_isneC() compare the current input byte with */
256 : char c;
257 :
258 : /* out */
259 : char *token;
260 : int lenbytetoken;
261 : int lenchartoken;
262 : int type;
263 : } TParser;
264 :
265 :
266 : /* forward decls here */
267 : static bool TParserGet(TParser *prs);
268 :
269 :
270 : static TParserPosition *
271 10208 : newTParserPosition(TParserPosition *prev)
272 : {
273 10208 : TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
274 :
275 10208 : if (prev)
276 5238 : memcpy(res, prev, sizeof(TParserPosition));
277 : else
278 4970 : memset(res, 0, sizeof(TParserPosition));
279 :
280 10208 : res->prev = prev;
281 :
282 10208 : res->pushedAtAction = NULL;
283 :
284 10208 : return res;
285 : }
286 :
287 : static TParser *
288 4730 : TParserInit(char *str, int len)
289 : {
290 4730 : TParser *prs = (TParser *) palloc0(sizeof(TParser));
291 :
292 4730 : prs->charmaxlen = pg_database_encoding_max_length();
293 4730 : prs->str = str;
294 4730 : prs->lenstr = len;
295 :
296 : /*
297 : * Use wide char code only when max encoding length > 1.
298 : */
299 4730 : if (prs->charmaxlen > 1)
300 : {
301 1574 : pg_locale_t mylocale = 0; /* TODO */
302 :
303 1574 : prs->usewide = true;
304 1574 : if (database_ctype_is_c)
305 : {
306 : /*
307 : * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
308 : * be different from sizeof(wchar_t)
309 : */
310 1574 : prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
311 1574 : pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
312 : }
313 : else
314 : {
315 0 : prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
316 0 : char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
317 : mylocale);
318 : }
319 : }
320 : else
321 3156 : prs->usewide = false;
322 :
323 4730 : prs->state = newTParserPosition(NULL);
324 4730 : prs->state->state = TPS_Base;
325 :
326 : #ifdef WPARSER_TRACE
327 : fprintf(stderr, "parsing \"%.*s\"\n", len, str);
328 : #endif
329 :
330 4730 : return prs;
331 : }
332 :
333 : /*
334 : * As an alternative to a full TParserInit one can create a
335 : * TParserCopy which basically is a regular TParser without a private
336 : * copy of the string - instead it uses the one from another TParser.
337 : * This is useful because at some places TParsers are created
338 : * recursively and the repeated copying around of the strings can
339 : * cause major inefficiency if the source string is long.
340 : * The new parser starts parsing at the original's current position.
341 : *
342 : * Obviously one must not close the original TParser before the copy.
343 : */
344 : static TParser *
345 240 : TParserCopyInit(const TParser *orig)
346 : {
347 240 : TParser *prs = (TParser *) palloc0(sizeof(TParser));
348 :
349 240 : prs->charmaxlen = orig->charmaxlen;
350 240 : prs->str = orig->str + orig->state->posbyte;
351 240 : prs->lenstr = orig->lenstr - orig->state->posbyte;
352 240 : prs->usewide = orig->usewide;
353 :
354 240 : if (orig->pgwstr)
355 80 : prs->pgwstr = orig->pgwstr + orig->state->poschar;
356 240 : if (orig->wstr)
357 0 : prs->wstr = orig->wstr + orig->state->poschar;
358 :
359 240 : prs->state = newTParserPosition(NULL);
360 240 : prs->state->state = TPS_Base;
361 :
362 : #ifdef WPARSER_TRACE
363 : fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
364 : #endif
365 :
366 240 : return prs;
367 : }
368 :
369 :
370 : static void
371 4730 : TParserClose(TParser *prs)
372 : {
373 9460 : while (prs->state)
374 : {
375 4730 : TParserPosition *ptr = prs->state->prev;
376 :
377 4730 : pfree(prs->state);
378 4730 : prs->state = ptr;
379 : }
380 :
381 4730 : if (prs->wstr)
382 0 : pfree(prs->wstr);
383 4730 : if (prs->pgwstr)
384 1574 : pfree(prs->pgwstr);
385 :
386 : #ifdef WPARSER_TRACE
387 : fprintf(stderr, "closing parser\n");
388 : #endif
389 4730 : pfree(prs);
390 4730 : }
391 :
392 : /*
393 : * Close a parser created with TParserCopyInit
394 : */
395 : static void
396 240 : TParserCopyClose(TParser *prs)
397 : {
398 612 : while (prs->state)
399 : {
400 372 : TParserPosition *ptr = prs->state->prev;
401 :
402 372 : pfree(prs->state);
403 372 : prs->state = ptr;
404 : }
405 :
406 : #ifdef WPARSER_TRACE
407 : fprintf(stderr, "closing parser copy\n");
408 : #endif
409 240 : pfree(prs);
410 240 : }
411 :
412 :
413 : /*
414 : * Character-type support functions, equivalent to is* macros, but
415 : * working with any possible encodings and locales. Notes:
416 : * - with multibyte encoding and C-locale isw* function may fail
417 : * or give wrong result.
418 : * - multibyte encoding and C-locale often are used for
419 : * Asian languages.
420 : * - if locale is C then we use pgwstr instead of wstr.
421 : */
422 :
/*
 * p_iswhat(type, nonascii) expands to two static functions:
 *
 *	p_is<type>(prs)    - classify the character at the current position:
 *		- usewide with pgwstr set: code points above 0x7f yield the
 *		  constant "nonascii"; anything else goes through is<type>()
 *		- usewide without pgwstr: isw<type>() on the current wstr element
 *		- otherwise: is<type>() on the current byte of str, read as
 *		  unsigned char
 *	p_isnot<type>(prs) - logical negation of p_is<type>(prs)
 *
 * Wide strings are indexed by state->poschar, the byte string by
 * state->posbyte.
 */
423 : #define p_iswhat(type, nonascii) \
424 : \
425 : static int \
426 : p_is##type(TParser *prs) \
427 : { \
428 : Assert(prs->state); \
429 : if (prs->usewide) \
430 : { \
431 : if (prs->pgwstr) \
432 : { \
433 : unsigned int c = *(prs->pgwstr + prs->state->poschar); \
434 : if (c > 0x7f) \
435 : return nonascii; \
436 : return is##type(c); \
437 : } \
438 : return isw##type(*(prs->wstr + prs->state->poschar)); \
439 : } \
440 : return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
441 : } \
442 : \
443 : static int \
444 : p_isnot##type(TParser *prs) \
445 : { \
446 : return !p_is##type(prs); \
447 : }
448 :
449 : /*
450 : * In C locale with a multibyte encoding, any non-ASCII symbol is considered
451 : * an alpha character, but not a member of other char classes.
452 : */
/* Only alnum and alpha pass 1 as "nonascii"; every other class passes 0. */
453 25122 : p_iswhat(alnum, 1)
454 93772 : p_iswhat(alpha, 1)
455 37132 : p_iswhat(digit, 0)
456 0 : p_iswhat(lower, 0)
457 0 : p_iswhat(print, 0)
458 0 : p_iswhat(punct, 0)
459 678 : p_iswhat(space, 0)
460 0 : p_iswhat(upper, 0)
461 18 : p_iswhat(xdigit, 0)
462 :
463 : /* p_iseq should be used only for ascii symbols */
464 :
465 : static int
466 231368 : p_iseq(TParser *prs, char c)
467 : {
468 : Assert(prs->state);
469 231368 : return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
470 : }
471 :
472 : static int
473 100050 : p_isEOF(TParser *prs)
474 : {
475 : Assert(prs->state);
476 100050 : return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
477 : }
478 :
479 : static int
480 231368 : p_iseqC(TParser *prs)
481 : {
482 231368 : return p_iseq(prs, prs->c);
483 : }
484 :
485 : static int
486 0 : p_isneC(TParser *prs)
487 : {
488 0 : return !p_iseq(prs, prs->c);
489 : }
490 :
491 : static int
492 73460 : p_isascii(TParser *prs)
493 : {
494 73460 : return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
495 : }
496 :
497 : static int
498 73460 : p_isasclet(TParser *prs)
499 : {
500 73460 : return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
501 : }
502 :
503 : static int
504 2658 : p_isurlchar(TParser *prs)
505 : {
506 : char ch;
507 :
508 : /* no non-ASCII need apply */
509 2658 : if (prs->state->charlen != 1)
510 0 : return 0;
511 2658 : ch = *(prs->str + prs->state->posbyte);
512 : /* no spaces or control characters */
513 2658 : if (ch <= 0x20 || ch >= 0x7F)
514 234 : return 0;
515 : /* reject characters disallowed by RFC 3986 */
516 2424 : switch (ch)
517 : {
518 24 : case '"':
519 : case '<':
520 : case '>':
521 : case '\\':
522 : case '^':
523 : case '`':
524 : case '{':
525 : case '|':
526 : case '}':
527 24 : return 0;
528 : }
529 2400 : return 1;
530 : }
531 :
532 :
/*
 * Reference every generated/auxiliary character-test function once so that
 * the compiler does not complain about unused static functions.  This
 * function exists only for that purpose; the results are discarded.
 */
void		_make_compiler_happy(void);
void
_make_compiler_happy(void)
{
	(void) p_isalnum(NULL);
	(void) p_isnotalnum(NULL);
	(void) p_isalpha(NULL);
	(void) p_isnotalpha(NULL);
	(void) p_isdigit(NULL);
	(void) p_isnotdigit(NULL);
	(void) p_islower(NULL);
	(void) p_isnotlower(NULL);
	(void) p_isprint(NULL);
	(void) p_isnotprint(NULL);
	(void) p_ispunct(NULL);
	(void) p_isnotpunct(NULL);
	(void) p_isspace(NULL);
	(void) p_isnotspace(NULL);
	(void) p_isupper(NULL);
	(void) p_isnotupper(NULL);
	(void) p_isxdigit(NULL);
	(void) p_isnotxdigit(NULL);
	(void) p_isEOF(NULL);
	(void) p_iseqC(NULL);
	(void) p_isneC(NULL);
}
560 :
561 :
562 : static void
563 252 : SpecialTags(TParser *prs)
564 : {
565 252 : switch (prs->state->lenchartoken)
566 : {
567 6 : case 8: /* </script */
568 6 : if (pg_strncasecmp(prs->token, "</script", 8) == 0)
569 6 : prs->ignore = false;
570 6 : break;
571 24 : case 7: /* <script || </style */
572 24 : if (pg_strncasecmp(prs->token, "</style", 7) == 0)
573 0 : prs->ignore = false;
574 24 : else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
575 6 : prs->ignore = true;
576 24 : break;
577 18 : case 6: /* <style */
578 18 : if (pg_strncasecmp(prs->token, "<style", 6) == 0)
579 0 : prs->ignore = true;
580 18 : break;
581 204 : default:
582 204 : break;
583 : }
584 252 : }
585 :
586 : static void
587 132 : SpecialFURL(TParser *prs)
588 : {
589 132 : prs->wanthost = true;
590 132 : prs->state->posbyte -= prs->state->lenbytetoken;
591 132 : prs->state->poschar -= prs->state->lenchartoken;
592 132 : }
593 :
594 : static void
595 36 : SpecialHyphen(TParser *prs)
596 : {
597 36 : prs->state->posbyte -= prs->state->lenbytetoken;
598 36 : prs->state->poschar -= prs->state->lenchartoken;
599 36 : }
600 :
601 : static void
602 0 : SpecialVerVersion(TParser *prs)
603 : {
604 0 : prs->state->posbyte -= prs->state->lenbytetoken;
605 0 : prs->state->poschar -= prs->state->lenchartoken;
606 0 : prs->state->lenbytetoken = 0;
607 0 : prs->state->lenchartoken = 0;
608 0 : }
609 :
610 : static int
611 480 : p_isstophost(TParser *prs)
612 : {
613 480 : if (prs->wanthost)
614 : {
615 204 : prs->wanthost = false;
616 204 : return 1;
617 : }
618 276 : return 0;
619 : }
620 :
621 : static int
622 36062 : p_isignore(TParser *prs)
623 : {
624 36062 : return (prs->ignore) ? 1 : 0;
625 : }
626 :
627 : static int
628 90 : p_ishost(TParser *prs)
629 : {
630 90 : TParser *tmpprs = TParserCopyInit(prs);
631 90 : int res = 0;
632 :
633 90 : tmpprs->wanthost = true;
634 :
635 : /*
636 : * Check stack depth before recursing. (Since TParserGet() doesn't
637 : * normally recurse, we put the cost of checking here not there.)
638 : */
639 90 : check_stack_depth();
640 :
641 90 : if (TParserGet(tmpprs) && tmpprs->type == HOST)
642 : {
643 72 : prs->state->posbyte += tmpprs->lenbytetoken;
644 72 : prs->state->poschar += tmpprs->lenchartoken;
645 72 : prs->state->lenbytetoken += tmpprs->lenbytetoken;
646 72 : prs->state->lenchartoken += tmpprs->lenchartoken;
647 72 : prs->state->charlen = tmpprs->state->charlen;
648 72 : res = 1;
649 : }
650 90 : TParserCopyClose(tmpprs);
651 :
652 90 : return res;
653 : }
654 :
655 : static int
656 150 : p_isURLPath(TParser *prs)
657 : {
658 150 : TParser *tmpprs = TParserCopyInit(prs);
659 150 : int res = 0;
660 :
661 150 : tmpprs->state = newTParserPosition(tmpprs->state);
662 150 : tmpprs->state->state = TPS_InURLPathFirst;
663 :
664 : /*
665 : * Check stack depth before recursing. (Since TParserGet() doesn't
666 : * normally recurse, we put the cost of checking here not there.)
667 : */
668 150 : check_stack_depth();
669 :
670 150 : if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
671 : {
672 132 : prs->state->posbyte += tmpprs->lenbytetoken;
673 132 : prs->state->poschar += tmpprs->lenchartoken;
674 132 : prs->state->lenbytetoken += tmpprs->lenbytetoken;
675 132 : prs->state->lenchartoken += tmpprs->lenchartoken;
676 132 : prs->state->charlen = tmpprs->state->charlen;
677 132 : res = 1;
678 : }
679 150 : TParserCopyClose(tmpprs);
680 :
681 150 : return res;
682 : }
683 :
684 : /*
685 : * returns true if current character has zero display length or
686 : * it's a special sign in several languages. Such characters
687 : * aren't a word-breaker although they aren't an isalpha.
688 : * In beginning of word they aren't a part of it.
689 : */
690 : static int
691 8724 : p_isspecial(TParser *prs)
692 : {
693 : /*
694 : * pg_dsplen could return -1 which means error or control character
695 : */
696 8724 : if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
697 0 : return 1;
698 :
699 : /*
700 : * Unicode Characters in the 'Mark, Spacing Combining' Category That
701 : * characters are not alpha although they are not breakers of word too.
702 : * Check that only in utf encoding, because other encodings aren't
703 : * supported by postgres or even exists.
704 : */
705 8724 : if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
706 : {
707 : static const pg_wchar strange_letter[] = {
708 : /*
709 : * use binary search, so elements should be ordered
710 : */
711 : 0x0903, /* DEVANAGARI SIGN VISARGA */
712 : 0x093E, /* DEVANAGARI VOWEL SIGN AA */
713 : 0x093F, /* DEVANAGARI VOWEL SIGN I */
714 : 0x0940, /* DEVANAGARI VOWEL SIGN II */
715 : 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
716 : 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
717 : 0x094B, /* DEVANAGARI VOWEL SIGN O */
718 : 0x094C, /* DEVANAGARI VOWEL SIGN AU */
719 : 0x0982, /* BENGALI SIGN ANUSVARA */
720 : 0x0983, /* BENGALI SIGN VISARGA */
721 : 0x09BE, /* BENGALI VOWEL SIGN AA */
722 : 0x09BF, /* BENGALI VOWEL SIGN I */
723 : 0x09C0, /* BENGALI VOWEL SIGN II */
724 : 0x09C7, /* BENGALI VOWEL SIGN E */
725 : 0x09C8, /* BENGALI VOWEL SIGN AI */
726 : 0x09CB, /* BENGALI VOWEL SIGN O */
727 : 0x09CC, /* BENGALI VOWEL SIGN AU */
728 : 0x09D7, /* BENGALI AU LENGTH MARK */
729 : 0x0A03, /* GURMUKHI SIGN VISARGA */
730 : 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
731 : 0x0A3F, /* GURMUKHI VOWEL SIGN I */
732 : 0x0A40, /* GURMUKHI VOWEL SIGN II */
733 : 0x0A83, /* GUJARATI SIGN VISARGA */
734 : 0x0ABE, /* GUJARATI VOWEL SIGN AA */
735 : 0x0ABF, /* GUJARATI VOWEL SIGN I */
736 : 0x0AC0, /* GUJARATI VOWEL SIGN II */
737 : 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
738 : 0x0ACB, /* GUJARATI VOWEL SIGN O */
739 : 0x0ACC, /* GUJARATI VOWEL SIGN AU */
740 : 0x0B02, /* ORIYA SIGN ANUSVARA */
741 : 0x0B03, /* ORIYA SIGN VISARGA */
742 : 0x0B3E, /* ORIYA VOWEL SIGN AA */
743 : 0x0B40, /* ORIYA VOWEL SIGN II */
744 : 0x0B47, /* ORIYA VOWEL SIGN E */
745 : 0x0B48, /* ORIYA VOWEL SIGN AI */
746 : 0x0B4B, /* ORIYA VOWEL SIGN O */
747 : 0x0B4C, /* ORIYA VOWEL SIGN AU */
748 : 0x0B57, /* ORIYA AU LENGTH MARK */
749 : 0x0BBE, /* TAMIL VOWEL SIGN AA */
750 : 0x0BBF, /* TAMIL VOWEL SIGN I */
751 : 0x0BC1, /* TAMIL VOWEL SIGN U */
752 : 0x0BC2, /* TAMIL VOWEL SIGN UU */
753 : 0x0BC6, /* TAMIL VOWEL SIGN E */
754 : 0x0BC7, /* TAMIL VOWEL SIGN EE */
755 : 0x0BC8, /* TAMIL VOWEL SIGN AI */
756 : 0x0BCA, /* TAMIL VOWEL SIGN O */
757 : 0x0BCB, /* TAMIL VOWEL SIGN OO */
758 : 0x0BCC, /* TAMIL VOWEL SIGN AU */
759 : 0x0BD7, /* TAMIL AU LENGTH MARK */
760 : 0x0C01, /* TELUGU SIGN CANDRABINDU */
761 : 0x0C02, /* TELUGU SIGN ANUSVARA */
762 : 0x0C03, /* TELUGU SIGN VISARGA */
763 : 0x0C41, /* TELUGU VOWEL SIGN U */
764 : 0x0C42, /* TELUGU VOWEL SIGN UU */
765 : 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
766 : 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
767 : 0x0C82, /* KANNADA SIGN ANUSVARA */
768 : 0x0C83, /* KANNADA SIGN VISARGA */
769 : 0x0CBE, /* KANNADA VOWEL SIGN AA */
770 : 0x0CC0, /* KANNADA VOWEL SIGN II */
771 : 0x0CC1, /* KANNADA VOWEL SIGN U */
772 : 0x0CC2, /* KANNADA VOWEL SIGN UU */
773 : 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
774 : 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
775 : 0x0CC7, /* KANNADA VOWEL SIGN EE */
776 : 0x0CC8, /* KANNADA VOWEL SIGN AI */
777 : 0x0CCA, /* KANNADA VOWEL SIGN O */
778 : 0x0CCB, /* KANNADA VOWEL SIGN OO */
779 : 0x0CD5, /* KANNADA LENGTH MARK */
780 : 0x0CD6, /* KANNADA AI LENGTH MARK */
781 : 0x0D02, /* MALAYALAM SIGN ANUSVARA */
782 : 0x0D03, /* MALAYALAM SIGN VISARGA */
783 : 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
784 : 0x0D3F, /* MALAYALAM VOWEL SIGN I */
785 : 0x0D40, /* MALAYALAM VOWEL SIGN II */
786 : 0x0D46, /* MALAYALAM VOWEL SIGN E */
787 : 0x0D47, /* MALAYALAM VOWEL SIGN EE */
788 : 0x0D48, /* MALAYALAM VOWEL SIGN AI */
789 : 0x0D4A, /* MALAYALAM VOWEL SIGN O */
790 : 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
791 : 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
792 : 0x0D57, /* MALAYALAM AU LENGTH MARK */
793 : 0x0D82, /* SINHALA SIGN ANUSVARAYA */
794 : 0x0D83, /* SINHALA SIGN VISARGAYA */
795 : 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
796 : 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
797 : 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
798 : 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
799 : 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
800 : 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
801 : 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
802 : 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
803 : 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
804 : * AELA-PILLA */
805 : 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
806 : 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
807 : 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
808 : 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
809 : 0x0F3E, /* TIBETAN SIGN YAR TSHES */
810 : 0x0F3F, /* TIBETAN SIGN MAR TSHES */
811 : 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
812 : 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
813 : 0x102C, /* MYANMAR VOWEL SIGN AA */
814 : 0x1031, /* MYANMAR VOWEL SIGN E */
815 : 0x1038, /* MYANMAR SIGN VISARGA */
816 : 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
817 : 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
818 : 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
819 : 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
820 : 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
821 : 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
822 : 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
823 : 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
824 : 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
825 : 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
826 : 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
827 : 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
828 : 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
829 : 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
830 : 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
831 : 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
832 : 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
833 : 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
834 : 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
835 : 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
836 : 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
837 : 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
838 : 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
839 : 0x17B6, /* KHMER VOWEL SIGN AA */
840 : 0x17BE, /* KHMER VOWEL SIGN OE */
841 : 0x17BF, /* KHMER VOWEL SIGN YA */
842 : 0x17C0, /* KHMER VOWEL SIGN IE */
843 : 0x17C1, /* KHMER VOWEL SIGN E */
844 : 0x17C2, /* KHMER VOWEL SIGN AE */
845 : 0x17C3, /* KHMER VOWEL SIGN AI */
846 : 0x17C4, /* KHMER VOWEL SIGN OO */
847 : 0x17C5, /* KHMER VOWEL SIGN AU */
848 : 0x17C7, /* KHMER SIGN REAHMUK */
849 : 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
850 : 0x1923, /* LIMBU VOWEL SIGN EE */
851 : 0x1924, /* LIMBU VOWEL SIGN AI */
852 : 0x1925, /* LIMBU VOWEL SIGN OO */
853 : 0x1926, /* LIMBU VOWEL SIGN AU */
854 : 0x1929, /* LIMBU SUBJOINED LETTER YA */
855 : 0x192A, /* LIMBU SUBJOINED LETTER RA */
856 : 0x192B, /* LIMBU SUBJOINED LETTER WA */
857 : 0x1930, /* LIMBU SMALL LETTER KA */
858 : 0x1931, /* LIMBU SMALL LETTER NGA */
859 : 0x1933, /* LIMBU SMALL LETTER TA */
860 : 0x1934, /* LIMBU SMALL LETTER NA */
861 : 0x1935, /* LIMBU SMALL LETTER PA */
862 : 0x1936, /* LIMBU SMALL LETTER MA */
863 : 0x1937, /* LIMBU SMALL LETTER RA */
864 : 0x1938, /* LIMBU SMALL LETTER LA */
865 : 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
866 : 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
867 : 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
868 : 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
869 : 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
870 : 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
871 : 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
872 : 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
873 : 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
874 : 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
875 : 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
876 : 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
877 : 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
878 : 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
879 : 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
880 : 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
881 : 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
882 : 0x19C8, /* NEW TAI LUE TONE MARK-1 */
883 : 0x19C9, /* NEW TAI LUE TONE MARK-2 */
884 : 0x1A19, /* BUGINESE VOWEL SIGN E */
885 : 0x1A1A, /* BUGINESE VOWEL SIGN O */
886 : 0x1A1B, /* BUGINESE VOWEL SIGN AE */
887 : 0x1B04, /* BALINESE SIGN BISAH */
888 : 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
889 : 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
890 : 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
891 : 0x1B3E, /* BALINESE VOWEL SIGN TALING */
892 : 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
893 : 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
894 : 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
895 : 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
896 : 0x1B44, /* BALINESE ADEG ADEG */
897 : 0x1B82, /* SUNDANESE SIGN PANGWISAD */
898 : 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
899 : 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
900 : 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
901 : 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
902 : 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
903 : 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
904 : 0x1C26, /* LEPCHA VOWEL SIGN AA */
905 : 0x1C27, /* LEPCHA VOWEL SIGN I */
906 : 0x1C28, /* LEPCHA VOWEL SIGN O */
907 : 0x1C29, /* LEPCHA VOWEL SIGN OO */
908 : 0x1C2A, /* LEPCHA VOWEL SIGN U */
909 : 0x1C2B, /* LEPCHA VOWEL SIGN UU */
910 : 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
911 : 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
912 : 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
913 : 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
914 : 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
915 : 0xA880, /* SAURASHTRA SIGN ANUSVARA */
916 : 0xA881, /* SAURASHTRA SIGN VISARGA */
917 : 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
918 : 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
919 : 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
920 : 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
921 : 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
922 : 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
923 : 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
924 : 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
925 : 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
926 : 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
927 : 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
928 : 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
929 : 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
930 : 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
931 : 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
932 : 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
933 : 0xA952, /* REJANG CONSONANT SIGN H */
934 : 0xA953, /* REJANG VIRAMA */
935 : 0xAA2F, /* CHAM VOWEL SIGN O */
936 : 0xAA30, /* CHAM VOWEL SIGN AI */
937 : 0xAA33, /* CHAM CONSONANT SIGN YA */
938 : 0xAA34, /* CHAM CONSONANT SIGN RA */
939 : 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
940 : };
941 2908 : const pg_wchar *StopLow = strange_letter,
942 2908 : *StopHigh = strange_letter + lengthof(strange_letter),
943 : *StopMiddle;
944 : pg_wchar c;
945 :
946 2908 : if (prs->pgwstr)
947 2908 : c = *(prs->pgwstr + prs->state->poschar);
948 : else
949 0 : c = (pg_wchar) *(prs->wstr + prs->state->poschar);
950 :
951 26172 : while (StopLow < StopHigh)
952 : {
953 23264 : StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
954 23264 : if (*StopMiddle == c)
955 0 : return 1;
956 23264 : else if (*StopMiddle < c)
957 0 : StopLow = StopMiddle + 1;
958 : else
959 23264 : StopHigh = StopMiddle;
960 : }
961 : }
962 :
963 8724 : return 0;
964 : }
965 :
966 : /*
967 : * Table of state/action of parser
968 : */
969 :
/*
 * Action rows for TPS_Base, the state every parser starts in.
 *
 * Each table begins with a p_isEOF row and ends with a row whose test is
 * NULL; presumably rows are tried top to bottom and the NULL row is the
 * fallback -- confirm in TParserGet().  Here '<', the sign characters, '&',
 * '~', '/' and '.' use A_PUSH, letters and digits go straight to a word or
 * number state, and both the p_isignore row and the fallback row lead to
 * TPS_InSpace.
 */
970 : static const TParserStateActionItem actionTPS_Base[] = {
971 : {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
972 : {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
973 : {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
974 : {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
975 : {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
976 : {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
977 : {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
978 : {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
979 : {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
980 : {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
981 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
982 : {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
983 : {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
984 : };
985 :
986 :
/*
 * Action rows for TPS_InNumWord.  Alphanumeric and "special" characters
 * keep the parser in TPS_InNumWord; '@', '/', '.' and '-' use A_PUSH.  Both
 * the EOF row and the fallback row are A_BINGO rows carrying token type
 * NUMWORD with target state TPS_Base.
 */
987 : static const TParserStateActionItem actionTPS_InNumWord[] = {
988 : {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
989 : {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
990 : {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
991 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
992 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
993 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
994 : {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
995 : {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
996 : };
997 :
/*
 * Action rows for TPS_InAsciiWord.  Note that '.', '-' and a digit each
 * appear in two consecutive rows with different target states; presumably
 * the second row is what is tried when the pushed alternative of the first
 * fails -- confirm in TParserGet().  Both the EOF row and the fallback row
 * are A_BINGO rows carrying token type ASCIIWORD with target state
 * TPS_Base.
 */
998 : static const TParserStateActionItem actionTPS_InAsciiWord[] = {
999 : {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
1000 : {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1001 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1002 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1003 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1004 : {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1005 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1006 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1007 : {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1008 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1009 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1010 : {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1011 : {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1012 : {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1013 : {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1014 : };
1015 :
1016 : static const TParserStateActionItem actionTPS_InWord[] = {
1017 : {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1018 : {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1019 : {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1020 : {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1021 : {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1022 : {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1023 : };
1024 :
/*
 * Integer states.  TPS_InUnsignedInt carries UNSIGNEDINT ("uint") on its
 * A_BINGO rules, TPS_InSignedInt carries SIGNEDINT ("int").
 *
 * In TPS_InUnsignedInt '.' is tested twice: TPS_InHostFirstDomain is pushed
 * first, and the TPS_InUDecimalFirst rule is reached only when TParserGet
 * resumes after a POP at the entry following the one that pushed.
 */
1025 : static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
1026 : {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1027 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1028 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1029 : {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1030 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1031 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1032 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1033 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1034 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1035 : {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1036 : {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1037 : {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1038 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1039 : {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1040 : };
1041 :
/*
 * TPS_InSignedIntFirst: pushed from TPS_Base on '-' or '+'.  Only a digit
 * continues (to TPS_InSignedInt); EOF and everything else take A_POP.
 */
1042 : static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
1043 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1044 : {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1045 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1046 : };
1047 :
/*
 * TPS_InSignedInt: unlike TPS_InUnsignedInt this table has no host, email,
 * file or numword rules; only '.', 'e' and 'E' extend the number.
 */
1048 : static const TParserStateActionItem actionTPS_InSignedInt[] = {
1049 : {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1050 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1051 : {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1052 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1053 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1054 : {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1055 : };
1056 :
/*
 * TPS_InSpace: entered from TPS_Base via the p_isignore rule or the
 * default rule.  Its A_BINGO rules carry SPACE ("blank").
 *
 * The explicit A_BINGO rules for '<', '-', '+', '&' and '/' sit ahead of
 * the p_isnotalnum rule, presumably so that these characters end the blank
 * token and are re-examined in TPS_Base instead of being absorbed here
 * (NOTE(review): depends on p_isnotalnum, defined elsewhere).
 */
1057 : static const TParserStateActionItem actionTPS_InSpace[] = {
1058 : {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1059 : {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1060 : {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1061 : {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1062 : {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1063 : {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1064 : {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1065 : {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1066 : {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1067 : };
1068 :
/*
 * Decimal, version-number and exponent states.
 *
 * Token categories carried by the A_BINGO rules: DECIMAL_T ("float") in
 * TPS_InUDecimal and TPS_InDecimal, VERSIONNUMBER ("version") in
 * TPS_InVersion, SCIENTIFIC ("sfloat") in TPS_InMantissa.
 *
 * The "...First" and "...Sign" tables are entered right after the
 * introducing character ('.', 'e'/'E', or a sign) and take A_POP unless
 * the expected character follows.
 *
 * TPS_InUDecimalFirst: pushed from TPS_InUnsignedInt on '.'.
 */
1069 : static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
1070 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1071 : {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1072 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1073 : };
1074 :
/* A further '.' after an unsigned decimal pushes the version-number attempt */
1075 : static const TParserStateActionItem actionTPS_InUDecimal[] = {
1076 : {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1077 : {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1078 : {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1079 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1080 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1081 : {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1082 : };
1083 :
/* TPS_InDecimalFirst: pushed from TPS_InSignedInt on '.' */
1084 : static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
1085 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1086 : {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1087 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1088 : };
1089 :
/*
 * Signed counterpart of TPS_InUDecimal.  Here a further '.' pushes
 * TPS_InVerVersion rather than TPS_InVersionFirst.
 */
1090 : static const TParserStateActionItem actionTPS_InDecimal[] = {
1091 : {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1092 : {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1093 : {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1094 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1095 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1096 : {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1097 : };
1098 :
/*
 * TPS_InVerVersion / TPS_InSVerVersion: reached from TPS_InDecimal on a
 * second '.'.  When a digit follows, the SpecialVerVersion handler is
 * invoked and the A_RERUN rule moves to TPS_InSVerVersion, whose digit rule
 * carries SPACE and continues in TPS_InUnsignedInt.
 * NOTE(review): the exact effect depends on SpecialVerVersion, which is
 * defined outside this section.
 */
1099 : static const TParserStateActionItem actionTPS_InVerVersion[] = {
1100 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1101 : {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1102 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1103 : };
1104 :
1105 : static const TParserStateActionItem actionTPS_InSVerVersion[] = {
1106 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1107 : {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1108 : {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1109 : };
1110 :
1111 :
/* TPS_InVersionFirst: pushed on '.' from TPS_InUDecimal and TPS_InVersion */
1112 : static const TParserStateActionItem actionTPS_InVersionFirst[] = {
1113 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1114 : {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1115 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1116 : };
1117 :
1118 : static const TParserStateActionItem actionTPS_InVersion[] = {
1119 : {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1120 : {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1121 : {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1122 : {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1123 : };
1124 :
/*
 * TPS_InMantissaFirst: pushed on 'e' or 'E' from the integer and decimal
 * states.  Accepts a digit directly, or a '+'/'-' that must then be
 * followed by a digit (TPS_InMantissaSign).
 */
1125 : static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
1126 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1127 : {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1128 : {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1129 : {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1130 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1131 : };
1132 :
1133 : static const TParserStateActionItem actionTPS_InMantissaSign[] = {
1134 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1135 : {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1136 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1137 : };
1138 :
1139 : static const TParserStateActionItem actionTPS_InMantissa[] = {
1140 : {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1141 : {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1142 : {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1143 : };
1144 :
/*
 * XML entity states.  TPS_InXMLEntityFirst is pushed from TPS_Base on '&'.
 *
 * Three forms are followed: a name (first character matching p_isasclet,
 * ':' or '_'), "#" plus digits, and "#x"/"#X" plus characters matching
 * p_isxdigit.  Each form must end in ';', which leads to
 * TPS_InXMLEntityEnd; that table's single rule carries XMLENTITY
 * ("entity").  Every other table takes A_POP on EOF or on an unexpected
 * character.
 */
1145 : static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
1146 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1147 : {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1148 : {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1149 : {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1150 : {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1151 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1152 : };
1153 :
/* Named entity: later characters may also be digits, '.' or '-' */
1154 : static const TParserStateActionItem actionTPS_InXMLEntity[] = {
1155 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1156 : {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1157 : {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1158 : {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1159 : {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1160 : {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1161 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1162 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1163 : };
1164 :
/* After "&#": the 'x'/'X' rules precede the digit rule */
1165 : static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
1166 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1167 : {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1168 : {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1169 : {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1170 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1171 : };
1172 :
/* After "&#x": at least one p_isxdigit character is required */
1173 : static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
1174 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1175 : {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1176 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1177 : };
1178 :
1179 : static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
1180 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1181 : {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1182 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1183 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1184 : };
1185 :
1186 : static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
1187 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1188 : {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1189 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1190 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1191 : };
1192 :
/* Unconditional: whatever follows the ';', the entity token is complete */
1193 : static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
1194 : {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1195 : };
1196 :
/*
 * Tag states.  TPS_InTagFirst is pushed from TPS_Base on '<' and dispatches
 * on the next character: '/' (closing tag), '!' (comment or <!DOCTYPE),
 * '?' (<?x...), or the start of a tag name (p_isasclet, ':' or '_').
 *
 * A tag is completed only through TPS_InTagEnd, whose single rule carries
 * TAG_T ("tag") with A_BINGO | A_CLRALL.  The other tables take A_POP on
 * EOF or on a character they have no rule for.
 *
 * The SpecialTags handler is attached to the rules that see '>' or
 * whitespace after the tag name or inside the tag body.
 * NOTE(review): SpecialTags is defined outside this section.
 */
1197 : static const TParserStateActionItem actionTPS_InTagFirst[] = {
1198 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1199 : {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1200 : {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1201 : {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1202 : {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1203 : {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1204 : {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1205 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1206 : };
1207 :
1208 : static const TParserStateActionItem actionTPS_InXMLBegin[] = {
1209 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1210 : /* <?xml ... */
1211 : /* XXX do we want states for the m and l?  Right now this accepts <?xZ */
1212 : {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1213 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1214 : };
1215 :
/* After "</": only a p_isasclet character starts the tag name here */
1216 : static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
1217 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1218 : {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1219 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1220 : };
1221 :
1222 : static const TParserStateActionItem actionTPS_InTagName[] = {
1223 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1224 : /* <br/> case */
1225 : {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1226 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1227 : {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1228 : {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1229 : {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1230 : {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1231 : {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1232 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1233 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1234 : };
1235 :
/* After the '/' of a self-closing tag: only '>' is accepted */
1236 : static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
1237 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1238 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1239 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1240 : };
1241 :
/*
 * Tag body (after the name).  Only the characters listed below are
 * accepted outside quotes; anything else takes A_POP.  A single or double
 * quote switches to TPS_InTagEscapeK or TPS_InTagEscapeKK respectively.
 */
1242 : static const TParserStateActionItem actionTPS_InTag[] = {
1243 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1244 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1245 : {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1246 : {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1247 : {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1248 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1249 : {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1250 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1251 : {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1252 : {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1253 : {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1254 : {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1255 : {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1256 : {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1257 : {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1258 : {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1259 : {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1260 : {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1261 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1262 : };
1263 :
/*
 * Inside a single-quoted value: any character is accepted (default rule)
 * until the closing quote returns to TPS_InTag.  A backslash pushes
 * TPS_InTagBackSleshed.
 */
1264 : static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
1265 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1266 : {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1267 : {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1268 : {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1269 : };
1270 :
/* Same as TPS_InTagEscapeK, for a double-quoted value */
1271 : static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
1272 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1273 : {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1274 : {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1275 : {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1276 : };
1277 :
/*
 * Character following a backslash inside a quoted value: anything except
 * EOF takes the A_MERGE rule.  Because this state is pushed from either
 * quote state, one table serves both.
 */
1278 : static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
1279 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1280 : {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1281 : };
1282 :
1283 : static const TParserStateActionItem actionTPS_InTagEnd[] = {
1284 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1285 : };
1286 :
/*
 * Comment states.  TPS_InCommentFirst is pushed from TPS_InTagFirst after
 * "<!".  Two '-' characters lead into TPS_InComment; 'D' or 'd' instead
 * hands over to TPS_InTag (the <!DOCTYPE case).
 *
 * Inside a comment any character is accepted.  The closing sequence is
 * tracked by TPS_InCloseCommentFirst (one '-') and TPS_InCloseCommentLast
 * (two '-'); a '>' in the latter reaches TPS_InCommentEnd, whose single
 * rule carries TAG_T ("tag").  EOF takes A_POP in every state before
 * TPS_InCommentEnd.
 */
1287 : static const TParserStateActionItem actionTPS_InCommentFirst[] = {
1288 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1289 : {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1290 : /* <!DOCTYPE ...> */
1291 : {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1292 : {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1293 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1294 : };
1295 :
/* After "<!-": a second '-' is required to open the comment */
1296 : static const TParserStateActionItem actionTPS_InCommentLast[] = {
1297 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1298 : {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1299 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1300 : };
1301 :
1302 : static const TParserStateActionItem actionTPS_InComment[] = {
1303 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1304 : {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1305 : {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1306 : };
1307 :
/* One '-' seen: anything but another '-' goes back to TPS_InComment */
1308 : static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
1309 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1310 : {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1311 : {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1312 : };
1313 :
/*
 * "--" seen: '>' ends the comment; a further '-' has its own rule ahead of
 * the default, which goes back to TPS_InComment.
 */
1314 : static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
1315 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1316 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1317 : {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1318 : {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1319 : };
1320 :
1321 : static const TParserStateActionItem actionTPS_InCommentEnd[] = {
1322 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1323 : };
1324 :
/*
 * Host, port and email states.
 *
 * These are entered by A_PUSH from the word and integer states (and from
 * each other): TPS_InHostFirstDomain on '.', TPS_InHostFirstAN on '-' or
 * '_', TPS_InHost on a digit or letter, TPS_InEmail on '@'.
 *
 * HOST ("host") is carried only by rules in TPS_InHostDomain and
 * TPS_InPort; TPS_InHostFirstDomain, TPS_InHostDomainSecond,
 * TPS_InHostFirstAN and TPS_InHost all end in A_POP when nothing matches.
 * Since TPS_InHostDomain is reachable only through TPS_InHostFirstDomain
 * and TPS_InHostDomainSecond, a host needs a '.' followed by at least two
 * p_isasclet characters.
 */
1325 : static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
1326 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1327 : {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1328 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1329 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1330 : };
1331 :
1332 : static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1333 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1334 : {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1335 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1336 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1337 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1338 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1339 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1340 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1341 : };
1342 :
/*
 * p_isdigit appears twice.  A digit always matches the A_PUSH rule first,
 * so the later A_POP rule is reached only when TParserGet resumes after a
 * POP at the entry following one of the pushing rules above it.  The
 * p_isstophost rule carries HOST and continues in TPS_InURLPathStart
 * instead of TPS_Base.
 */
1343 : static const TParserStateActionItem actionTPS_InHostDomain[] = {
1344 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1345 : {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1346 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1347 : {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1348 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1349 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1350 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1351 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1352 : {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1353 : {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1354 : {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1355 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1356 : };
1357 :
/* TPS_InPortFirst: pushed from TPS_InHostDomain on ':'; needs a digit */
1358 : static const TParserStateActionItem actionTPS_InPortFirst[] = {
1359 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1360 : {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1361 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1362 : };
1363 :
/* The port digits are part of the HOST token carried by these rules */
1364 : static const TParserStateActionItem actionTPS_InPort[] = {
1365 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1366 : {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1367 : {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1368 : {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1369 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1370 : };
1371 :
/* After '-' or '_': a digit or p_isasclet character must follow */
1372 : static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1373 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1374 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1375 : {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1376 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1377 : };
1378 :
1379 : static const TParserStateActionItem actionTPS_InHost[] = {
1380 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1381 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1382 : {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1383 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1384 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1385 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1386 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1387 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1388 : };
1389 :
/*
 * TPS_InEmail: entered just after '@'.  The p_ishost rule carries EMAIL
 * ("email"); there is no character-by-character scan in this table.
 * NOTE(review): p_isstophost and p_ishost are defined outside this
 * section; how much input p_ishost covers cannot be seen from here.
 */
1390 : static const TParserStateActionItem actionTPS_InEmail[] = {
1391 : {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1392 : {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1393 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1394 : };
1395 :
/*
 * File and path states.  FILEPATH ("file") is carried by rules in
 * TPS_InFile and TPS_InPathSecond.
 *
 * TPS_InFileFirst is the state just after a '/'.  It is pushed on '/' from
 * TPS_Base, the word and integer states and TPS_InFile, and is also the
 * target of the '/' rules in the twiddle and path states below.
 */
1396 : static const TParserStateActionItem actionTPS_InFileFirst[] = {
1397 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1398 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1399 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1400 : {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1401 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1402 : {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1403 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1404 : };
1405 :
/* After '~' (pushed from TPS_Base and TPS_InFileFirst) */
1406 : static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1407 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1408 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1409 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1410 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1411 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1412 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1413 : };
1414 :
/* After "/." : a name character, a second '.', or '/' may follow */
1415 : static const TParserStateActionItem actionTPS_InPathFirst[] = {
1416 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1417 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1418 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1419 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1420 : {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1421 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1422 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1423 : };
1424 :
/*
 * After a leading '.' (pushed from TPS_Base).  Unlike TPS_InPathFirst, a
 * name character is not accepted here: only a second '.' or '/'.
 */
1425 : static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1426 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1427 : {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1428 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1429 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1430 : };
1431 :
/*
 * After "..".  '/' is tested twice: the first rule pushes a continuation
 * into TPS_InFileFirst, and the second (A_BINGO) rule is reached only when
 * TParserGet resumes after a POP at the entry following the one that
 * pushed.
 */
1432 : static const TParserStateActionItem actionTPS_InPathSecond[] = {
1433 : {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1434 : {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1435 : {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1436 : {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1437 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1438 : };
1439 :
/*
 * Body of a file name.  '.' and '/' are pushed (TPS_InFileNext,
 * TPS_InFileFirst) rather than consumed directly, so each needs an
 * acceptable following character to become part of the token.
 */
1440 : static const TParserStateActionItem actionTPS_InFile[] = {
1441 : {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1442 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1443 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1444 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1445 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1446 : {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1447 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1448 : {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1449 : };
1450 :
/*
 * After a '.' inside a name (pushed from TPS_InNumWord, TPS_InAsciiWord
 * and TPS_InFile): a letter, digit or '_' must follow, else A_POP.
 */
1451 : static const TParserStateActionItem actionTPS_InFileNext[] = {
1452 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1453 : {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1454 : {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1455 : {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1456 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1457 : };
1458 :
/*
 * URL path states.  URLPATH ("url_path") is carried by TPS_InURLPath,
 * URL_T ("url") by TPS_InFURL.
 *
 * NOTE(review): no rule table in this file section transitions to
 * TPS_InURLPathFirst; presumably it is entered from code elsewhere in the
 * file -- confirm before changing it.
 */
1459 : static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1460 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1461 : {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1462 : {NULL, 0, A_POP, TPS_Null, 0, NULL},
1463 : };
1464 :
/*
 * Entered after a HOST token ended on a character matching p_isstophost
 * (see TPS_InHostDomain and TPS_InPort).  The single unconditional rule
 * moves on to TPS_InURLPath whatever that character is.
 */
1465 : static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1466 : {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1467 : };
1468 :
1469 : static const TParserStateActionItem actionTPS_InURLPath[] = {
1470 : {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1471 : {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1472 : {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1473 : };
1474 :
/*
 * TPS_InFURL: pushed on '/' from TPS_InHostDomain and TPS_InPort.  When
 * p_isURLPath matches, the SpecialFURL handler is called and the rule
 * carries URL_T; otherwise A_POP.
 * NOTE(review): p_isURLPath and SpecialFURL are defined outside this
 * section.
 */
1475 : static const TParserStateActionItem actionTPS_InFURL[] = {
1476 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1477 : {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1478 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1479 : };
1480 :
/*
 * Protocol states.  TPS_InProtocolFirst is pushed from TPS_InAsciiWord on
 * ':'.  Exactly two '/' characters must follow; anything else (or EOF)
 * takes A_POP.  TPS_InProtocolEnd then carries PROTOCOL ("protocol")
 * unconditionally.
 */
1481 : static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1482 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1483 : {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1484 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1485 : };
1486 :
1487 : static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1488 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1489 : {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1490 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1491 : };
1492 :
1493 : static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1494 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1495 : };
1496 :
/*
 * Hyphenated-word states (whole word).
 *
 * The "...First" tables are pushed on '-' from the matching word state
 * (TPS_InAsciiWord, TPS_InWord, TPS_InNumWord) and from the matching
 * hyphen state below, and take A_POP unless a letter or digit follows.
 *
 * The A_BINGO | A_CLRALL rules carry ASCIIHWORD ("asciihword"), HWORD
 * ("hword") or NUMHWORD ("numhword"), call SpecialHyphen, and continue in
 * TPS_InParseHyphen rather than TPS_Base.
 * NOTE(review): SpecialHyphen is defined outside this section.
 */
1497 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1498 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1499 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1500 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1501 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1502 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1503 : };
1504 :
1505 : static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1506 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1507 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1508 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1509 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1510 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1511 : {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1512 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1513 : };
1514 :
1515 : static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1516 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1517 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1518 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1519 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1520 : };
1521 :
1522 : static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1523 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1524 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1525 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1526 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1527 : {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1528 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1529 : };
1530 :
1531 : static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1532 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1533 : {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1534 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1535 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1536 : };
1537 :
1538 : static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1539 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1540 : {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1541 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1542 : {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1543 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1544 : };
1545 :
/*
 * Reached when a digit directly follows the hyphen.  Further digits stay
 * here; the part joins the hyphenated word (TPS_InHyphenNumWord) only once
 * a p_isalpha or p_isspecial character is seen.  A part consisting of
 * digits alone therefore ends in A_POP.
 */
1546 : static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1547 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1548 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1549 : {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1550 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1551 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1552 : };
1553 :
/*
 * Hyphenated-word states (individual parts).
 *
 * TPS_InParseHyphen is the state selected by the whole-word A_BINGO rules
 * of the TPS_InHyphen*Word tables.  From here each part is reported
 * separately as ASCIIPARTHWORD ("hword_asciipart"), PARTHWORD
 * ("hword_part") or NUMPARTHWORD ("hword_numpart"), and each hyphen between
 * parts as SPACE via TPS_InParseHyphenHyphen.
 *
 * The part tables return to TPS_InParseHyphen from their default rule and
 * to TPS_Base from their EOF rule.  TPS_InParseHyphen itself hands back to
 * TPS_Base with A_RERUN on EOF or on any character it has no rule for.
 */
1554 : static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1555 : {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1556 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1557 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1558 : {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1559 : {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1560 : {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1561 : };
1562 :
/* After a '-' between parts: SPACE is carried only if a part follows */
1563 : static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1564 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1565 : {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1566 : {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1567 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1568 : };
1569 :
1570 : static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1571 : {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1572 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1573 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1574 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1575 : {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1576 : };
1577 :
1578 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1579 : {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1580 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1581 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1582 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1583 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1584 : {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1585 : };
1586 :
1587 : static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1588 : {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1589 : {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1590 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1591 : {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1592 : };
1593 :
/*
 * Pushed from TPS_InParseHyphen on a digit.  Digits are consumed here; the
 * part becomes a NUMPARTHWORD candidate only when a p_isalpha or
 * p_isspecial character follows, otherwise A_POP.
 */
1594 : static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1595 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1596 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1597 : {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1598 : {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1599 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1600 : };
1601 :
1602 :
1603 : /*
1604 : * main table of per-state parser actions
1605 : */
1606 : typedef struct
1607 : {
1608 : const TParserStateActionItem *action; /* first rule of this state's rule table */
1609 : TParserState state; /* state this entry describes; TParserGet asserts it equals the array index */
1610 : #ifdef WPARSER_TRACE
1611 : const char *state_name; /* state's name as text, printed by the WPARSER_TRACE output */
1612 : #endif
1613 : } TParserStateAction;
1614 :
/*
 * Build the Actions[] entry for "state".  CppConcat(action,state) names the
 * state's rule table (e.g. actionTPS_Base for TPS_Base).  With
 * WPARSER_TRACE defined the entry additionally gets CppAsString(state) for
 * the state_name field.
 */
1615 : #ifdef WPARSER_TRACE
1616 : #define TPARSERSTATEACTION(state) \
1617 : { CppConcat(action,state), state, CppAsString(state) }
1618 : #else
1619 : #define TPARSERSTATEACTION(state) \
1620 : { CppConcat(action,state), state }
1621 : #endif
1622 :
1623 : /*
1624 : * order must be the same as in typedef enum {} TParserState!!
1625 : */
1626 :
/*
 * Actions[] is indexed by TParserState: TParserGet fetches
 * Actions[prs->state->state].action to find the rules for the current
 * state.  An entry in the wrong position is caught only in assert-enabled
 * builds, where TParserGet checks that Actions[i].state == i.
 * TPS_Null has no entry; TParserGet asserts the current state is below it.
 */
1627 : static const TParserStateAction Actions[] = {
1628 : TPARSERSTATEACTION(TPS_Base),
1629 : TPARSERSTATEACTION(TPS_InNumWord),
1630 : TPARSERSTATEACTION(TPS_InAsciiWord),
1631 : TPARSERSTATEACTION(TPS_InWord),
1632 : TPARSERSTATEACTION(TPS_InUnsignedInt),
1633 : TPARSERSTATEACTION(TPS_InSignedIntFirst),
1634 : TPARSERSTATEACTION(TPS_InSignedInt),
1635 : TPARSERSTATEACTION(TPS_InSpace),
1636 : TPARSERSTATEACTION(TPS_InUDecimalFirst),
1637 : TPARSERSTATEACTION(TPS_InUDecimal),
1638 : TPARSERSTATEACTION(TPS_InDecimalFirst),
1639 : TPARSERSTATEACTION(TPS_InDecimal),
1640 : TPARSERSTATEACTION(TPS_InVerVersion),
1641 : TPARSERSTATEACTION(TPS_InSVerVersion),
1642 : TPARSERSTATEACTION(TPS_InVersionFirst),
1643 : TPARSERSTATEACTION(TPS_InVersion),
1644 : TPARSERSTATEACTION(TPS_InMantissaFirst),
1645 : TPARSERSTATEACTION(TPS_InMantissaSign),
1646 : TPARSERSTATEACTION(TPS_InMantissa),
1647 : TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1648 : TPARSERSTATEACTION(TPS_InXMLEntity),
1649 : TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1650 : TPARSERSTATEACTION(TPS_InXMLEntityNum),
1651 : TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1652 : TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1653 : TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1654 : TPARSERSTATEACTION(TPS_InTagFirst),
1655 : TPARSERSTATEACTION(TPS_InXMLBegin),
1656 : TPARSERSTATEACTION(TPS_InTagCloseFirst),
1657 : TPARSERSTATEACTION(TPS_InTagName),
1658 : TPARSERSTATEACTION(TPS_InTagBeginEnd),
1659 : TPARSERSTATEACTION(TPS_InTag),
1660 : TPARSERSTATEACTION(TPS_InTagEscapeK),
1661 : TPARSERSTATEACTION(TPS_InTagEscapeKK),
1662 : TPARSERSTATEACTION(TPS_InTagBackSleshed),
1663 : TPARSERSTATEACTION(TPS_InTagEnd),
1664 : TPARSERSTATEACTION(TPS_InCommentFirst),
1665 : TPARSERSTATEACTION(TPS_InCommentLast),
1666 : TPARSERSTATEACTION(TPS_InComment),
1667 : TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1668 : TPARSERSTATEACTION(TPS_InCloseCommentLast),
1669 : TPARSERSTATEACTION(TPS_InCommentEnd),
1670 : TPARSERSTATEACTION(TPS_InHostFirstDomain),
1671 : TPARSERSTATEACTION(TPS_InHostDomainSecond),
1672 : TPARSERSTATEACTION(TPS_InHostDomain),
1673 : TPARSERSTATEACTION(TPS_InPortFirst),
1674 : TPARSERSTATEACTION(TPS_InPort),
1675 : TPARSERSTATEACTION(TPS_InHostFirstAN),
1676 : TPARSERSTATEACTION(TPS_InHost),
1677 : TPARSERSTATEACTION(TPS_InEmail),
1678 : TPARSERSTATEACTION(TPS_InFileFirst),
1679 : TPARSERSTATEACTION(TPS_InFileTwiddle),
1680 : TPARSERSTATEACTION(TPS_InPathFirst),
1681 : TPARSERSTATEACTION(TPS_InPathFirstFirst),
1682 : TPARSERSTATEACTION(TPS_InPathSecond),
1683 : TPARSERSTATEACTION(TPS_InFile),
1684 : TPARSERSTATEACTION(TPS_InFileNext),
1685 : TPARSERSTATEACTION(TPS_InURLPathFirst),
1686 : TPARSERSTATEACTION(TPS_InURLPathStart),
1687 : TPARSERSTATEACTION(TPS_InURLPath),
1688 : TPARSERSTATEACTION(TPS_InFURL),
1689 : TPARSERSTATEACTION(TPS_InProtocolFirst),
1690 : TPARSERSTATEACTION(TPS_InProtocolSecond),
1691 : TPARSERSTATEACTION(TPS_InProtocolEnd),
1692 : TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1693 : TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1694 : TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1695 : TPARSERSTATEACTION(TPS_InHyphenWord),
1696 : TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1697 : TPARSERSTATEACTION(TPS_InHyphenNumWord),
1698 : TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1699 : TPARSERSTATEACTION(TPS_InParseHyphen),
1700 : TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1701 : TPARSERSTATEACTION(TPS_InHyphenWordPart),
1702 : TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1703 : TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1704 : TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1705 : };
1706 :
1707 :
1708 : static bool
1709 28876 : TParserGet(TParser *prs)
1710 : {
1711 28876 : const TParserStateActionItem *item = NULL;
1712 :
1713 28876 : CHECK_FOR_INTERRUPTS();
1714 :
1715 : Assert(prs->state);
1716 :
1717 28876 : if (prs->state->posbyte >= prs->lenstr)
1718 4730 : return false;
1719 :
1720 24146 : prs->token = prs->str + prs->state->posbyte;
1721 24146 : prs->state->pushedAtAction = NULL;
1722 :
1723 : /* look at string */
1724 103170 : while (prs->state->posbyte <= prs->lenstr)
1725 : {
1726 103170 : if (prs->state->posbyte == prs->lenstr)
1727 4880 : prs->state->charlen = 0;
1728 : else
1729 131040 : prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1730 32750 : pg_mblen(prs->str + prs->state->posbyte);
1731 :
1732 : Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1733 : Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1734 : Assert(Actions[prs->state->state].state == prs->state->state);
1735 :
1736 103170 : if (prs->state->pushedAtAction)
1737 : {
1738 : /* After a POP, pick up at the next test */
1739 2592 : item = prs->state->pushedAtAction + 1;
1740 2592 : prs->state->pushedAtAction = NULL;
1741 : }
1742 : else
1743 : {
1744 100578 : item = Actions[prs->state->state].action;
1745 : Assert(item != NULL);
1746 : }
1747 :
1748 : /* find action by character class */
1749 555468 : while (item->isclass)
1750 : {
1751 524124 : prs->c = item->c;
1752 524124 : if (item->isclass(prs) != 0)
1753 71826 : break;
1754 452298 : item++;
1755 : }
1756 :
1757 : #ifdef WPARSER_TRACE
1758 : {
1759 : TParserPosition *ptr;
1760 :
1761 : fprintf(stderr, "state ");
1762 : /* indent according to stack depth */
1763 : for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1764 : fprintf(stderr, " ");
1765 : fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1766 : if (prs->state->posbyte < prs->lenstr)
1767 : fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1768 : else
1769 : fprintf(stderr, "at EOF");
1770 : fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1771 : (int) (item - Actions[prs->state->state].action),
1772 : (item->flags & A_BINGO) ? " BINGO" : "",
1773 : (item->flags & A_POP) ? " POP" : "",
1774 : (item->flags & A_PUSH) ? " PUSH" : "",
1775 : (item->flags & A_RERUN) ? " RERUN" : "",
1776 : (item->flags & A_CLEAR) ? " CLEAR" : "",
1777 : (item->flags & A_MERGE) ? " MERGE" : "",
1778 : (item->flags & A_CLRALL) ? " CLRALL" : "",
1779 : (item->tostate != TPS_Null) ? " tostate " : "",
1780 : (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1781 : (item->type > 0) ? " type " : "",
1782 : tok_alias[item->type]);
1783 : }
1784 : #endif
1785 :
1786 : /* call special handler if exists */
1787 103170 : if (item->special)
1788 420 : item->special(prs);
1789 :
1790 : /* BINGO, token is found */
1791 103170 : if (item->flags & A_BINGO)
1792 : {
1793 : Assert(item->type > 0);
1794 24146 : prs->lenbytetoken = prs->state->lenbytetoken;
1795 24146 : prs->lenchartoken = prs->state->lenchartoken;
1796 24146 : prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1797 24146 : prs->type = item->type;
1798 : }
1799 :
1800 : /* do various actions by flags */
1801 103170 : if (item->flags & A_POP)
1802 : { /* pop stored state in stack */
1803 2610 : TParserPosition *ptr = prs->state->prev;
1804 :
1805 2610 : pfree(prs->state);
1806 2610 : prs->state = ptr;
1807 : Assert(prs->state);
1808 : }
1809 100560 : else if (item->flags & A_PUSH)
1810 : { /* push (store) state in stack */
1811 5088 : prs->state->pushedAtAction = item; /* remember where we push */
1812 5088 : prs->state = newTParserPosition(prs->state);
1813 : }
1814 95472 : else if (item->flags & A_CLEAR)
1815 : { /* clear previous pushed state */
1816 : TParserPosition *ptr;
1817 :
1818 : Assert(prs->state->prev);
1819 498 : ptr = prs->state->prev->prev;
1820 498 : pfree(prs->state->prev);
1821 498 : prs->state->prev = ptr;
1822 : }
1823 94974 : else if (item->flags & A_CLRALL)
1824 : { /* clear all previous pushed state */
1825 : TParserPosition *ptr;
1826 :
1827 2778 : while (prs->state->prev)
1828 : {
1829 1998 : ptr = prs->state->prev->prev;
1830 1998 : pfree(prs->state->prev);
1831 1998 : prs->state->prev = ptr;
1832 : }
1833 : }
1834 94194 : else if (item->flags & A_MERGE)
1835 : { /* merge posinfo with current and pushed state */
1836 0 : TParserPosition *ptr = prs->state;
1837 :
1838 : Assert(prs->state->prev);
1839 0 : prs->state = prs->state->prev;
1840 :
1841 0 : prs->state->posbyte = ptr->posbyte;
1842 0 : prs->state->poschar = ptr->poschar;
1843 0 : prs->state->charlen = ptr->charlen;
1844 0 : prs->state->lenbytetoken = ptr->lenbytetoken;
1845 0 : prs->state->lenchartoken = ptr->lenchartoken;
1846 0 : pfree(ptr);
1847 : }
1848 :
1849 : /* set new state if pointed */
1850 103170 : if (item->tostate != TPS_Null)
1851 66154 : prs->state->state = item->tostate;
1852 :
1853 : /* check for go away */
1854 103170 : if ((item->flags & A_BINGO) ||
1855 79024 : (prs->state->posbyte >= prs->lenstr &&
1856 0 : (item->flags & A_RERUN) == 0))
1857 : break;
1858 :
1859 : /* go to beginning of loop if we should rerun or we just restore state */
1860 79024 : if (item->flags & (A_RERUN | A_POP))
1861 2634 : continue;
1862 :
1863 : /* move forward */
1864 76390 : if (prs->state->charlen)
1865 : {
1866 76390 : prs->state->posbyte += prs->state->charlen;
1867 76390 : prs->state->lenbytetoken += prs->state->charlen;
1868 76390 : prs->state->poschar++;
1869 76390 : prs->state->lenchartoken++;
1870 : }
1871 : }
1872 :
1873 24146 : return (item && (item->flags & A_BINGO));
1874 : }
1875 :
1876 : Datum
1877 5600 : prsd_lextype(PG_FUNCTION_ARGS)
1878 : {
1879 5600 : LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1880 : int i;
1881 :
1882 134400 : for (i = 1; i <= LASTNUM; i++)
1883 : {
1884 128800 : descr[i - 1].lexid = i;
1885 128800 : descr[i - 1].alias = pstrdup(tok_alias[i]);
1886 128800 : descr[i - 1].descr = pstrdup(lex_descr[i]);
1887 : }
1888 :
1889 5600 : descr[LASTNUM].lexid = 0;
1890 :
1891 5600 : PG_RETURN_POINTER(descr);
1892 : }
1893 :
1894 : Datum
1895 4730 : prsd_start(PG_FUNCTION_ARGS)
1896 : {
1897 4730 : PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1898 : }
1899 :
1900 : Datum
1901 28636 : prsd_nexttoken(PG_FUNCTION_ARGS)
1902 : {
1903 28636 : TParser *p = (TParser *) PG_GETARG_POINTER(0);
1904 28636 : char **t = (char **) PG_GETARG_POINTER(1);
1905 28636 : int *tlen = (int *) PG_GETARG_POINTER(2);
1906 :
1907 28636 : if (!TParserGet(p))
1908 4730 : PG_RETURN_INT32(0);
1909 :
1910 23906 : *t = p->token;
1911 23906 : *tlen = p->lenbytetoken;
1912 :
1913 23906 : PG_RETURN_INT32(p->type);
1914 : }
1915 :
1916 : Datum
1917 4730 : prsd_end(PG_FUNCTION_ARGS)
1918 : {
1919 4730 : TParser *p = (TParser *) PG_GETARG_POINTER(0);
1920 :
1921 4730 : TParserClose(p);
1922 4730 : PG_RETURN_VOID();
1923 : }
1924 :
1925 :
1926 : /*
1927 : * ts_headline support begins here
1928 : */
1929 :
/*
 * Token type classification macros.
 *
 * Each takes one of the output token category codes and yields a truth
 * value.  Arguments are evaluated more than once, so pass only
 * side-effect-free expressions.
 */

/* markup, protocol prefixes, whitespace and XML entities */
#define TS_IDIGNORE(x) \
	( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )

/* types whose words mark_fragment flags "replace" (non-highlightall mode) */
#define HLIDREPLACE(x) \
	( (x)==TAG_T )

/* types whose words mark_fragment flags "skip" (non-highlightall mode) */
#define HLIDSKIP(x) \
	( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )

/* types whose words mark_fragment flags "skip" in highlightall mode */
#define XMLHLIDSKIP(x) \
	( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )

/* types that are not counted as words when measuring headline length */
#define NONWORDTOKEN(x) \
	( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )

/* types that a headline should preferably not end on */
#define NOENDTOKEN(x) \
	( NONWORDTOKEN(x) || \
	  (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || \
	  (x)==SIGNEDINT || (x)==UNSIGNEDINT || \
	  TS_IDIGNORE(x) )

/*
 * Helpers for headline selection.  Both expect the expansion site to have
 * "HeadlineParsedText *prs" (the parsed text) and "int shortword" (the
 * "short word" length limit) in scope.
 */

/* A word is interesting if it is a search term and not a repeat */
#define INTERESTINGWORD(j) \
	(prs->words[j].item && !prs->words[j].repeated)

/*
 * A word is a bad place to end a headline if it is a no-end token type or
 * no longer than shortword -- unless it is interesting.
 */
#define BADENDPOINT(j) \
	((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
	 !INTERESTINGWORD(j))
1952 :
1953 : typedef struct
1954 : {
1955 : /* one cover (well, really one fragment) for mark_hl_fragments */
1956 : int32 startpos; /* fragment's starting word index */
1957 : int32 endpos; /* ending word index (inclusive) */
1958 : int32 poslen; /* number of interesting words */
1959 : int32 curlen; /* total number of words */
1960 : bool chosen; /* chosen? */
1961 : bool excluded; /* excluded? */
1962 : } CoverPos;
1963 :
1964 : typedef struct
1965 : {
1966 : /* callback data for checkcondition_HL */
1967 : HeadlineWordEntry *words;
1968 : int len;
1969 : } hlCheck;
1970 :
1971 :
1972 : /*
1973 : * TS_execute callback for matching a tsquery operand to headline words
1974 : *
1975 : * Note: it's tempting to report words[] indexes as pos values to save
1976 : * searching in hlCover; but that would screw up phrase matching, which
1977 : * expects to measure distances in lexemes not tokens.
1978 : */
1979 : static TSTernaryValue
1980 1000 : checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
1981 : {
1982 1000 : hlCheck *checkval = (hlCheck *) opaque;
1983 : int i;
1984 :
1985 : /* scan words array for matching items */
1986 25450 : for (i = 0; i < checkval->len; i++)
1987 : {
1988 24650 : if (checkval->words[i].item == val)
1989 : {
1990 : /* if data == NULL, don't need to report positions */
1991 874 : if (!data)
1992 200 : return TS_YES;
1993 :
1994 674 : if (!data->pos)
1995 : {
1996 476 : data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1997 476 : data->allocated = true;
1998 476 : data->npos = 1;
1999 476 : data->pos[0] = checkval->words[i].pos;
2000 : }
2001 198 : else if (data->pos[data->npos - 1] < checkval->words[i].pos)
2002 : {
2003 198 : data->pos[data->npos++] = checkval->words[i].pos;
2004 : }
2005 : }
2006 : }
2007 :
2008 800 : if (data && data->npos > 0)
2009 476 : return TS_YES;
2010 :
2011 324 : return TS_NO;
2012 : }
2013 :
2014 : /*
2015 : * hlCover: try to find a substring of prs' word list that satisfies query
2016 : *
2017 : * locations is the result of TS_execute_locations() for the query.
2018 : * We use this to identify plausible subranges of the query.
2019 : *
2020 : * *nextpos is the lexeme position (NOT word index) to start the search
2021 : * at. Caller should initialize this to zero. If successful, we'll
2022 : * advance it to the next place to search at.
2023 : *
2024 : * On success, sets *p to first word index and *q to last word index of the
2025 : * cover substring, and returns true.
2026 : *
2027 : * The result is a minimal cover, in the sense that both *p and *q will be
2028 : * words used in the query.
2029 : */
2030 : static bool
2031 562 : hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
2032 : int *nextpos, int *p, int *q)
2033 : {
2034 562 : int pos = *nextpos;
2035 :
2036 : /* This loop repeats when our selected word-range fails the query */
2037 : for (;;)
2038 60 : {
2039 : int posb,
2040 : pose;
2041 : ListCell *lc;
2042 :
2043 : /*
2044 : * For each AND'ed query term or phrase, find its first occurrence at
2045 : * or after pos; set pose to the maximum of those positions.
2046 : *
2047 : * We need not consider ORs or NOTs here; see the comments for
2048 : * TS_execute_locations(). Rechecking the match with TS_execute(),
2049 : * below, will deal with any ensuing imprecision.
2050 : */
2051 622 : pose = -1;
2052 966 : foreach(lc, locations)
2053 : {
2054 466 : ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2055 466 : int first = -1;
2056 :
2057 792 : for (int i = 0; i < pdata->npos; i++)
2058 : {
2059 : /* For phrase matches, use the ending lexeme */
2060 670 : int endp = pdata->pos[i];
2061 :
2062 670 : if (endp >= pos)
2063 : {
2064 344 : first = endp;
2065 344 : break;
2066 : }
2067 : }
2068 466 : if (first < 0)
2069 122 : return false; /* no more matches for this term */
2070 344 : if (first > pose)
2071 326 : pose = first;
2072 : }
2073 :
2074 500 : if (pose < 0)
2075 246 : return false; /* we only get here if empty list */
2076 :
2077 : /*
2078 : * Now, for each AND'ed query term or phrase, find its last occurrence
2079 : * at or before pose; set posb to the minimum of those positions.
2080 : *
2081 : * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2082 : * posb + 1 below.
2083 : */
2084 254 : posb = INT_MAX - 1;
2085 586 : foreach(lc, locations)
2086 : {
2087 332 : ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2088 332 : int last = -1;
2089 :
2090 494 : for (int i = pdata->npos - 1; i >= 0; i--)
2091 : {
2092 : /* For phrase matches, use the starting lexeme */
2093 494 : int startp = pdata->pos[i] - pdata->width;
2094 :
2095 494 : if (startp <= pose)
2096 : {
2097 332 : last = startp;
2098 332 : break;
2099 : }
2100 : }
2101 332 : if (last < posb)
2102 272 : posb = last;
2103 : }
2104 :
2105 : /*
2106 : * We could end up with posb to the left of pos, in case some phrase
2107 : * match crosses pos. Try the match starting at pos anyway, since the
2108 : * result of TS_execute_locations is imprecise for phrase matches OR'd
2109 : * with plain matches; that is, if the query is "(A <-> B) | C" then C
2110 : * could match at pos even though the phrase match would have to
2111 : * extend to the left of pos.
2112 : */
2113 254 : posb = Max(posb, pos);
2114 :
2115 : /* This test probably always succeeds, but be paranoid */
2116 254 : if (posb <= pose)
2117 : {
2118 : /*
2119 : * posb .. pose is now the shortest, earliest-after-pos range of
2120 : * lexeme positions containing all the query terms. It will
2121 : * contain all phrase matches, too, except in the corner case
2122 : * described just above.
2123 : *
2124 : * Now convert these lexeme positions to indexes in prs->words[].
2125 : */
2126 254 : int idxb = -1;
2127 254 : int idxe = -1;
2128 :
2129 11624 : for (int i = 0; i < prs->curwords; i++)
2130 : {
2131 11496 : if (prs->words[i].item == NULL)
2132 10612 : continue;
2133 884 : if (idxb < 0 && prs->words[i].pos >= posb)
2134 254 : idxb = i;
2135 884 : if (prs->words[i].pos <= pose)
2136 758 : idxe = i;
2137 : else
2138 126 : break;
2139 : }
2140 :
2141 : /* This test probably always succeeds, but be paranoid */
2142 254 : if (idxb >= 0 && idxe >= idxb)
2143 : {
2144 : /*
2145 : * Finally, check that the selected range satisfies the query.
2146 : * This should succeed in all simple cases; but odd cases
2147 : * involving non-top-level NOT conditions or phrase matches
2148 : * OR'd with other things could fail, since the result of
2149 : * TS_execute_locations doesn't fully represent such things.
2150 : */
2151 : hlCheck ch;
2152 :
2153 254 : ch.words = &(prs->words[idxb]);
2154 254 : ch.len = idxe - idxb + 1;
2155 254 : if (TS_execute(GETQUERY(query), &ch,
2156 : TS_EXEC_EMPTY, checkcondition_HL))
2157 : {
2158 : /* Match! Advance *nextpos and return the word range. */
2159 194 : *nextpos = posb + 1;
2160 194 : *p = idxb;
2161 194 : *q = idxe;
2162 194 : return true;
2163 : }
2164 : }
2165 : }
2166 :
2167 : /*
2168 : * Advance pos and try again. Any later workable match must start
2169 : * beyond posb.
2170 : */
2171 60 : pos = posb + 1;
2172 : }
2173 : /* Can't get here, but stupider compilers complain if we leave it off */
2174 : return false;
2175 : }
2176 :
2177 : /*
2178 : * Apply suitable highlight marking to words selected by headline selector
2179 : *
2180 : * The words from startpos to endpos inclusive are marked per highlightall
2181 : */
2182 : static void
2183 386 : mark_fragment(HeadlineParsedText *prs, bool highlightall,
2184 : int startpos, int endpos)
2185 : {
2186 : int i;
2187 :
2188 5654 : for (i = startpos; i <= endpos; i++)
2189 : {
2190 5268 : if (prs->words[i].item)
2191 500 : prs->words[i].selected = 1;
2192 5268 : if (!highlightall)
2193 : {
2194 5022 : if (HLIDREPLACE(prs->words[i].type))
2195 0 : prs->words[i].replace = 1;
2196 5022 : else if (HLIDSKIP(prs->words[i].type))
2197 0 : prs->words[i].skip = 1;
2198 : }
2199 : else
2200 : {
2201 246 : if (XMLHLIDSKIP(prs->words[i].type))
2202 6 : prs->words[i].skip = 1;
2203 : }
2204 :
2205 5268 : prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2206 : }
2207 386 : }
2208 :
2209 : /*
2210 : * split a cover substring into fragments not longer than max_words
2211 : *
2212 : * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2213 : * substring. They are updated to hold the bounds of the next fragment.
2214 : *
2215 : * *curlen and *poslen are set to the fragment's length, in words and
2216 : * interesting words respectively.
2217 : */
2218 : static void
2219 36 : get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2220 : int *curlen, int *poslen, int max_words)
2221 : {
2222 : int i;
2223 :
2224 : /*
2225 : * Objective: select a fragment of words between startpos and endpos such
2226 : * that it has at most max_words and both ends have query words. If the
2227 : * startpos and endpos are the endpoints of the cover and the cover has
2228 : * fewer words than max_words, then this function should just return the
2229 : * cover
2230 : */
2231 : /* first move startpos to an item */
2232 888 : for (i = *startpos; i <= *endpos; i++)
2233 : {
2234 888 : *startpos = i;
2235 888 : if (INTERESTINGWORD(i))
2236 36 : break;
2237 : }
2238 : /* cut endpos to have only max_words */
2239 36 : *curlen = 0;
2240 36 : *poslen = 0;
2241 960 : for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2242 : {
2243 924 : if (!NONWORDTOKEN(prs->words[i].type))
2244 480 : *curlen += 1;
2245 924 : if (INTERESTINGWORD(i))
2246 54 : *poslen += 1;
2247 : }
2248 : /* if the cover was cut then move back endpos to a query item */
2249 36 : if (*endpos > i)
2250 : {
2251 12 : *endpos = i;
2252 840 : for (i = *endpos; i >= *startpos; i--)
2253 : {
2254 840 : *endpos = i;
2255 840 : if (INTERESTINGWORD(i))
2256 12 : break;
2257 828 : if (!NONWORDTOKEN(prs->words[i].type))
2258 408 : *curlen -= 1;
2259 : }
2260 : }
2261 36 : }
2262 :
2263 : /*
2264 : * Headline selector used when MaxFragments > 0
2265 : *
2266 : * Note: in this mode, highlightall is disregarded for phrase selection;
2267 : * it only controls presentation details.
2268 : */
2269 : static void
2270 30 : mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations,
2271 : bool highlightall,
2272 : int shortword, int min_words,
2273 : int max_words, int max_fragments)
2274 : {
2275 : int32 poslen,
2276 : curlen,
2277 : i,
2278 : f,
2279 30 : num_f = 0;
2280 : int32 stretch,
2281 : maxstretch,
2282 : posmarker;
2283 :
2284 30 : int32 startpos = 0,
2285 30 : endpos = 0,
2286 30 : nextpos = 0,
2287 30 : p = 0,
2288 30 : q = 0;
2289 :
2290 30 : int32 numcovers = 0,
2291 30 : maxcovers = 32;
2292 :
2293 : int32 minI,
2294 : minwords,
2295 : maxitems;
2296 : CoverPos *covers;
2297 :
2298 30 : covers = palloc(maxcovers * sizeof(CoverPos));
2299 :
2300 : /* get all covers */
2301 54 : while (hlCover(prs, query, locations, &nextpos, &p, &q))
2302 : {
2303 24 : startpos = p;
2304 24 : endpos = q;
2305 :
2306 : /*
2307 : * Break the cover into smaller fragments such that each fragment has
2308 : * at most max_words. Also ensure that each end of each fragment is a
2309 : * query word. This will allow us to stretch the fragment in either
2310 : * direction
2311 : */
2312 :
2313 60 : while (startpos <= endpos)
2314 : {
2315 36 : get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2316 36 : if (numcovers >= maxcovers)
2317 : {
2318 0 : maxcovers *= 2;
2319 0 : covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2320 : }
2321 36 : covers[numcovers].startpos = startpos;
2322 36 : covers[numcovers].endpos = endpos;
2323 36 : covers[numcovers].curlen = curlen;
2324 36 : covers[numcovers].poslen = poslen;
2325 36 : covers[numcovers].chosen = false;
2326 36 : covers[numcovers].excluded = false;
2327 36 : numcovers++;
2328 36 : startpos = endpos + 1;
2329 36 : endpos = q;
2330 : }
2331 : }
2332 :
2333 : /* choose best covers */
2334 66 : for (f = 0; f < max_fragments; f++)
2335 : {
2336 48 : maxitems = 0;
2337 48 : minwords = PG_INT32_MAX;
2338 48 : minI = -1;
2339 :
2340 : /*
2341 : * Choose the cover that contains max items. In case of tie choose the
2342 : * one with smaller number of words.
2343 : */
2344 114 : for (i = 0; i < numcovers; i++)
2345 : {
2346 66 : if (!covers[i].chosen && !covers[i].excluded &&
2347 48 : (maxitems < covers[i].poslen ||
2348 12 : (maxitems == covers[i].poslen &&
2349 12 : minwords > covers[i].curlen)))
2350 : {
2351 36 : maxitems = covers[i].poslen;
2352 36 : minwords = covers[i].curlen;
2353 36 : minI = i;
2354 : }
2355 : }
2356 : /* if a cover was found mark it */
2357 48 : if (minI >= 0)
2358 : {
2359 36 : covers[minI].chosen = true;
2360 : /* adjust the size of cover */
2361 36 : startpos = covers[minI].startpos;
2362 36 : endpos = covers[minI].endpos;
2363 36 : curlen = covers[minI].curlen;
2364 : /* stretch the cover if cover size is lower than max_words */
2365 36 : if (curlen < max_words)
2366 : {
2367 : /* divide the stretch on both sides of cover */
2368 36 : maxstretch = (max_words - curlen) / 2;
2369 :
2370 : /*
2371 : * first stretch the startpos stop stretching if 1. we hit the
2372 : * beginning of document 2. exceed maxstretch 3. we hit an
2373 : * already marked fragment
2374 : */
2375 36 : stretch = 0;
2376 36 : posmarker = startpos;
2377 600 : for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2378 : {
2379 564 : if (!NONWORDTOKEN(prs->words[i].type))
2380 : {
2381 270 : curlen++;
2382 270 : stretch++;
2383 : }
2384 564 : posmarker = i;
2385 : }
2386 : /* cut back startpos till we find a good endpoint */
2387 132 : for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2388 : {
2389 96 : if (!NONWORDTOKEN(prs->words[i].type))
2390 36 : curlen--;
2391 : }
2392 36 : startpos = i;
2393 : /* now stretch the endpos as much as possible */
2394 36 : posmarker = endpos;
2395 966 : for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2396 : {
2397 930 : if (!NONWORDTOKEN(prs->words[i].type))
2398 462 : curlen++;
2399 930 : posmarker = i;
2400 : }
2401 : /* cut back endpos till we find a good endpoint */
2402 90 : for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2403 : {
2404 54 : if (!NONWORDTOKEN(prs->words[i].type))
2405 24 : curlen--;
2406 : }
2407 36 : endpos = i;
2408 : }
2409 36 : covers[minI].startpos = startpos;
2410 36 : covers[minI].endpos = endpos;
2411 36 : covers[minI].curlen = curlen;
2412 : /* Mark the chosen fragments (covers) */
2413 36 : mark_fragment(prs, highlightall, startpos, endpos);
2414 36 : num_f++;
2415 : /* Exclude covers overlapping this one from future consideration */
2416 96 : for (i = 0; i < numcovers; i++)
2417 : {
2418 60 : if (i != minI &&
2419 24 : ((covers[i].startpos >= startpos &&
2420 12 : covers[i].startpos <= endpos) ||
2421 24 : (covers[i].endpos >= startpos &&
2422 12 : covers[i].endpos <= endpos) ||
2423 24 : (covers[i].startpos < startpos &&
2424 12 : covers[i].endpos > endpos)))
2425 0 : covers[i].excluded = true;
2426 : }
2427 : }
2428 : else
2429 12 : break; /* no selectable covers remain */
2430 : }
2431 :
2432 : /* show the first min_words words if we have not marked anything */
2433 30 : if (num_f <= 0)
2434 : {
2435 6 : startpos = curlen = 0;
2436 6 : endpos = -1;
2437 186 : for (i = 0; i < prs->curwords && curlen < min_words; i++)
2438 : {
2439 180 : if (!NONWORDTOKEN(prs->words[i].type))
2440 90 : curlen++;
2441 180 : endpos = i;
2442 : }
2443 6 : mark_fragment(prs, highlightall, startpos, endpos);
2444 : }
2445 :
2446 30 : pfree(covers);
2447 30 : }
2448 :
2449 : /*
2450 : * Headline selector used when MaxFragments == 0
2451 : */
2452 : static void
2453 344 : mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations,
2454 : bool highlightall,
2455 : int shortword, int min_words, int max_words)
2456 : {
2457 344 : int nextpos = 0,
2458 344 : p = 0,
2459 344 : q = 0;
2460 344 : int bestb = -1,
2461 344 : beste = -1;
2462 344 : int bestlen = -1;
2463 344 : bool bestcover = false;
2464 : int pose,
2465 : posb,
2466 : poslen,
2467 : curlen;
2468 : bool poscover;
2469 : int i;
2470 :
2471 344 : if (!highlightall)
2472 : {
2473 : /* examine all covers, select a headline using the best one */
2474 508 : while (hlCover(prs, query, locations, &nextpos, &p, &q))
2475 : {
2476 : /*
2477 : * Count words (curlen) and interesting words (poslen) within
2478 : * cover, but stop once we reach max_words. This step doesn't
2479 : * consider whether that's a good stopping point. posb and pose
2480 : * are set to the start and end indexes of the possible headline.
2481 : */
2482 170 : curlen = 0;
2483 170 : poslen = 0;
2484 170 : posb = pose = p;
2485 1456 : for (i = p; i <= q && curlen < max_words; i++)
2486 : {
2487 1286 : if (!NONWORDTOKEN(prs->words[i].type))
2488 728 : curlen++;
2489 1286 : if (INTERESTINGWORD(i))
2490 290 : poslen++;
2491 1286 : pose = i;
2492 : }
2493 :
2494 170 : if (curlen < max_words)
2495 : {
2496 : /*
2497 : * We have room to lengthen the headline, so search forward
2498 : * until it's full or we find a good stopping point. We'll
2499 : * reconsider the word at "q", then move forward.
2500 : */
2501 2938 : for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2502 : {
2503 2912 : if (i > q)
2504 : {
2505 2754 : if (!NONWORDTOKEN(prs->words[i].type))
2506 1374 : curlen++;
2507 2754 : if (INTERESTINGWORD(i))
2508 120 : poslen++;
2509 : }
2510 2912 : pose = i;
2511 2912 : if (BADENDPOINT(i))
2512 1944 : continue;
2513 968 : if (curlen >= min_words)
2514 132 : break;
2515 : }
2516 158 : if (curlen < min_words)
2517 : {
2518 : /*
2519 : * Reached end of text and our headline is still shorter
2520 : * than min_words, so try to extend it to the left.
2521 : */
2522 366 : for (i = p - 1; i >= 0; i--)
2523 : {
2524 364 : if (!NONWORDTOKEN(prs->words[i].type))
2525 182 : curlen++;
2526 364 : if (INTERESTINGWORD(i))
2527 6 : poslen++;
2528 364 : if (curlen >= max_words)
2529 0 : break;
2530 364 : if (BADENDPOINT(i))
2531 236 : continue;
2532 128 : if (curlen >= min_words)
2533 24 : break;
2534 : }
2535 26 : posb = (i >= 0) ? i : 0;
2536 : }
2537 : }
2538 : else
2539 : {
2540 : /*
2541 : * Can't make headline longer, so consider making it shorter
2542 : * if needed to avoid a bad endpoint.
2543 : */
2544 12 : if (i > q)
2545 6 : i = q;
2546 30 : for (; curlen > min_words; i--)
2547 : {
2548 30 : if (!BADENDPOINT(i))
2549 : break;
2550 18 : if (!NONWORDTOKEN(prs->words[i].type))
2551 6 : curlen--;
2552 18 : if (INTERESTINGWORD(i))
2553 0 : poslen--;
2554 18 : pose = i - 1;
2555 : }
2556 : }
2557 :
2558 : /*
2559 : * Check whether the proposed headline includes the original
2560 : * cover; it might not if we trimmed it due to max_words.
2561 : */
2562 170 : poscover = (posb <= p && pose >= q);
2563 :
2564 : /*
2565 : * Adopt this headline if it's better than the last one, giving
2566 : * highest priority to headlines including the cover, then to
2567 : * headlines with more interesting words, then to headlines with
2568 : * good stopping points. (Since bestlen is initially -1, we will
2569 : * certainly adopt the first headline.)
2570 : */
2571 170 : if (poscover > bestcover ||
2572 78 : (poscover == bestcover && poslen > bestlen) ||
2573 72 : (poscover == bestcover && poslen == bestlen &&
2574 12 : !BADENDPOINT(pose) && BADENDPOINT(beste)))
2575 : {
2576 98 : bestb = posb;
2577 98 : beste = pose;
2578 98 : bestlen = poslen;
2579 98 : bestcover = poscover;
2580 : }
2581 : }
2582 :
2583 : /*
2584 : * If we found nothing acceptable, select min_words words starting at
2585 : * the beginning.
2586 : */
2587 338 : if (bestlen < 0)
2588 : {
2589 240 : curlen = 0;
2590 240 : pose = -1;
2591 1038 : for (i = 0; i < prs->curwords && curlen < min_words; i++)
2592 : {
2593 798 : if (!NONWORDTOKEN(prs->words[i].type))
2594 516 : curlen++;
2595 798 : pose = i;
2596 : }
2597 240 : bestb = 0;
2598 240 : beste = pose;
2599 : }
2600 : }
2601 : else
2602 : {
2603 : /* highlightall mode: headline is whole document */
2604 6 : bestb = 0;
2605 6 : beste = prs->curwords - 1;
2606 : }
2607 :
2608 344 : mark_fragment(prs, highlightall, bestb, beste);
2609 344 : }
2610 :
2611 : /*
2612 : * Default parser's prsheadline function
2613 : */
2614 : Datum
2615 374 : prsd_headline(PG_FUNCTION_ARGS)
2616 : {
2617 374 : HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2618 374 : List *prsoptions = (List *) PG_GETARG_POINTER(1);
2619 374 : TSQuery query = PG_GETARG_TSQUERY(2);
2620 : List *locations;
2621 :
2622 : /* default option values: */
2623 374 : int min_words = 15;
2624 374 : int max_words = 35;
2625 374 : int shortword = 3;
2626 374 : int max_fragments = 0;
2627 374 : bool highlightall = false;
2628 : ListCell *l;
2629 :
2630 : /* Extract configuration option values */
2631 374 : prs->startsel = NULL;
2632 374 : prs->stopsel = NULL;
2633 374 : prs->fragdelim = NULL;
2634 728 : foreach(l, prsoptions)
2635 : {
2636 354 : DefElem *defel = (DefElem *) lfirst(l);
2637 354 : char *val = defGetString(defel);
2638 :
2639 354 : if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2640 36 : max_words = pg_strtoint32(val);
2641 318 : else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2642 36 : min_words = pg_strtoint32(val);
2643 282 : else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2644 0 : shortword = pg_strtoint32(val);
2645 282 : else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2646 30 : max_fragments = pg_strtoint32(val);
2647 252 : else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2648 120 : prs->startsel = pstrdup(val);
2649 132 : else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2650 120 : prs->stopsel = pstrdup(val);
2651 12 : else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2652 6 : prs->fragdelim = pstrdup(val);
2653 6 : else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2654 18 : highlightall = (pg_strcasecmp(val, "1") == 0 ||
2655 12 : pg_strcasecmp(val, "on") == 0 ||
2656 6 : pg_strcasecmp(val, "true") == 0 ||
2657 0 : pg_strcasecmp(val, "t") == 0 ||
2658 12 : pg_strcasecmp(val, "y") == 0 ||
2659 0 : pg_strcasecmp(val, "yes") == 0);
2660 : else
2661 0 : ereport(ERROR,
2662 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2663 : errmsg("unrecognized headline parameter: \"%s\"",
2664 : defel->defname)));
2665 : }
2666 :
2667 : /* in HighlightAll mode these parameters are ignored */
2668 374 : if (!highlightall)
2669 : {
2670 368 : if (min_words >= max_words)
2671 0 : ereport(ERROR,
2672 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2673 : errmsg("MinWords should be less than MaxWords")));
2674 368 : if (min_words <= 0)
2675 0 : ereport(ERROR,
2676 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2677 : errmsg("MinWords should be positive")));
2678 368 : if (shortword < 0)
2679 0 : ereport(ERROR,
2680 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2681 : errmsg("ShortWord should be >= 0")));
2682 368 : if (max_fragments < 0)
2683 0 : ereport(ERROR,
2684 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2685 : errmsg("MaxFragments should be >= 0")));
2686 : }
2687 :
2688 : /* Locate words and phrases matching the query */
2689 374 : if (query->size > 0)
2690 : {
2691 : hlCheck ch;
2692 :
2693 362 : ch.words = prs->words;
2694 362 : ch.len = prs->curwords;
2695 362 : locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
2696 : checkcondition_HL);
2697 : }
2698 : else
2699 12 : locations = NIL; /* empty query matches nothing */
2700 :
2701 : /* Apply appropriate headline selector */
2702 374 : if (max_fragments == 0)
2703 344 : mark_hl_words(prs, query, locations, highlightall, shortword,
2704 : min_words, max_words);
2705 : else
2706 30 : mark_hl_fragments(prs, query, locations, highlightall, shortword,
2707 : min_words, max_words, max_fragments);
2708 :
2709 : /* Fill in default values for string options */
2710 374 : if (!prs->startsel)
2711 254 : prs->startsel = pstrdup("<b>");
2712 374 : if (!prs->stopsel)
2713 254 : prs->stopsel = pstrdup("</b>");
2714 374 : if (!prs->fragdelim)
2715 368 : prs->fragdelim = pstrdup(" ... ");
2716 :
2717 : /* Caller will need these lengths, too */
2718 374 : prs->startsellen = strlen(prs->startsel);
2719 374 : prs->stopsellen = strlen(prs->stopsel);
2720 374 : prs->fragdelimlen = strlen(prs->fragdelim);
2721 :
2722 374 : PG_RETURN_POINTER(prs);
2723 : }
|