Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wparser_def.c
4 : * Default text search parser
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/wparser_def.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include <limits.h>
18 : #include <wctype.h>
19 :
20 : #include "commands/defrem.h"
21 : #include "mb/pg_wchar.h"
22 : #include "miscadmin.h"
23 : #include "tsearch/ts_public.h"
24 : #include "tsearch/ts_type.h"
25 : #include "tsearch/ts_utils.h"
26 : #include "utils/builtins.h"
27 : #include "utils/pg_locale.h"
28 :
29 :
30 : /* Define me to enable tracing of parser behavior */
31 : /* #define WPARSER_TRACE */
32 :
33 :
34 : /* Output token categories */
/*
 * Numeric codes for the kinds of token the parser can report.  Codes start
 * at 1; the entries of tok_alias[] and lex_descr[] are listed in this same
 * order, each preceded by an empty placeholder at index 0.
 */
35 :
36 : #define ASCIIWORD 1
37 : #define WORD_T 2
38 : #define NUMWORD 3
39 : #define EMAIL 4
40 : #define URL_T 5
41 : #define HOST 6
42 : #define SCIENTIFIC 7
43 : #define VERSIONNUMBER 8
44 : #define NUMPARTHWORD 9
45 : #define PARTHWORD 10
46 : #define ASCIIPARTHWORD 11
47 : #define SPACE 12
48 : #define TAG_T 13
49 : #define PROTOCOL 14
50 : #define NUMHWORD 15
51 : #define ASCIIHWORD 16
52 : #define HWORD 17
53 : #define URLPATH 18
54 : #define FILEPATH 19
55 : #define DECIMAL_T 20
56 : #define SIGNEDINT 21
57 : #define UNSIGNEDINT 22
58 : #define XMLENTITY 23
59 :
/* Highest category code defined above (currently equal to XMLENTITY). */
60 : #define LASTNUM 23
61 :
/*
 * Short alias for each token category.  The entries follow the order of the
 * category codes defined above; the entry at index 0 is an empty placeholder
 * because the codes start at 1.
 */
62 : static const char *const tok_alias[] = {
63 : "",
64 : "asciiword",
65 : "word",
66 : "numword",
67 : "email",
68 : "url",
69 : "host",
70 : "sfloat",
71 : "version",
72 : "hword_numpart",
73 : "hword_part",
74 : "hword_asciipart",
75 : "blank",
76 : "tag",
77 : "protocol",
78 : "numhword",
79 : "asciihword",
80 : "hword",
81 : "url_path",
82 : "file",
83 : "float",
84 : "int",
85 : "uint",
86 : "entity"
87 : };
88 :
/*
 * Human-readable description of each token category, listed in the same
 * order as tok_alias[]; the entry at index 0 is an empty placeholder.
 */
89 : static const char *const lex_descr[] = {
90 : "",
91 : "Word, all ASCII",
92 : "Word, all letters",
93 : "Word, letters and digits",
94 : "Email address",
95 : "URL",
96 : "Host",
97 : "Scientific notation",
98 : "Version number",
99 : "Hyphenated word part, letters and digits",
100 : "Hyphenated word part, all letters",
101 : "Hyphenated word part, all ASCII",
102 : "Space symbols",
103 : "XML tag",
104 : "Protocol head",
105 : "Hyphenated word, letters and digits",
106 : "Hyphenated word, all ASCII",
107 : "Hyphenated word, all letters",
108 : "URL path",
109 : "File or path name",
110 : "Decimal notation",
111 : "Signed integer",
112 : "Unsigned integer",
113 : "XML entity"
114 : };
115 :
116 :
117 : /* Parser states */
/*
 * One enumerator per state of the parser's state machine.  TPS_Base is
 * explicitly 0 and is the state a freshly initialized parser starts in;
 * the remaining values follow in declaration order, so do not reorder.
 * TPS_Null is not a real state: it only marks the end of the list and is
 * used as the "tostate" of action rows that name no new state
 * (NOTE(review): presumably meaning "stay in the current state" -- confirm
 * against TParserGet).
 */
118 :
119 : typedef enum
120 : {
121 : TPS_Base = 0,
122 : TPS_InNumWord,
123 : TPS_InAsciiWord,
124 : TPS_InWord,
125 : TPS_InUnsignedInt,
126 : TPS_InSignedIntFirst,
127 : TPS_InSignedInt,
128 : TPS_InSpace,
129 : TPS_InUDecimalFirst,
130 : TPS_InUDecimal,
131 : TPS_InDecimalFirst,
132 : TPS_InDecimal,
133 : TPS_InVerVersion,
134 : TPS_InSVerVersion,
135 : TPS_InVersionFirst,
136 : TPS_InVersion,
137 : TPS_InMantissaFirst,
138 : TPS_InMantissaSign,
139 : TPS_InMantissa,
140 : TPS_InXMLEntityFirst,
141 : TPS_InXMLEntity,
142 : TPS_InXMLEntityNumFirst,
143 : TPS_InXMLEntityNum,
144 : TPS_InXMLEntityHexNumFirst,
145 : TPS_InXMLEntityHexNum,
146 : TPS_InXMLEntityEnd,
147 : TPS_InTagFirst,
148 : TPS_InXMLBegin,
149 : TPS_InTagCloseFirst,
150 : TPS_InTagName,
151 : TPS_InTagBeginEnd,
152 : TPS_InTag,
153 : TPS_InTagEscapeK,
154 : TPS_InTagEscapeKK,
155 : TPS_InTagBackSleshed,
156 : TPS_InTagEnd,
157 : TPS_InCommentFirst,
158 : TPS_InCommentLast,
159 : TPS_InComment,
160 : TPS_InCloseCommentFirst,
161 : TPS_InCloseCommentLast,
162 : TPS_InCommentEnd,
163 : TPS_InHostFirstDomain,
164 : TPS_InHostDomainSecond,
165 : TPS_InHostDomain,
166 : TPS_InPortFirst,
167 : TPS_InPort,
168 : TPS_InHostFirstAN,
169 : TPS_InHost,
170 : TPS_InEmail,
171 : TPS_InFileFirst,
172 : TPS_InFileTwiddle,
173 : TPS_InPathFirst,
174 : TPS_InPathFirstFirst,
175 : TPS_InPathSecond,
176 : TPS_InFile,
177 : TPS_InFileNext,
178 : TPS_InURLPathFirst,
179 : TPS_InURLPathStart,
180 : TPS_InURLPath,
181 : TPS_InFURL,
182 : TPS_InProtocolFirst,
183 : TPS_InProtocolSecond,
184 : TPS_InProtocolEnd,
185 : TPS_InHyphenAsciiWordFirst,
186 : TPS_InHyphenAsciiWord,
187 : TPS_InHyphenWordFirst,
188 : TPS_InHyphenWord,
189 : TPS_InHyphenNumWordFirst,
190 : TPS_InHyphenNumWord,
191 : TPS_InHyphenDigitLookahead,
192 : TPS_InParseHyphen,
193 : TPS_InParseHyphenHyphen,
194 : TPS_InHyphenWordPart,
195 : TPS_InHyphenAsciiWordPart,
196 : TPS_InHyphenNumWordPart,
197 : TPS_InHyphenUnsignedInt,
198 : TPS_Null /* last state (fake value) */
199 : } TParserState;
200 :
201 : /* forward declaration */
202 : struct TParser;
203 :
/*
 * Callback types used in the state/action tables.  A TParserCharTest
 * inspects the parser and returns nonzero when its condition holds; a
 * TParserSpecial performs an extra side effect on the parser.
 */
204 : typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
205 : * except p_iseq */
206 : typedef void (*TParserSpecial) (struct TParser *); /* special handler for
207 : * special cases... */
208 :
/*
 * One row of a state/action table.  Rows in the tables are written as
 * {isclass, c, flags, tostate, type, special}.
 */
209 : typedef struct
210 : {
211 : TParserCharTest isclass; /* test to apply; NULL in a table's last row */
212 : char c; /* character for p_iseqC-style tests, else 0 */
213 : uint16 flags; /* OR of A_* flag bits */
214 : TParserState tostate; /* state named by this row, or TPS_Null */
215 : int type; /* token category code, or 0 */
216 : TParserSpecial special; /* optional extra handler, or NULL */
217 : } TParserStateActionItem;
218 :
219 : /* Flag bits in TParserStateActionItem.flags */
/*
 * Each nonzero flag is a distinct bit, so several can be OR-ed together in
 * one table row.  A_NEXT is zero, i.e. it stands for "no other flag set".
 * NOTE(review): the effect of each bit is implemented in TParserGet, which
 * is outside this excerpt -- consult it before relying on the names alone.
 */
220 : #define A_NEXT 0x0000
221 : #define A_BINGO 0x0001
222 : #define A_POP 0x0002
223 : #define A_PUSH 0x0004
224 : #define A_RERUN 0x0008
225 : #define A_CLEAR 0x0010
226 : #define A_MERGE 0x0020
227 : #define A_CLRALL 0x0040
228 :
/*
 * A saved parser position.  Positions form a singly linked stack through
 * "prev"; TParser.state points at the top entry.  newTParserPosition()
 * creates a new top entry, optionally as a copy of the one beneath it.
 */
229 : typedef struct TParserPosition
230 : {
231 : int posbyte; /* position of parser in bytes */
232 : int poschar; /* position of parser in characters */
233 : int charlen; /* length of current char */
234 : int lenbytetoken; /* length of token-so-far in bytes */
235 : int lenchartoken; /* and in chars */
236 : TParserState state; /* state-machine state at this position */
237 : struct TParserPosition *prev; /* entry beneath this one, or NULL */
238 : const TParserStateActionItem *pushedAtAction; /* reset to NULL on creation;
 * NOTE(review): presumably the
 * table row that pushed this
 * entry -- confirm in TParserGet */
239 : } TParserPosition;
240 :
/*
 * Complete state of one parser instance: the input text, the stack of
 * positions, a few mode flags, and the most recently produced token.
 */
241 : typedef struct TParser
242 : {
243 : /* string and position information */
244 : char *str; /* multibyte string */
245 : int lenstr; /* length of mbstring */
246 : pg_wchar *pgwstr; /* wide-character form of str, indexed by character position */
247 :
248 : /* State of parse */
249 : int charmaxlen; /* max bytes per character in the database encoding */
250 : TParserPosition *state; /* top of the position stack */
251 : bool ignore; /* set/cleared by SpecialTags, tested by p_isignore */
252 : bool wanthost; /* one-shot flag consumed by p_isstophost */
253 :
254 : /* character that p_iseqC/p_isneC compare the current input character against */
255 : char c;
256 :
257 : /* out */
258 : char *token;
259 : int lenbytetoken;
260 : int lenchartoken;
261 : int type;
262 : } TParser;
263 :
264 :
265 : /* forward decls here */
/* Needed early because p_ishost() and p_isURLPath() call TParserGet(). */
266 : static bool TParserGet(TParser *prs);
267 :
268 :
269 : static TParserPosition *
270 10232 : newTParserPosition(TParserPosition *prev)
271 : {
272 10232 : TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
273 :
274 10232 : if (prev)
275 5238 : memcpy(res, prev, sizeof(TParserPosition));
276 : else
277 4994 : memset(res, 0, sizeof(TParserPosition));
278 :
279 10232 : res->prev = prev;
280 :
281 10232 : res->pushedAtAction = NULL;
282 :
283 10232 : return res;
284 : }
285 :
286 : static TParser *
287 4754 : TParserInit(char *str, int len)
288 : {
289 4754 : TParser *prs = (TParser *) palloc0(sizeof(TParser));
290 :
291 4754 : prs->charmaxlen = pg_database_encoding_max_length();
292 4754 : prs->str = str;
293 4754 : prs->lenstr = len;
294 4754 : prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
295 4754 : pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
296 :
297 4754 : prs->state = newTParserPosition(NULL);
298 4754 : prs->state->state = TPS_Base;
299 :
300 : #ifdef WPARSER_TRACE
301 : fprintf(stderr, "parsing \"%.*s\"\n", len, str);
302 : #endif
303 :
304 4754 : return prs;
305 : }
306 :
307 : /*
308 : * As an alternative to a full TParserInit one can create a
309 : * TParserCopy which basically is a regular TParser without a private
310 : * copy of the string - instead it uses the one from another TParser.
311 : * This is useful because at some places TParsers are created
312 : * recursively and the repeated copying around of the strings can
313 : * cause major inefficiency if the source string is long.
314 : * The new parser starts parsing at the original's current position.
315 : *
316 : * Obviously one must not close the original TParser before the copy.
317 : */
318 : static TParser *
319 240 : TParserCopyInit(const TParser *orig)
320 : {
321 240 : TParser *prs = (TParser *) palloc0(sizeof(TParser));
322 :
323 240 : prs->charmaxlen = orig->charmaxlen;
324 240 : prs->str = orig->str + orig->state->posbyte;
325 240 : prs->lenstr = orig->lenstr - orig->state->posbyte;
326 :
327 240 : if (orig->pgwstr)
328 240 : prs->pgwstr = orig->pgwstr + orig->state->poschar;
329 :
330 240 : prs->state = newTParserPosition(NULL);
331 240 : prs->state->state = TPS_Base;
332 :
333 : #ifdef WPARSER_TRACE
334 : fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
335 : #endif
336 :
337 240 : return prs;
338 : }
339 :
340 :
341 : static void
342 4754 : TParserClose(TParser *prs)
343 : {
344 9508 : while (prs->state)
345 : {
346 4754 : TParserPosition *ptr = prs->state->prev;
347 :
348 4754 : pfree(prs->state);
349 4754 : prs->state = ptr;
350 : }
351 :
352 4754 : if (prs->pgwstr)
353 4754 : pfree(prs->pgwstr);
354 :
355 : #ifdef WPARSER_TRACE
356 : fprintf(stderr, "closing parser\n");
357 : #endif
358 4754 : pfree(prs);
359 4754 : }
360 :
361 : /*
362 : * Close a parser created with TParserCopyInit
363 : */
364 : static void
365 240 : TParserCopyClose(TParser *prs)
366 : {
367 612 : while (prs->state)
368 : {
369 372 : TParserPosition *ptr = prs->state->prev;
370 :
371 372 : pfree(prs->state);
372 372 : prs->state = ptr;
373 : }
374 :
375 : #ifdef WPARSER_TRACE
376 : fprintf(stderr, "closing parser copy\n");
377 : #endif
378 240 : pfree(prs);
379 240 : }
380 :
381 :
/*
 * Character-type support functions using the database default locale.
 *
 * p_iswhat(type, nonascii) defines two functions:
 *
 *	p_is<type>(prs)		classifies the wide character at the parser's
 *						current character position with pg_isw<type>();
 *	p_isnot<type>(prs)	the logical negation of the above.
 *
 * If the encoding is multibyte, the locale's ctype is C, and the character
 * is non-ASCII (> 0x7f), the value to be returned is determined by the
 * 'nonascii' macro argument instead of asking pg_isw<type>().
 *
 * The database locale is looked up once per call and the same value is used
 * both for the ctype_is_c test and for the classification call.
 */

#define p_iswhat(type, nonascii)										\
																		\
static int																\
p_is##type(TParser *prs)												\
{																		\
	pg_locale_t locale = pg_database_locale();							\
	pg_wchar	wc;														\
																		\
	Assert(prs->state);													\
	wc = prs->pgwstr[prs->state->poschar];								\
	if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f)			\
		return nonascii;												\
	return pg_isw##type(wc, locale);									\
}																		\
																		\
static int																\
p_isnot##type(TParser *prs)												\
{																		\
	return !p_is##type(prs);											\
}
407 :
408 : /*
409 : * In C locale with a multibyte encoding, any non-ASCII symbol is considered
410 : * an alpha character, but not a member of other char classes.
411 : */
/*
 * Each line below defines a p_is<type>()/p_isnot<type>() pair.  The second
 * argument is the result for non-ASCII characters in the case described
 * above; note that it is 1 for alnum as well as for alpha.
 */
412 25122 : p_iswhat(alnum, 1)
413 93856 : p_iswhat(alpha, 1)
414 37132 : p_iswhat(digit, 0)
415 0 : p_iswhat(lower, 0)
416 0 : p_iswhat(print, 0)
417 0 : p_iswhat(punct, 0)
418 678 : p_iswhat(space, 0)
419 0 : p_iswhat(upper, 0)
420 18 : p_iswhat(xdigit, 0)
421 :
422 : /* p_iseq should be used only for ascii symbols */
423 :
424 : static int
425 231392 : p_iseq(TParser *prs, char c)
426 : {
427 : Assert(prs->state);
428 231392 : return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
429 : }
430 :
431 : static int
432 100158 : p_isEOF(TParser *prs)
433 : {
434 : Assert(prs->state);
435 100158 : return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
436 : }
437 :
438 : static int
439 231392 : p_iseqC(TParser *prs)
440 : {
441 231392 : return p_iseq(prs, prs->c);
442 : }
443 :
444 : static int
445 0 : p_isneC(TParser *prs)
446 : {
447 0 : return !p_iseq(prs, prs->c);
448 : }
449 :
450 : static int
451 73544 : p_isascii(TParser *prs)
452 : {
453 73544 : return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
454 : }
455 :
456 : static int
457 73544 : p_isasclet(TParser *prs)
458 : {
459 73544 : return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
460 : }
461 :
462 : static int
463 2658 : p_isurlchar(TParser *prs)
464 : {
465 : char ch;
466 :
467 : /* no non-ASCII need apply */
468 2658 : if (prs->state->charlen != 1)
469 0 : return 0;
470 2658 : ch = *(prs->str + prs->state->posbyte);
471 : /* no spaces or control characters */
472 2658 : if (ch <= 0x20 || ch >= 0x7F)
473 234 : return 0;
474 : /* reject characters disallowed by RFC 3986 */
475 2424 : switch (ch)
476 : {
477 24 : case '"':
478 : case '<':
479 : case '>':
480 : case '\\':
481 : case '^':
482 : case '`':
483 : case '{':
484 : case '|':
485 : case '}':
486 24 : return 0;
487 : }
488 2400 : return 1;
489 : }
490 :
491 :
492 : /* deliberately suppress unused-function complaints for the above */
493 : void _make_compiler_happy(void);
494 : void
495 0 : _make_compiler_happy(void)
496 : {
497 0 : p_isalnum(NULL);
498 0 : p_isnotalnum(NULL);
499 0 : p_isalpha(NULL);
500 0 : p_isnotalpha(NULL);
501 0 : p_isdigit(NULL);
502 0 : p_isnotdigit(NULL);
503 0 : p_islower(NULL);
504 0 : p_isnotlower(NULL);
505 0 : p_isprint(NULL);
506 0 : p_isnotprint(NULL);
507 0 : p_ispunct(NULL);
508 0 : p_isnotpunct(NULL);
509 0 : p_isspace(NULL);
510 0 : p_isnotspace(NULL);
511 0 : p_isupper(NULL);
512 0 : p_isnotupper(NULL);
513 0 : p_isxdigit(NULL);
514 0 : p_isnotxdigit(NULL);
515 0 : p_isEOF(NULL);
516 0 : p_iseqC(NULL);
517 0 : p_isneC(NULL);
518 0 : }
519 :
520 :
521 : static void
522 252 : SpecialTags(TParser *prs)
523 : {
524 252 : switch (prs->state->lenchartoken)
525 : {
526 6 : case 8: /* </script */
527 6 : if (pg_strncasecmp(prs->token, "</script", 8) == 0)
528 6 : prs->ignore = false;
529 6 : break;
530 24 : case 7: /* <script || </style */
531 24 : if (pg_strncasecmp(prs->token, "</style", 7) == 0)
532 0 : prs->ignore = false;
533 24 : else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
534 6 : prs->ignore = true;
535 24 : break;
536 18 : case 6: /* <style */
537 18 : if (pg_strncasecmp(prs->token, "<style", 6) == 0)
538 0 : prs->ignore = true;
539 18 : break;
540 204 : default:
541 204 : break;
542 : }
543 252 : }
544 :
545 : static void
546 132 : SpecialFURL(TParser *prs)
547 : {
548 132 : prs->wanthost = true;
549 132 : prs->state->posbyte -= prs->state->lenbytetoken;
550 132 : prs->state->poschar -= prs->state->lenchartoken;
551 132 : }
552 :
553 : static void
554 36 : SpecialHyphen(TParser *prs)
555 : {
556 36 : prs->state->posbyte -= prs->state->lenbytetoken;
557 36 : prs->state->poschar -= prs->state->lenchartoken;
558 36 : }
559 :
560 : static void
561 0 : SpecialVerVersion(TParser *prs)
562 : {
563 0 : prs->state->posbyte -= prs->state->lenbytetoken;
564 0 : prs->state->poschar -= prs->state->lenchartoken;
565 0 : prs->state->lenbytetoken = 0;
566 0 : prs->state->lenchartoken = 0;
567 0 : }
568 :
569 : static int
570 480 : p_isstophost(TParser *prs)
571 : {
572 480 : if (prs->wanthost)
573 : {
574 204 : prs->wanthost = false;
575 204 : return 1;
576 : }
577 276 : return 0;
578 : }
579 :
580 : static int
581 36086 : p_isignore(TParser *prs)
582 : {
583 36086 : return (prs->ignore) ? 1 : 0;
584 : }
585 :
586 : static int
587 90 : p_ishost(TParser *prs)
588 : {
589 90 : TParser *tmpprs = TParserCopyInit(prs);
590 90 : int res = 0;
591 :
592 90 : tmpprs->wanthost = true;
593 :
594 : /*
595 : * Check stack depth before recursing. (Since TParserGet() doesn't
596 : * normally recurse, we put the cost of checking here not there.)
597 : */
598 90 : check_stack_depth();
599 :
600 90 : if (TParserGet(tmpprs) && tmpprs->type == HOST)
601 : {
602 72 : prs->state->posbyte += tmpprs->lenbytetoken;
603 72 : prs->state->poschar += tmpprs->lenchartoken;
604 72 : prs->state->lenbytetoken += tmpprs->lenbytetoken;
605 72 : prs->state->lenchartoken += tmpprs->lenchartoken;
606 72 : prs->state->charlen = tmpprs->state->charlen;
607 72 : res = 1;
608 : }
609 90 : TParserCopyClose(tmpprs);
610 :
611 90 : return res;
612 : }
613 :
614 : static int
615 150 : p_isURLPath(TParser *prs)
616 : {
617 150 : TParser *tmpprs = TParserCopyInit(prs);
618 150 : int res = 0;
619 :
620 150 : tmpprs->state = newTParserPosition(tmpprs->state);
621 150 : tmpprs->state->state = TPS_InURLPathFirst;
622 :
623 : /*
624 : * Check stack depth before recursing. (Since TParserGet() doesn't
625 : * normally recurse, we put the cost of checking here not there.)
626 : */
627 150 : check_stack_depth();
628 :
629 150 : if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
630 : {
631 132 : prs->state->posbyte += tmpprs->lenbytetoken;
632 132 : prs->state->poschar += tmpprs->lenchartoken;
633 132 : prs->state->lenbytetoken += tmpprs->lenbytetoken;
634 132 : prs->state->lenchartoken += tmpprs->lenchartoken;
635 132 : prs->state->charlen = tmpprs->state->charlen;
636 132 : res = 1;
637 : }
638 150 : TParserCopyClose(tmpprs);
639 :
640 150 : return res;
641 : }
642 :
643 : /*
644 : * returns true if current character has zero display length or
645 : * it's a special sign in several languages. Such characters
646 : * aren't a word-breaker although they aren't an isalpha.
647 : * In beginning of word they aren't a part of it.
648 : */
649 : static int
650 8724 : p_isspecial(TParser *prs)
651 : {
652 : /*
653 : * pg_dsplen could return -1 which means error or control character
654 : */
655 8724 : if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
656 0 : return 1;
657 :
658 : /*
659 : * Unicode Characters in the 'Mark, Spacing Combining' Category That
660 : * characters are not alpha although they are not breakers of word too.
661 : * Check that only in utf encoding, because other encodings aren't
662 : * supported by postgres or even exists.
663 : */
664 8724 : if (GetDatabaseEncoding() == PG_UTF8)
665 : {
666 : static const pg_wchar strange_letter[] = {
667 : /*
668 : * use binary search, so elements should be ordered
669 : */
670 : 0x0903, /* DEVANAGARI SIGN VISARGA */
671 : 0x093E, /* DEVANAGARI VOWEL SIGN AA */
672 : 0x093F, /* DEVANAGARI VOWEL SIGN I */
673 : 0x0940, /* DEVANAGARI VOWEL SIGN II */
674 : 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
675 : 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
676 : 0x094B, /* DEVANAGARI VOWEL SIGN O */
677 : 0x094C, /* DEVANAGARI VOWEL SIGN AU */
678 : 0x0982, /* BENGALI SIGN ANUSVARA */
679 : 0x0983, /* BENGALI SIGN VISARGA */
680 : 0x09BE, /* BENGALI VOWEL SIGN AA */
681 : 0x09BF, /* BENGALI VOWEL SIGN I */
682 : 0x09C0, /* BENGALI VOWEL SIGN II */
683 : 0x09C7, /* BENGALI VOWEL SIGN E */
684 : 0x09C8, /* BENGALI VOWEL SIGN AI */
685 : 0x09CB, /* BENGALI VOWEL SIGN O */
686 : 0x09CC, /* BENGALI VOWEL SIGN AU */
687 : 0x09D7, /* BENGALI AU LENGTH MARK */
688 : 0x0A03, /* GURMUKHI SIGN VISARGA */
689 : 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
690 : 0x0A3F, /* GURMUKHI VOWEL SIGN I */
691 : 0x0A40, /* GURMUKHI VOWEL SIGN II */
692 : 0x0A83, /* GUJARATI SIGN VISARGA */
693 : 0x0ABE, /* GUJARATI VOWEL SIGN AA */
694 : 0x0ABF, /* GUJARATI VOWEL SIGN I */
695 : 0x0AC0, /* GUJARATI VOWEL SIGN II */
696 : 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
697 : 0x0ACB, /* GUJARATI VOWEL SIGN O */
698 : 0x0ACC, /* GUJARATI VOWEL SIGN AU */
699 : 0x0B02, /* ORIYA SIGN ANUSVARA */
700 : 0x0B03, /* ORIYA SIGN VISARGA */
701 : 0x0B3E, /* ORIYA VOWEL SIGN AA */
702 : 0x0B40, /* ORIYA VOWEL SIGN II */
703 : 0x0B47, /* ORIYA VOWEL SIGN E */
704 : 0x0B48, /* ORIYA VOWEL SIGN AI */
705 : 0x0B4B, /* ORIYA VOWEL SIGN O */
706 : 0x0B4C, /* ORIYA VOWEL SIGN AU */
707 : 0x0B57, /* ORIYA AU LENGTH MARK */
708 : 0x0BBE, /* TAMIL VOWEL SIGN AA */
709 : 0x0BBF, /* TAMIL VOWEL SIGN I */
710 : 0x0BC1, /* TAMIL VOWEL SIGN U */
711 : 0x0BC2, /* TAMIL VOWEL SIGN UU */
712 : 0x0BC6, /* TAMIL VOWEL SIGN E */
713 : 0x0BC7, /* TAMIL VOWEL SIGN EE */
714 : 0x0BC8, /* TAMIL VOWEL SIGN AI */
715 : 0x0BCA, /* TAMIL VOWEL SIGN O */
716 : 0x0BCB, /* TAMIL VOWEL SIGN OO */
717 : 0x0BCC, /* TAMIL VOWEL SIGN AU */
718 : 0x0BD7, /* TAMIL AU LENGTH MARK */
719 : 0x0C01, /* TELUGU SIGN CANDRABINDU */
720 : 0x0C02, /* TELUGU SIGN ANUSVARA */
721 : 0x0C03, /* TELUGU SIGN VISARGA */
722 : 0x0C41, /* TELUGU VOWEL SIGN U */
723 : 0x0C42, /* TELUGU VOWEL SIGN UU */
724 : 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
725 : 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
726 : 0x0C82, /* KANNADA SIGN ANUSVARA */
727 : 0x0C83, /* KANNADA SIGN VISARGA */
728 : 0x0CBE, /* KANNADA VOWEL SIGN AA */
729 : 0x0CC0, /* KANNADA VOWEL SIGN II */
730 : 0x0CC1, /* KANNADA VOWEL SIGN U */
731 : 0x0CC2, /* KANNADA VOWEL SIGN UU */
732 : 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
733 : 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
734 : 0x0CC7, /* KANNADA VOWEL SIGN EE */
735 : 0x0CC8, /* KANNADA VOWEL SIGN AI */
736 : 0x0CCA, /* KANNADA VOWEL SIGN O */
737 : 0x0CCB, /* KANNADA VOWEL SIGN OO */
738 : 0x0CD5, /* KANNADA LENGTH MARK */
739 : 0x0CD6, /* KANNADA AI LENGTH MARK */
740 : 0x0D02, /* MALAYALAM SIGN ANUSVARA */
741 : 0x0D03, /* MALAYALAM SIGN VISARGA */
742 : 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
743 : 0x0D3F, /* MALAYALAM VOWEL SIGN I */
744 : 0x0D40, /* MALAYALAM VOWEL SIGN II */
745 : 0x0D46, /* MALAYALAM VOWEL SIGN E */
746 : 0x0D47, /* MALAYALAM VOWEL SIGN EE */
747 : 0x0D48, /* MALAYALAM VOWEL SIGN AI */
748 : 0x0D4A, /* MALAYALAM VOWEL SIGN O */
749 : 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
750 : 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
751 : 0x0D57, /* MALAYALAM AU LENGTH MARK */
752 : 0x0D82, /* SINHALA SIGN ANUSVARAYA */
753 : 0x0D83, /* SINHALA SIGN VISARGAYA */
754 : 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
755 : 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
756 : 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
757 : 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
758 : 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
759 : 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
760 : 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
761 : 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
762 : 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
763 : * AELA-PILLA */
764 : 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
765 : 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
766 : 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
767 : 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
768 : 0x0F3E, /* TIBETAN SIGN YAR TSHES */
769 : 0x0F3F, /* TIBETAN SIGN MAR TSHES */
770 : 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
771 : 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
772 : 0x102C, /* MYANMAR VOWEL SIGN AA */
773 : 0x1031, /* MYANMAR VOWEL SIGN E */
774 : 0x1038, /* MYANMAR SIGN VISARGA */
775 : 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
776 : 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
777 : 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
778 : 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
779 : 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
780 : 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
781 : 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
782 : 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
783 : 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
784 : 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
785 : 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
786 : 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
787 : 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
788 : 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
789 : 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
790 : 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
791 : 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
792 : 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
793 : 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
794 : 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
795 : 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
796 : 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
797 : 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
798 : 0x17B6, /* KHMER VOWEL SIGN AA */
799 : 0x17BE, /* KHMER VOWEL SIGN OE */
800 : 0x17BF, /* KHMER VOWEL SIGN YA */
801 : 0x17C0, /* KHMER VOWEL SIGN IE */
802 : 0x17C1, /* KHMER VOWEL SIGN E */
803 : 0x17C2, /* KHMER VOWEL SIGN AE */
804 : 0x17C3, /* KHMER VOWEL SIGN AI */
805 : 0x17C4, /* KHMER VOWEL SIGN OO */
806 : 0x17C5, /* KHMER VOWEL SIGN AU */
807 : 0x17C7, /* KHMER SIGN REAHMUK */
808 : 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
809 : 0x1923, /* LIMBU VOWEL SIGN EE */
810 : 0x1924, /* LIMBU VOWEL SIGN AI */
811 : 0x1925, /* LIMBU VOWEL SIGN OO */
812 : 0x1926, /* LIMBU VOWEL SIGN AU */
813 : 0x1929, /* LIMBU SUBJOINED LETTER YA */
814 : 0x192A, /* LIMBU SUBJOINED LETTER RA */
815 : 0x192B, /* LIMBU SUBJOINED LETTER WA */
816 : 0x1930, /* LIMBU SMALL LETTER KA */
817 : 0x1931, /* LIMBU SMALL LETTER NGA */
818 : 0x1933, /* LIMBU SMALL LETTER TA */
819 : 0x1934, /* LIMBU SMALL LETTER NA */
820 : 0x1935, /* LIMBU SMALL LETTER PA */
821 : 0x1936, /* LIMBU SMALL LETTER MA */
822 : 0x1937, /* LIMBU SMALL LETTER RA */
823 : 0x1938, /* LIMBU SMALL LETTER LA */
824 : 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
825 : 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
826 : 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
827 : 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
828 : 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
829 : 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
830 : 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
831 : 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
832 : 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
833 : 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
834 : 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
835 : 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
836 : 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
837 : 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
838 : 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
839 : 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
840 : 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
841 : 0x19C8, /* NEW TAI LUE TONE MARK-1 */
842 : 0x19C9, /* NEW TAI LUE TONE MARK-2 */
843 : 0x1A19, /* BUGINESE VOWEL SIGN E */
844 : 0x1A1A, /* BUGINESE VOWEL SIGN O */
845 : 0x1A1B, /* BUGINESE VOWEL SIGN AE */
846 : 0x1B04, /* BALINESE SIGN BISAH */
847 : 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
848 : 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
849 : 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
850 : 0x1B3E, /* BALINESE VOWEL SIGN TALING */
851 : 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
852 : 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
853 : 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
854 : 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
855 : 0x1B44, /* BALINESE ADEG ADEG */
856 : 0x1B82, /* SUNDANESE SIGN PANGWISAD */
857 : 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
858 : 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
859 : 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
860 : 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
861 : 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
862 : 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
863 : 0x1C26, /* LEPCHA VOWEL SIGN AA */
864 : 0x1C27, /* LEPCHA VOWEL SIGN I */
865 : 0x1C28, /* LEPCHA VOWEL SIGN O */
866 : 0x1C29, /* LEPCHA VOWEL SIGN OO */
867 : 0x1C2A, /* LEPCHA VOWEL SIGN U */
868 : 0x1C2B, /* LEPCHA VOWEL SIGN UU */
869 : 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
870 : 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
871 : 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
872 : 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
873 : 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
874 : 0xA880, /* SAURASHTRA SIGN ANUSVARA */
875 : 0xA881, /* SAURASHTRA SIGN VISARGA */
876 : 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
877 : 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
878 : 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
879 : 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
880 : 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
881 : 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
882 : 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
883 : 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
884 : 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
885 : 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
886 : 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
887 : 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
888 : 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
889 : 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
890 : 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
891 : 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
892 : 0xA952, /* REJANG CONSONANT SIGN H */
893 : 0xA953, /* REJANG VIRAMA */
894 : 0xAA2F, /* CHAM VOWEL SIGN O */
895 : 0xAA30, /* CHAM VOWEL SIGN AI */
896 : 0xAA33, /* CHAM CONSONANT SIGN YA */
897 : 0xAA34, /* CHAM CONSONANT SIGN RA */
898 : 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
899 : };
900 8724 : const pg_wchar *StopLow = strange_letter,
901 8724 : *StopHigh = strange_letter + lengthof(strange_letter),
902 : *StopMiddle;
903 : pg_wchar c;
904 :
905 8724 : c = *(prs->pgwstr + prs->state->poschar);
906 :
907 78516 : while (StopLow < StopHigh)
908 : {
909 69792 : StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
910 69792 : if (*StopMiddle == c)
911 0 : return 1;
912 69792 : else if (*StopMiddle < c)
913 0 : StopLow = StopMiddle + 1;
914 : else
915 69792 : StopHigh = StopMiddle;
916 : }
917 : }
918 :
919 8724 : return 0;
920 : }
921 :
922 : /*
923 : * Table of state/action of parser
924 : */
/*
 * There is one table per parser state.  Every row is written as
 * {isclass, c, flags, tostate, type, special}; see TParserStateActionItem.
 * Each table ends with a row whose isclass is NULL.
 * NOTE(review): rows are presumably tried top to bottom with the NULL row
 * acting as the default -- confirm against TParserGet.
 */
925 :
/* Rows for TPS_Base, the state a freshly initialized parser starts in. */
926 : static const TParserStateActionItem actionTPS_Base[] = {
927 : {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
928 : {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
929 : {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
930 : {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
931 : {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
932 : {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
933 : {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
934 : {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
935 : {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
936 : {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
937 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
938 : {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
939 : {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
940 : };
941 :
942 :
/*
 * Rows for TPS_InNumWord, written as
 * {isclass, c, flags, tostate, type, special}.  The first and last rows
 * carry A_BINGO with token type NUMWORD.
 */
943 : static const TParserStateActionItem actionTPS_InNumWord[] = {
944 : {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
945 : {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
946 : {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
947 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
948 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
949 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
950 : {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
951 : {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
952 : };
953 :
/*
 * Rows for TPS_InAsciiWord, written as
 * {isclass, c, flags, tostate, type, special}.  The first and last rows
 * carry A_BINGO with token type ASCIIWORD.
 *
 * Several tests ('.', '-', p_isdigit) appear in two rows with different
 * target states.  NOTE(review): presumably the later row is the fallback
 * tried after the earlier A_PUSH attempt is abandoned -- confirm against
 * TParserGet.
 */
954 : static const TParserStateActionItem actionTPS_InAsciiWord[] = {
955 : {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
956 : {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
957 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
958 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
959 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
960 : {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
961 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
962 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
963 : {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
964 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
965 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
966 : {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
967 : {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
968 : {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
969 : {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
970 : };
971 :
/*
 * Rows for TPS_InWord, written as
 * {isclass, c, flags, tostate, type, special}.  The first and last rows
 * carry A_BINGO with token type WORD_T.
 */
972 : static const TParserStateActionItem actionTPS_InWord[] = {
973 : {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
974 : {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
975 : {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
976 : {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
977 : {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
978 : {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
979 : };
980 :
/*
 * Rows for TPS_InUnsignedInt, written as
 * {isclass, c, flags, tostate, type, special}.  The first and last rows
 * carry A_BINGO with token type UNSIGNEDINT.  The '.' test appears twice
 * with different target states (host domain, then decimal fraction).
 */
981 : static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
982 : {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
983 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
984 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
985 : {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
986 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
987 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
988 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
989 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
990 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
991 : {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
992 : {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
993 : {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
994 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
995 : {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
996 : };
997 :
/*
 * Rules for TPS_InSignedIntFirst.  A digit discards the saved position
 * (A_CLEAR) and targets TPS_InSignedInt; EOF or anything else pops back to
 * the saved position.
 */
998 : static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
999 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1000 : {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1001 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1002 : };
1003 :
/*
 * Rules for TPS_InSignedInt.  EOF and the default report SIGNEDINT; '.'
 * pushes an attempt at TPS_InDecimalFirst and 'e'/'E' push an attempt at
 * TPS_InMantissaFirst.
 */
1004 : static const TParserStateActionItem actionTPS_InSignedInt[] = {
1005 : {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1006 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1007 : {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1008 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1009 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1010 : {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1011 : };
1012 :
/*
 * Rules for TPS_InSpace.  EOF, '<', '-', '+', '&', '/' and the default all
 * report a SPACE token; p_isignore and p_isnotalnum matches take A_NEXT
 * instead.  Rule order matters: '<' is tested before p_isignore, and the
 * other explicit characters are tested before p_isnotalnum.
 */
1013 : static const TParserStateActionItem actionTPS_InSpace[] = {
1014 : {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1015 : {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1016 : {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1017 : {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1018 : {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1019 : {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1020 : {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1021 : {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1022 : {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1023 : };
1024 :
/*
 * Rules for TPS_InUDecimalFirst.  A digit discards the saved position
 * (A_CLEAR) and targets TPS_InUDecimal; EOF or anything else pops.
 */
1025 : static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
1026 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1027 : {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1028 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1029 : };
1030 :
/*
 * Rules for TPS_InUDecimal.  EOF and the default report DECIMAL_T; '.'
 * pushes an attempt at TPS_InVersionFirst and 'e'/'E' push an attempt at
 * TPS_InMantissaFirst.
 */
1031 : static const TParserStateActionItem actionTPS_InUDecimal[] = {
1032 : {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1033 : {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1034 : {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1035 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1036 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1037 : {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1038 : };
1039 :
/*
 * Rules for TPS_InDecimalFirst.  A digit discards the saved position
 * (A_CLEAR) and targets TPS_InDecimal; EOF or anything else pops.
 */
1040 : static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
1041 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1042 : {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1043 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1044 : };
1045 :
/*
 * Rules for TPS_InDecimal.  EOF and the default report DECIMAL_T.  Unlike
 * TPS_InUDecimal, a '.' here pushes an attempt at TPS_InVerVersion rather
 * than TPS_InVersionFirst; 'e'/'E' push an attempt at TPS_InMantissaFirst.
 */
1046 : static const TParserStateActionItem actionTPS_InDecimal[] = {
1047 : {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1048 : {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1049 : {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1050 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1051 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1052 : {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1053 : };
1054 :
/*
 * Rules for TPS_InVerVersion.  On a digit, TParserGet first calls the
 * SpecialVerVersion handler and then applies A_RERUN with target
 * TPS_InSVerVersion; EOF or anything else pops.
 * NOTE(review): what SpecialVerVersion and A_RERUN do is defined outside
 * this part of the file.
 */
1055 : static const TParserStateActionItem actionTPS_InVerVersion[] = {
1056 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1057 : {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1058 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1059 : };
1060 :
/*
 * Rules for TPS_InSVerVersion.  A digit reports a SPACE token, drops every
 * saved position (A_CLRALL) and targets TPS_InUnsignedInt.  EOF pops; any
 * other character takes the default A_NEXT rule.
 */
1061 : static const TParserStateActionItem actionTPS_InSVerVersion[] = {
1062 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1063 : {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1064 : {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1065 : };
1066 :
1067 :
/*
 * Rules for TPS_InVersionFirst.  A digit discards the saved position
 * (A_CLEAR) and targets TPS_InVersion; EOF or anything else pops.
 */
1068 : static const TParserStateActionItem actionTPS_InVersionFirst[] = {
1069 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1070 : {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1071 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1072 : };
1073 :
/*
 * Rules for TPS_InVersion.  EOF and the default report VERSIONNUMBER; each
 * further '.' pushes another attempt at TPS_InVersionFirst.
 */
1074 : static const TParserStateActionItem actionTPS_InVersion[] = {
1075 : {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1076 : {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1077 : {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1078 : {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1079 : };
1080 :
/*
 * Rules for TPS_InMantissaFirst.  A digit discards the saved position
 * (A_CLEAR) and targets TPS_InMantissa; '+' or '-' targets
 * TPS_InMantissaSign; EOF or anything else pops.
 */
1081 : static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
1082 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1083 : {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1084 : {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1085 : {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1086 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1087 : };
1088 :
/*
 * Rules for TPS_InMantissaSign.  A digit discards the saved position
 * (A_CLEAR) and targets TPS_InMantissa; EOF or anything else pops.
 */
1089 : static const TParserStateActionItem actionTPS_InMantissaSign[] = {
1090 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1091 : {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1092 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1093 : };
1094 :
/*
 * Rules for TPS_InMantissa.  Digits keep targeting TPS_InMantissa; EOF and
 * the default report a SCIENTIFIC token.
 */
1095 : static const TParserStateActionItem actionTPS_InMantissa[] = {
1096 : {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1097 : {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1098 : {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1099 : };
1100 :
/*
 * Rules for TPS_InXMLEntityFirst.  '#' targets TPS_InXMLEntityNumFirst;
 * a p_isasclet match, ':' or '_' targets TPS_InXMLEntity; EOF or anything
 * else pops.
 */
1101 : static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
1102 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1103 : {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1104 : {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1105 : {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1106 : {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1107 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1108 : };
1109 :
/*
 * Rules for TPS_InXMLEntity.  p_isalnum matches and ':', '_', '.', '-'
 * keep targeting TPS_InXMLEntity; ';' targets TPS_InXMLEntityEnd; EOF or
 * anything else pops.
 */
1110 : static const TParserStateActionItem actionTPS_InXMLEntity[] = {
1111 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1112 : {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1113 : {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1114 : {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1115 : {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1116 : {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1117 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1118 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1119 : };
1120 :
/*
 * Rules for TPS_InXMLEntityNumFirst.  'x' or 'X' targets
 * TPS_InXMLEntityHexNumFirst; a digit targets TPS_InXMLEntityNum; EOF or
 * anything else pops.
 */
1121 : static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
1122 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1123 : {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1124 : {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1125 : {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1126 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1127 : };
1128 :
/*
 * Rules for TPS_InXMLEntityHexNumFirst.  A p_isxdigit match targets
 * TPS_InXMLEntityHexNum; EOF or anything else pops.
 */
1129 : static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
1130 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1131 : {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1132 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1133 : };
1134 :
/*
 * Rules for TPS_InXMLEntityNum.  Digits keep targeting TPS_InXMLEntityNum;
 * ';' targets TPS_InXMLEntityEnd; EOF or anything else pops.
 */
1135 : static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
1136 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1137 : {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1138 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1139 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1140 : };
1141 :
/*
 * Rules for TPS_InXMLEntityHexNum.  p_isxdigit matches keep targeting
 * TPS_InXMLEntityHexNum; ';' targets TPS_InXMLEntityEnd; EOF or anything
 * else pops.
 */
1142 : static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
1143 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1144 : {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1145 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1146 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1147 : };
1148 :
/*
 * Single unconditional rule for TPS_InXMLEntityEnd: whatever the current
 * character is, report an XMLENTITY token, discard the saved position
 * (A_CLEAR) and target TPS_Base.
 */
1149 : static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
1150 : {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1151 : };
1152 :
/*
 * Rules for TPS_InTagFirst.  Every recognised character pushes a new
 * attempt: '/' at TPS_InTagCloseFirst, '!' at TPS_InCommentFirst, '?' at
 * TPS_InXMLBegin, and a p_isasclet match, ':' or '_' at TPS_InTagName.
 * EOF or anything else pops.
 */
1153 : static const TParserStateActionItem actionTPS_InTagFirst[] = {
1154 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1155 : {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1156 : {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1157 : {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1158 : {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1159 : {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1160 : {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1161 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1162 : };
1163 :
/*
 * Rules for TPS_InXMLBegin.  Only the single character 'x' is checked
 * before targeting TPS_InTag; EOF or anything else pops.
 */
1164 : static const TParserStateActionItem actionTPS_InXMLBegin[] = {
1165 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1166 : /* <?xml ... */
1167 : /* XXX do we want states for the m and l ? Right now this accepts <?xZ */
1168 : {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1169 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1170 : };
1171 :
/*
 * Rules for TPS_InTagCloseFirst.  A p_isasclet match targets TPS_InTagName;
 * EOF or anything else pops.
 */
1172 : static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
1173 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1174 : {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1175 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1176 : };
1177 :
/*
 * Rules for TPS_InTagName.  '/' targets TPS_InTagBeginEnd, '>' targets
 * TPS_InTagEnd and a p_isspace match targets TPS_InTag; the latter two also
 * name the SpecialTags handler, which TParserGet calls when the rule is
 * selected.  p_isalnum matches and ':', '_', '.', '-' take A_NEXT with no
 * named target; EOF or anything else pops.
 */
1178 : static const TParserStateActionItem actionTPS_InTagName[] = {
1179 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1180 : /* <br/> case */
1181 : {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1182 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1183 : {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1184 : {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1185 : {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1186 : {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1187 : {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1188 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1189 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1190 : };
1191 :
/*
 * Rules for TPS_InTagBeginEnd.  '>' targets TPS_InTagEnd; EOF or anything
 * else pops.
 */
1192 : static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
1193 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1194 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1195 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1196 : };
1197 :
/*
 * Rules for TPS_InTag.  '>' targets TPS_InTagEnd (with the SpecialTags
 * handler); a single or double quote targets TPS_InTagEscapeK or
 * TPS_InTagEscapeKK respectively.  The listed characters, p_isasclet and
 * p_isdigit matches, and p_isspace matches (again with SpecialTags) take
 * A_NEXT with no named target.  EOF or any character not listed pops.
 */
1198 : static const TParserStateActionItem actionTPS_InTag[] = {
1199 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1200 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1201 : {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1202 : {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1203 : {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1204 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1205 : {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1206 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1207 : {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1208 : {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1209 : {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1210 : {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1211 : {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1212 : {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1213 : {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1214 : {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1215 : {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1216 : {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1217 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1218 : };
1219 :
/*
 * Rules for TPS_InTagEscapeK (entered from TPS_InTag on a single quote).
 * A backslash pushes TPS_InTagBackSleshed; the closing single quote targets
 * TPS_InTag again; any other character stays on TPS_InTagEscapeK; EOF pops.
 */
1220 : static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
1221 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1222 : {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1223 : {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1224 : {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1225 : };
1226 :
/*
 * Rules for TPS_InTagEscapeKK (entered from TPS_InTag on a double quote).
 * A backslash pushes TPS_InTagBackSleshed; the closing double quote targets
 * TPS_InTag again; any other character stays on TPS_InTagEscapeKK; EOF pops.
 */
1227 : static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
1228 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1229 : {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1230 : {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1231 : {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1232 : };
1233 :
/*
 * Rules for TPS_InTagBackSleshed (pushed by the two quoted-string states on
 * a backslash).  EOF pops; any other character takes the A_MERGE action.
 * NOTE(review): the code that applies A_MERGE lies outside this part of the
 * file, so its exact effect is not described here.
 */
1234 : static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
1235 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1236 : {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1237 : };
1238 :
/*
 * Single unconditional rule for TPS_InTagEnd: report a TAG_T token, drop
 * every saved position (A_CLRALL) and target TPS_Base.
 */
1239 : static const TParserStateActionItem actionTPS_InTagEnd[] = {
1240 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1241 : };
1242 :
/*
 * Rules for TPS_InCommentFirst (pushed from TPS_InTagFirst on '!').
 * '-' targets TPS_InCommentLast; 'D' or 'd' targets TPS_InTag, checking only
 * that one letter of the DOCTYPE keyword; EOF or anything else pops.
 */
1243 : static const TParserStateActionItem actionTPS_InCommentFirst[] = {
1244 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1245 : {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1246 : /* <!DOCTYPE ...> */
1247 : {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1248 : {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1249 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1250 : };
1251 :
/*
 * Rules for TPS_InCommentLast.  A second '-' targets TPS_InComment; EOF or
 * anything else pops.
 */
1252 : static const TParserStateActionItem actionTPS_InCommentLast[] = {
1253 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1254 : {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1255 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1256 : };
1257 :
/*
 * Rules for TPS_InComment.  '-' targets TPS_InCloseCommentFirst; any other
 * character takes A_NEXT with no named target; EOF pops.
 */
1258 : static const TParserStateActionItem actionTPS_InComment[] = {
1259 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1260 : {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1261 : {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1262 : };
1263 :
/*
 * Rules for TPS_InCloseCommentFirst (one '-' seen inside a comment).
 * Another '-' targets TPS_InCloseCommentLast; any other character targets
 * TPS_InComment again; EOF pops.
 */
1264 : static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
1265 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1266 : {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1267 : {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1268 : };
1269 :
/*
 * Rules for TPS_InCloseCommentLast (two '-' seen inside a comment).
 * Further '-' characters take A_NEXT with no named target; '>' targets
 * TPS_InCommentEnd; any other character targets TPS_InComment again; EOF
 * pops.
 */
1270 : static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
1271 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1272 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1273 : {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1274 : {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1275 : };
1276 :
/*
 * Single unconditional rule for TPS_InCommentEnd: report a TAG_T token (the
 * same category used by TPS_InTagEnd), drop every saved position
 * (A_CLRALL) and target TPS_Base.
 */
1277 : static const TParserStateActionItem actionTPS_InCommentEnd[] = {
1278 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1279 : };
1280 :
/*
 * Rules for TPS_InHostFirstDomain.  A p_isasclet match targets
 * TPS_InHostDomainSecond; a digit targets TPS_InHost; EOF or anything else
 * pops.
 */
1281 : static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
1282 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1283 : {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1284 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1285 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1286 : };
1287 :
/*
 * Rules for TPS_InHostDomainSecond.  A p_isasclet match targets
 * TPS_InHostDomain; a digit, '-', '_', '.' or '@' each push a further
 * attempt.  No rule here reports a token: EOF and the default both pop.
 */
1288 : static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1289 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1290 : {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1291 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1292 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1293 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1294 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1295 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1296 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1297 : };
1298 :
/*
 * Rules for TPS_InHostDomain.  EOF and the default report a HOST token and
 * drop every saved position (A_CLRALL).  A p_isstophost match also reports
 * HOST but targets TPS_InURLPathStart instead of TPS_Base.
 *
 * p_isdigit appears twice.  A fresh scan always stops at the first digit
 * rule (A_PUSH to TPS_InHost); the second one (A_POP) can therefore only be
 * reached when the scan resumes after that pushed attempt was popped.
 */
1299 : static const TParserStateActionItem actionTPS_InHostDomain[] = {
1300 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1301 : {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1302 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1303 : {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1304 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1305 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1306 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1307 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1308 : {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1309 : {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1310 : {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1311 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1312 : };
1313 :
/*
 * Rules for TPS_InPortFirst (pushed from TPS_InHostDomain on ':').  A digit
 * targets TPS_InPort; EOF or anything else pops.
 */
1314 : static const TParserStateActionItem actionTPS_InPortFirst[] = {
1315 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1316 : {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1317 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1318 : };
1319 :
/*
 * Rules for TPS_InPort.  Digits keep targeting TPS_InPort.  EOF, a
 * p_isstophost match and the default all report a HOST token and drop every
 * saved position; the p_isstophost rule targets TPS_InURLPathStart rather
 * than TPS_Base.  '/' pushes an attempt at TPS_InFURL.
 */
1320 : static const TParserStateActionItem actionTPS_InPort[] = {
1321 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1322 : {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1323 : {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1324 : {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1325 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1326 : };
1327 :
/*
 * Rules for TPS_InHostFirstAN (pushed after '-' or '_' by several states).
 * A digit or a p_isasclet match targets TPS_InHost; EOF or anything else
 * pops.
 */
1328 : static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1329 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1330 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1331 : {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1332 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1333 : };
1334 :
/*
 * Rules for TPS_InHost.  Digits and p_isasclet matches keep targeting
 * TPS_InHost; '@', '.', '-' and '_' each push a further attempt.  No rule
 * here reports a token: EOF and the default both pop.
 */
1335 : static const TParserStateActionItem actionTPS_InHost[] = {
1336 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1337 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1338 : {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1339 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1340 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1341 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1342 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1343 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1344 : };
1345 :
/*
 * Rules for TPS_InEmail (pushed on '@').  This table has no explicit p_isEOF
 * rule.  A p_isstophost match pops; otherwise a p_ishost match reports an
 * EMAIL token and drops every saved position; anything else pops.
 */
1346 : static const TParserStateActionItem actionTPS_InEmail[] = {
1347 : {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1348 : {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1349 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1350 : };
1351 :
/*
 * Rules for TPS_InFileFirst.  A p_isasclet match, a digit or '_' targets
 * TPS_InFile; '.' targets TPS_InPathFirst; '~' pushes an attempt at
 * TPS_InFileTwiddle; EOF or anything else pops.
 */
1352 : static const TParserStateActionItem actionTPS_InFileFirst[] = {
1353 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1354 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1355 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1356 : {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1357 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1358 : {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1359 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1360 : };
1361 :
/*
 * Rules for TPS_InFileTwiddle (pushed from TPS_InFileFirst on '~').
 * A p_isasclet match, a digit or '_' targets TPS_InFile; '/' targets
 * TPS_InFileFirst; EOF or anything else pops.
 */
1362 : static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1363 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1364 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1365 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1366 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1367 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1368 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1369 : };
1370 :
/*
 * Rules for TPS_InPathFirst (reached from TPS_InFileFirst on '.').
 * A p_isasclet match, a digit or '_' targets TPS_InFile; a second '.'
 * targets TPS_InPathSecond; '/' targets TPS_InFileFirst; EOF or anything
 * else pops.
 */
1371 : static const TParserStateActionItem actionTPS_InPathFirst[] = {
1372 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1373 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1374 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1375 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1376 : {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1377 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1378 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1379 : };
1380 :
/*
 * Rules for TPS_InPathFirstFirst.  '.' targets TPS_InPathSecond; '/'
 * targets TPS_InFileFirst; EOF or anything else pops.
 */
1381 : static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1382 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1383 : {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1384 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1385 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1386 : };
1387 :
/*
 * Rules for TPS_InPathSecond.  '/' appears twice: the first rule pushes an
 * attempt at TPS_InFileFirst, and if that attempt pops the second rule
 * reports a FILEPATH token instead.  EOF and a p_isspace match also report
 * FILEPATH (discarding the saved position via A_CLEAR); anything else pops.
 */
1388 : static const TParserStateActionItem actionTPS_InPathSecond[] = {
1389 : {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1390 : {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1391 : {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1392 : {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1393 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1394 : };
1395 :
/*
 * Rules for TPS_InFile.  p_isasclet matches, digits, '_' and '-' keep
 * targeting TPS_InFile; '.' pushes an attempt at TPS_InFileNext and '/'
 * pushes an attempt at TPS_InFileFirst; EOF and the default report a
 * FILEPATH token.
 */
1396 : static const TParserStateActionItem actionTPS_InFile[] = {
1397 : {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1398 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1399 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1400 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1401 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1402 : {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1403 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1404 : {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1405 : };
1406 :
/*
 * Rules for TPS_InFileNext (pushed on '.').  A p_isasclet match, a digit or
 * '_' discards the saved position (A_CLEAR) and targets TPS_InFile; EOF or
 * anything else pops.
 */
1407 : static const TParserStateActionItem actionTPS_InFileNext[] = {
1408 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1409 : {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1410 : {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1411 : {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1412 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1413 : };
1414 :
/*
 * Rules for TPS_InURLPathFirst.  A p_isurlchar match targets TPS_InURLPath;
 * EOF or anything else pops.
 */
1415 : static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1416 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1417 : {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1418 : {NULL, 0, A_POP, TPS_Null, 0, NULL},
1419 : };
1420 :
/*
 * Single unconditional rule for TPS_InURLPathStart (the state targeted by
 * the p_isstophost rules of the host and port tables): take A_NEXT and
 * target TPS_InURLPath without testing the current character.
 */
1421 : static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1422 : {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1423 : };
1424 :
/*
 * Rules for TPS_InURLPath.  p_isurlchar matches keep targeting
 * TPS_InURLPath; EOF and the default report a URLPATH token.
 */
1425 : static const TParserStateActionItem actionTPS_InURLPath[] = {
1426 : {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1427 : {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1428 : {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1429 : };
1430 :
/*
 * Rules for TPS_InFURL (pushed from the host and port tables on '/').
 * When p_isURLPath succeeds, TParserGet calls the SpecialFURL handler and
 * the rule reports a URL_T token, dropping every saved position; EOF or
 * anything else pops.
 */
1431 : static const TParserStateActionItem actionTPS_InFURL[] = {
1432 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1433 : {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1434 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1435 : };
1436 :
/*
 * Rules for TPS_InProtocolFirst (pushed from TPS_InAsciiWord on ':').
 * '/' targets TPS_InProtocolSecond; EOF or anything else pops.
 */
1437 : static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1438 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1439 : {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1440 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1441 : };
1442 :
/*
 * Rules for TPS_InProtocolSecond.  A second '/' targets TPS_InProtocolEnd;
 * EOF or anything else pops.
 */
1443 : static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1444 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1445 : {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1446 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1447 : };
1448 :
/*
 * Single unconditional rule for TPS_InProtocolEnd: report a PROTOCOL token,
 * drop every saved position (A_CLRALL) and target TPS_Base.
 */
1449 : static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1450 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1451 : };
1452 :
/*
 * Rules for TPS_InHyphenAsciiWordFirst (pushed on '-').  p_isasclet is
 * tested before p_isalpha, so the former wins when both would match and
 * targets TPS_InHyphenAsciiWord; a p_isalpha match targets TPS_InHyphenWord;
 * a digit targets TPS_InHyphenDigitLookahead; EOF or anything else pops.
 */
1453 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1454 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1455 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1456 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1457 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1458 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1459 : };
1460 :
/*
 * Rules for TPS_InHyphenAsciiWord.  EOF and the default report an ASCIIHWORD
 * token, drop every saved position, call the SpecialHyphen handler and
 * target TPS_InParseHyphen.  Otherwise the character class selects which
 * hyphenated-word state comes next, and '-' pushes another attempt at
 * TPS_InHyphenAsciiWordFirst.
 */
1461 : static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1462 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1463 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1464 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1465 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1466 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1467 : {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1468 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1469 : };
1470 :
/*
 * Rules for TPS_InHyphenWordFirst (pushed on '-').  A p_isalpha match
 * targets TPS_InHyphenWord; a digit targets TPS_InHyphenDigitLookahead; EOF
 * or anything else pops.
 */
1471 : static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1472 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1473 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1474 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1475 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1476 : };
1477 :
/*
 * Rules for TPS_InHyphenWord.  EOF and the default report an HWORD token,
 * drop every saved position, call the SpecialHyphen handler and target
 * TPS_InParseHyphen.  A digit targets TPS_InHyphenNumWord; '-' pushes
 * another attempt at TPS_InHyphenWordFirst.
 */
1478 : static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1479 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1480 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1481 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1482 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1483 : {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1484 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1485 : };
1486 :
/*
 * Rules for TPS_InHyphenNumWordFirst (pushed on '-').  A p_isalpha match
 * targets TPS_InHyphenNumWord; a digit targets TPS_InHyphenDigitLookahead;
 * EOF or anything else pops.
 */
1487 : static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1488 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1489 : {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1490 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1491 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1492 : };
1493 :
/*
 * Rules for TPS_InHyphenNumWord.  EOF and the default report a NUMHWORD
 * token, drop every saved position, call the SpecialHyphen handler and
 * target TPS_InParseHyphen.  p_isalnum and p_isspecial matches stay on
 * TPS_InHyphenNumWord; '-' pushes another attempt at
 * TPS_InHyphenNumWordFirst.
 */
1494 : static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1495 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1496 : {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1497 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1498 : {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1499 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1500 : };
1501 :
/*
 * Rules for TPS_InHyphenDigitLookahead.  Digits stay on this state; a
 * p_isalpha or p_isspecial match targets TPS_InHyphenNumWord.  No rule here
 * reports a token: EOF and the default both pop, so a run consisting only
 * of digits abandons this attempt.
 */
1502 : static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1503 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1504 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1505 : {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1506 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1507 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1508 : };
1509 :
/*
 * Rules for TPS_InParseHyphen, the state targeted after a hyphenated-word
 * token has been reported.  The character class picks the part state
 * (p_isasclet is tested before p_isalpha); a digit pushes an attempt at
 * TPS_InHyphenUnsignedInt and '-' pushes TPS_InParseHyphenHyphen.  EOF and
 * the default take A_RERUN with target TPS_Base.
 * NOTE(review): the code that applies A_RERUN lies outside this part of the
 * file.
 */
1510 : static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1511 : {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1512 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1513 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1514 : {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1515 : {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1516 : {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1517 : };
1518 :
/*
 * Rules for TPS_InParseHyphenHyphen (pushed from TPS_InParseHyphen on '-').
 * A p_isalnum or p_isspecial match reports a SPACE token, discards the saved
 * position (A_CLEAR) and targets TPS_InParseHyphen; EOF or anything else
 * pops.
 */
1519 : static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1520 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1521 : {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1522 : {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1523 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1524 : };
1525 :
/*
 * Rules for TPS_InHyphenWordPart.  Both EOF and the default report a
 * PARTHWORD token, but with different targets: TPS_Base at EOF,
 * TPS_InParseHyphen otherwise.  A digit targets TPS_InHyphenNumWordPart.
 */
1526 : static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1527 : {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1528 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1529 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1530 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1531 : {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1532 : };
1533 :
/*
 * Rules for TPS_InHyphenAsciiWordPart.  Both EOF and the default report an
 * ASCIIPARTHWORD token, targeting TPS_Base at EOF and TPS_InParseHyphen
 * otherwise.  p_isasclet is tested before p_isalpha; a p_isalpha or
 * p_isspecial match targets TPS_InHyphenWordPart and a digit targets
 * TPS_InHyphenNumWordPart.
 */
1534 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1535 : {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1536 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1537 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1538 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1539 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1540 : {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1541 : };
1542 :
/*
 * Rules for TPS_InHyphenNumWordPart.  p_isalnum and p_isspecial matches
 * stay on this state.  Both EOF and the default report a NUMPARTHWORD
 * token, targeting TPS_Base at EOF and TPS_InParseHyphen otherwise.
 */
1543 : static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1544 : {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1545 : {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1546 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1547 : {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1548 : };
1549 :
/*
 * Rules for TPS_InHyphenUnsignedInt (pushed from TPS_InParseHyphen on a
 * digit).  Digits take A_NEXT with no named target; a p_isalpha or
 * p_isspecial match discards the saved position (A_CLEAR) and targets
 * TPS_InHyphenNumWordPart.  No rule here reports a token: EOF and the
 * default both pop.
 */
1550 : static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1551 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1552 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1553 : {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1554 : {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1555 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1556 : };
1557 :
1558 :
1559 : /*
1560 : * main table of per-state parser actions
1561 : */
/*
 * One entry of the Actions[] table: the rule list for a parser state plus
 * the state value itself.  TParserGet reads .action to find the rules and
 * uses .state only in an Assert that the entry sits at the matching index.
 * The state_name member exists only in WPARSER_TRACE builds.
 */
1562 : typedef struct
1563 : {
1564 : const TParserStateActionItem *action; /* the actual state info */
1565 : TParserState state; /* only for Assert crosscheck */
1566 : #ifdef WPARSER_TRACE
1567 : const char *state_name; /* only for debug printout */
1568 : #endif
1569 : } TParserStateAction;
1570 :
/*
 * TPARSERSTATEACTION(state) expands to the initializer for one Actions[]
 * entry.  The first member is written as CppConcat(action,state), matching
 * the actionTPS_xxx naming of the rule tables; the second is the state
 * value.  In WPARSER_TRACE builds a third member, CppAsString(state),
 * fills state_name.
 * NOTE(review): CppConcat and CppAsString are defined outside this file
 * section; presumably token pasting and stringification.
 */
1571 : #ifdef WPARSER_TRACE
1572 : #define TPARSERSTATEACTION(state) \
1573 : { CppConcat(action,state), state, CppAsString(state) }
1574 : #else
1575 : #define TPARSERSTATEACTION(state) \
1576 : { CppConcat(action,state), state }
1577 : #endif
1578 :
1579 : /*
1580 : * order must be the same as in typedef enum {} TParserState!!
1581 : */
1582 :
/*
 * Master table mapping each parser state to its rule list.  TParserGet
 * indexes it directly with prs->state->state and asserts that the entry's
 * .state member equals that index, so an entry placed out of order is
 * caught in assert-enabled builds.
 */
1583 : static const TParserStateAction Actions[] = {
1584 : TPARSERSTATEACTION(TPS_Base),
1585 : TPARSERSTATEACTION(TPS_InNumWord),
1586 : TPARSERSTATEACTION(TPS_InAsciiWord),
1587 : TPARSERSTATEACTION(TPS_InWord),
1588 : TPARSERSTATEACTION(TPS_InUnsignedInt),
1589 : TPARSERSTATEACTION(TPS_InSignedIntFirst),
1590 : TPARSERSTATEACTION(TPS_InSignedInt),
1591 : TPARSERSTATEACTION(TPS_InSpace),
1592 : TPARSERSTATEACTION(TPS_InUDecimalFirst),
1593 : TPARSERSTATEACTION(TPS_InUDecimal),
1594 : TPARSERSTATEACTION(TPS_InDecimalFirst),
1595 : TPARSERSTATEACTION(TPS_InDecimal),
1596 : TPARSERSTATEACTION(TPS_InVerVersion),
1597 : TPARSERSTATEACTION(TPS_InSVerVersion),
1598 : TPARSERSTATEACTION(TPS_InVersionFirst),
1599 : TPARSERSTATEACTION(TPS_InVersion),
1600 : TPARSERSTATEACTION(TPS_InMantissaFirst),
1601 : TPARSERSTATEACTION(TPS_InMantissaSign),
1602 : TPARSERSTATEACTION(TPS_InMantissa),
1603 : TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1604 : TPARSERSTATEACTION(TPS_InXMLEntity),
1605 : TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1606 : TPARSERSTATEACTION(TPS_InXMLEntityNum),
1607 : TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1608 : TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1609 : TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1610 : TPARSERSTATEACTION(TPS_InTagFirst),
1611 : TPARSERSTATEACTION(TPS_InXMLBegin),
1612 : TPARSERSTATEACTION(TPS_InTagCloseFirst),
1613 : TPARSERSTATEACTION(TPS_InTagName),
1614 : TPARSERSTATEACTION(TPS_InTagBeginEnd),
1615 : TPARSERSTATEACTION(TPS_InTag),
1616 : TPARSERSTATEACTION(TPS_InTagEscapeK),
1617 : TPARSERSTATEACTION(TPS_InTagEscapeKK),
1618 : TPARSERSTATEACTION(TPS_InTagBackSleshed),
1619 : TPARSERSTATEACTION(TPS_InTagEnd),
1620 : TPARSERSTATEACTION(TPS_InCommentFirst),
1621 : TPARSERSTATEACTION(TPS_InCommentLast),
1622 : TPARSERSTATEACTION(TPS_InComment),
1623 : TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1624 : TPARSERSTATEACTION(TPS_InCloseCommentLast),
1625 : TPARSERSTATEACTION(TPS_InCommentEnd),
1626 : TPARSERSTATEACTION(TPS_InHostFirstDomain),
1627 : TPARSERSTATEACTION(TPS_InHostDomainSecond),
1628 : TPARSERSTATEACTION(TPS_InHostDomain),
1629 : TPARSERSTATEACTION(TPS_InPortFirst),
1630 : TPARSERSTATEACTION(TPS_InPort),
1631 : TPARSERSTATEACTION(TPS_InHostFirstAN),
1632 : TPARSERSTATEACTION(TPS_InHost),
1633 : TPARSERSTATEACTION(TPS_InEmail),
1634 : TPARSERSTATEACTION(TPS_InFileFirst),
1635 : TPARSERSTATEACTION(TPS_InFileTwiddle),
1636 : TPARSERSTATEACTION(TPS_InPathFirst),
1637 : TPARSERSTATEACTION(TPS_InPathFirstFirst),
1638 : TPARSERSTATEACTION(TPS_InPathSecond),
1639 : TPARSERSTATEACTION(TPS_InFile),
1640 : TPARSERSTATEACTION(TPS_InFileNext),
1641 : TPARSERSTATEACTION(TPS_InURLPathFirst),
1642 : TPARSERSTATEACTION(TPS_InURLPathStart),
1643 : TPARSERSTATEACTION(TPS_InURLPath),
1644 : TPARSERSTATEACTION(TPS_InFURL),
1645 : TPARSERSTATEACTION(TPS_InProtocolFirst),
1646 : TPARSERSTATEACTION(TPS_InProtocolSecond),
1647 : TPARSERSTATEACTION(TPS_InProtocolEnd),
1648 : TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1649 : TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1650 : TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1651 : TPARSERSTATEACTION(TPS_InHyphenWord),
1652 : TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1653 : TPARSERSTATEACTION(TPS_InHyphenNumWord),
1654 : TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1655 : TPARSERSTATEACTION(TPS_InParseHyphen),
1656 : TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1657 : TPARSERSTATEACTION(TPS_InHyphenWordPart),
1658 : TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1659 : TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1660 : TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1661 : };
1662 :
1663 :
1664 : static bool
1665 28924 : TParserGet(TParser *prs)
1666 : {
1667 28924 : const TParserStateActionItem *item = NULL;
1668 :
1669 28924 : CHECK_FOR_INTERRUPTS();
1670 :
1671 : Assert(prs->state);
1672 :
1673 28924 : if (prs->state->posbyte >= prs->lenstr)
1674 4754 : return false;
1675 :
1676 24170 : prs->token = prs->str + prs->state->posbyte;
1677 24170 : prs->state->pushedAtAction = NULL;
1678 :
1679 : /* look at string */
1680 103278 : while (prs->state->posbyte <= prs->lenstr)
1681 : {
1682 103278 : if (prs->state->posbyte == prs->lenstr)
1683 4904 : prs->state->charlen = 0;
1684 : else
1685 196748 : prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1686 98374 : pg_mblen(prs->str + prs->state->posbyte);
1687 :
1688 : Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1689 : Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1690 : Assert(Actions[prs->state->state].state == prs->state->state);
1691 :
1692 103278 : if (prs->state->pushedAtAction)
1693 : {
1694 : /* After a POP, pick up at the next test */
1695 2592 : item = prs->state->pushedAtAction + 1;
1696 2592 : prs->state->pushedAtAction = NULL;
1697 : }
1698 : else
1699 : {
1700 100686 : item = Actions[prs->state->state].action;
1701 : Assert(item != NULL);
1702 : }
1703 :
1704 : /* find action by character class */
1705 555708 : while (item->isclass)
1706 : {
1707 524364 : prs->c = item->c;
1708 524364 : if (item->isclass(prs) != 0)
1709 71934 : break;
1710 452430 : item++;
1711 : }
1712 :
1713 : #ifdef WPARSER_TRACE
1714 : {
1715 : TParserPosition *ptr;
1716 :
1717 : fprintf(stderr, "state ");
1718 : /* indent according to stack depth */
1719 : for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1720 : fprintf(stderr, " ");
1721 : fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1722 : if (prs->state->posbyte < prs->lenstr)
1723 : fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1724 : else
1725 : fprintf(stderr, "at EOF");
1726 : fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1727 : (int) (item - Actions[prs->state->state].action),
1728 : (item->flags & A_BINGO) ? " BINGO" : "",
1729 : (item->flags & A_POP) ? " POP" : "",
1730 : (item->flags & A_PUSH) ? " PUSH" : "",
1731 : (item->flags & A_RERUN) ? " RERUN" : "",
1732 : (item->flags & A_CLEAR) ? " CLEAR" : "",
1733 : (item->flags & A_MERGE) ? " MERGE" : "",
1734 : (item->flags & A_CLRALL) ? " CLRALL" : "",
1735 : (item->tostate != TPS_Null) ? " tostate " : "",
1736 : (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1737 : (item->type > 0) ? " type " : "",
1738 : tok_alias[item->type]);
1739 : }
1740 : #endif
1741 :
1742 : /* call special handler if exists */
1743 103278 : if (item->special)
1744 420 : item->special(prs);
1745 :
1746 : /* BINGO, token is found */
1747 103278 : if (item->flags & A_BINGO)
1748 : {
1749 : Assert(item->type > 0);
1750 24170 : prs->lenbytetoken = prs->state->lenbytetoken;
1751 24170 : prs->lenchartoken = prs->state->lenchartoken;
1752 24170 : prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1753 24170 : prs->type = item->type;
1754 : }
1755 :
1756 : /* do various actions by flags */
1757 103278 : if (item->flags & A_POP)
1758 : { /* pop stored state in stack */
1759 2610 : TParserPosition *ptr = prs->state->prev;
1760 :
1761 2610 : pfree(prs->state);
1762 2610 : prs->state = ptr;
1763 : Assert(prs->state);
1764 : }
1765 100668 : else if (item->flags & A_PUSH)
1766 : { /* push (store) state in stack */
1767 5088 : prs->state->pushedAtAction = item; /* remember where we push */
1768 5088 : prs->state = newTParserPosition(prs->state);
1769 : }
1770 95580 : else if (item->flags & A_CLEAR)
1771 : { /* clear previous pushed state */
1772 : TParserPosition *ptr;
1773 :
1774 : Assert(prs->state->prev);
1775 498 : ptr = prs->state->prev->prev;
1776 498 : pfree(prs->state->prev);
1777 498 : prs->state->prev = ptr;
1778 : }
1779 95082 : else if (item->flags & A_CLRALL)
1780 : { /* clear all previous pushed state */
1781 : TParserPosition *ptr;
1782 :
1783 2778 : while (prs->state->prev)
1784 : {
1785 1998 : ptr = prs->state->prev->prev;
1786 1998 : pfree(prs->state->prev);
1787 1998 : prs->state->prev = ptr;
1788 : }
1789 : }
1790 94302 : else if (item->flags & A_MERGE)
1791 : { /* merge posinfo with current and pushed state */
1792 0 : TParserPosition *ptr = prs->state;
1793 :
1794 : Assert(prs->state->prev);
1795 0 : prs->state = prs->state->prev;
1796 :
1797 0 : prs->state->posbyte = ptr->posbyte;
1798 0 : prs->state->poschar = ptr->poschar;
1799 0 : prs->state->charlen = ptr->charlen;
1800 0 : prs->state->lenbytetoken = ptr->lenbytetoken;
1801 0 : prs->state->lenchartoken = ptr->lenchartoken;
1802 0 : pfree(ptr);
1803 : }
1804 :
1805 : /* set new state if pointed */
1806 103278 : if (item->tostate != TPS_Null)
1807 66202 : prs->state->state = item->tostate;
1808 :
1809 : /* check for go away */
1810 103278 : if ((item->flags & A_BINGO) ||
1811 79108 : (prs->state->posbyte >= prs->lenstr &&
1812 0 : (item->flags & A_RERUN) == 0))
1813 : break;
1814 :
1815 : /* go to beginning of loop if we should rerun or we just restore state */
1816 79108 : if (item->flags & (A_RERUN | A_POP))
1817 2634 : continue;
1818 :
1819 : /* move forward */
1820 76474 : if (prs->state->charlen)
1821 : {
1822 76474 : prs->state->posbyte += prs->state->charlen;
1823 76474 : prs->state->lenbytetoken += prs->state->charlen;
1824 76474 : prs->state->poschar++;
1825 76474 : prs->state->lenchartoken++;
1826 : }
1827 : }
1828 :
1829 24170 : return (item && (item->flags & A_BINGO));
1830 : }
1831 :
1832 : Datum
1833 9078 : prsd_lextype(PG_FUNCTION_ARGS)
1834 : {
1835 9078 : LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1836 : int i;
1837 :
1838 217872 : for (i = 1; i <= LASTNUM; i++)
1839 : {
1840 208794 : descr[i - 1].lexid = i;
1841 208794 : descr[i - 1].alias = pstrdup(tok_alias[i]);
1842 208794 : descr[i - 1].descr = pstrdup(lex_descr[i]);
1843 : }
1844 :
1845 9078 : descr[LASTNUM].lexid = 0;
1846 :
1847 9078 : PG_RETURN_POINTER(descr);
1848 : }
1849 :
1850 : Datum
1851 4754 : prsd_start(PG_FUNCTION_ARGS)
1852 : {
1853 4754 : PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1854 : }
1855 :
1856 : Datum
1857 28684 : prsd_nexttoken(PG_FUNCTION_ARGS)
1858 : {
1859 28684 : TParser *p = (TParser *) PG_GETARG_POINTER(0);
1860 28684 : char **t = (char **) PG_GETARG_POINTER(1);
1861 28684 : int *tlen = (int *) PG_GETARG_POINTER(2);
1862 :
1863 28684 : if (!TParserGet(p))
1864 4754 : PG_RETURN_INT32(0);
1865 :
1866 23930 : *t = p->token;
1867 23930 : *tlen = p->lenbytetoken;
1868 :
1869 23930 : PG_RETURN_INT32(p->type);
1870 : }
1871 :
1872 : Datum
1873 4754 : prsd_end(PG_FUNCTION_ARGS)
1874 : {
1875 4754 : TParser *p = (TParser *) PG_GETARG_POINTER(0);
1876 :
1877 4754 : TParserClose(p);
1878 4754 : PG_RETURN_VOID();
1879 : }
1880 :
1881 :
1882 : /*
1883 : * ts_headline support begins here
1884 : */
1885 :
/*
 * Token type classification macros.  Each takes a token type code (one of
 * the "Output token categories" constants) and yields a boolean.
 *
 * HLIDREPLACE, HLIDSKIP and XMLHLIDSKIP drive the "replace" and "skip"
 * flags set by mark_fragment(); XMLHLIDSKIP is the variant applied in
 * HighlightAll mode.  NONWORDTOKEN identifies tokens that are not counted
 * as words when measuring headline length, and NOENDTOKEN identifies token
 * types that BADENDPOINT() rejects as headline endpoints.
 */
#define TS_IDIGNORE(x) \
	((x) == TAG_T || (x) == PROTOCOL || (x) == SPACE || (x) == XMLENTITY)

#define HLIDREPLACE(x) \
	((x) == TAG_T)

#define HLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)

#define XMLHLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)

#define NONWORDTOKEN(x) \
	((x) == SPACE || HLIDREPLACE(x) || HLIDSKIP(x))

#define NOENDTOKEN(x) \
	(NONWORDTOKEN(x) || \
	 (x) == SCIENTIFIC || (x) == VERSIONNUMBER || (x) == DECIMAL_T || \
	 (x) == SIGNEDINT || (x) == UNSIGNEDINT || \
	 TS_IDIGNORE(x))
1893 :
1894 : /*
1895 : * Macros useful in headline selection. These rely on availability of
1896 : * "HeadlineParsedText *prs" describing some text, and "int shortword"
1897 : * describing the "short word" length parameter.
1898 : */
1899 :
1900 : /* Interesting words are non-repeated search terms */
1901 : #define INTERESTINGWORD(j) \
1902 : (prs->words[j].item && !prs->words[j].repeated)
1903 :
1904 : /* Don't want to end at a non-word or a short word, unless interesting */
1905 : #define BADENDPOINT(j) \
1906 : ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
1907 : !INTERESTINGWORD(j))
1908 :
1909 : typedef struct
1910 : {
1911 : /* one cover (well, really one fragment) for mark_hl_fragments */
1912 : int32 startpos; /* fragment's starting word index */
1913 : int32 endpos; /* ending word index (inclusive) */
1914 : int32 poslen; /* number of interesting words */
1915 : int32 curlen; /* total number of words */
1916 : bool chosen; /* chosen? */
1917 : bool excluded; /* excluded? */
1918 : } CoverPos;
1919 :
1920 : typedef struct
1921 : {
1922 : /* callback data for checkcondition_HL */
1923 : HeadlineWordEntry *words;
1924 : int len;
1925 : } hlCheck;
1926 :
1927 :
1928 : /*
1929 : * TS_execute callback for matching a tsquery operand to headline words
1930 : *
1931 : * Note: it's tempting to report words[] indexes as pos values to save
1932 : * searching in hlCover; but that would screw up phrase matching, which
1933 : * expects to measure distances in lexemes not tokens.
1934 : */
1935 : static TSTernaryValue
1936 1000 : checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
1937 : {
1938 1000 : hlCheck *checkval = (hlCheck *) opaque;
1939 : int i;
1940 :
1941 : /* scan words array for matching items */
1942 25450 : for (i = 0; i < checkval->len; i++)
1943 : {
1944 24650 : if (checkval->words[i].item == val)
1945 : {
1946 : /* if data == NULL, don't need to report positions */
1947 874 : if (!data)
1948 200 : return TS_YES;
1949 :
1950 674 : if (!data->pos)
1951 : {
1952 476 : data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1953 476 : data->allocated = true;
1954 476 : data->npos = 1;
1955 476 : data->pos[0] = checkval->words[i].pos;
1956 : }
1957 198 : else if (data->pos[data->npos - 1] < checkval->words[i].pos)
1958 : {
1959 198 : data->pos[data->npos++] = checkval->words[i].pos;
1960 : }
1961 : }
1962 : }
1963 :
1964 800 : if (data && data->npos > 0)
1965 476 : return TS_YES;
1966 :
1967 324 : return TS_NO;
1968 : }
1969 :
1970 : /*
1971 : * hlCover: try to find a substring of prs' word list that satisfies query
1972 : *
1973 : * locations is the result of TS_execute_locations() for the query.
1974 : * We use this to identify plausible subranges of the query.
1975 : *
1976 : * *nextpos is the lexeme position (NOT word index) to start the search
1977 : * at. Caller should initialize this to zero. If successful, we'll
1978 : * advance it to the next place to search at.
1979 : *
1980 : * On success, sets *p to first word index and *q to last word index of the
1981 : * cover substring, and returns true.
1982 : *
1983 : * The result is a minimal cover, in the sense that both *p and *q will be
1984 : * words used in the query.
1985 : */
1986 : static bool
1987 562 : hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
1988 : int *nextpos, int *p, int *q)
1989 : {
1990 562 : int pos = *nextpos;
1991 :
1992 : /* This loop repeats when our selected word-range fails the query */
1993 : for (;;)
1994 60 : {
1995 : int posb,
1996 : pose;
1997 : ListCell *lc;
1998 :
1999 : /*
2000 : * For each AND'ed query term or phrase, find its first occurrence at
2001 : * or after pos; set pose to the maximum of those positions.
2002 : *
2003 : * We need not consider ORs or NOTs here; see the comments for
2004 : * TS_execute_locations(). Rechecking the match with TS_execute(),
2005 : * below, will deal with any ensuing imprecision.
2006 : */
2007 622 : pose = -1;
2008 966 : foreach(lc, locations)
2009 : {
2010 466 : ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2011 466 : int first = -1;
2012 :
2013 792 : for (int i = 0; i < pdata->npos; i++)
2014 : {
2015 : /* For phrase matches, use the ending lexeme */
2016 670 : int endp = pdata->pos[i];
2017 :
2018 670 : if (endp >= pos)
2019 : {
2020 344 : first = endp;
2021 344 : break;
2022 : }
2023 : }
2024 466 : if (first < 0)
2025 122 : return false; /* no more matches for this term */
2026 344 : if (first > pose)
2027 326 : pose = first;
2028 : }
2029 :
2030 500 : if (pose < 0)
2031 246 : return false; /* we only get here if empty list */
2032 :
2033 : /*
2034 : * Now, for each AND'ed query term or phrase, find its last occurrence
2035 : * at or before pose; set posb to the minimum of those positions.
2036 : *
2037 : * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2038 : * posb + 1 below.
2039 : */
2040 254 : posb = INT_MAX - 1;
2041 586 : foreach(lc, locations)
2042 : {
2043 332 : ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2044 332 : int last = -1;
2045 :
2046 494 : for (int i = pdata->npos - 1; i >= 0; i--)
2047 : {
2048 : /* For phrase matches, use the starting lexeme */
2049 494 : int startp = pdata->pos[i] - pdata->width;
2050 :
2051 494 : if (startp <= pose)
2052 : {
2053 332 : last = startp;
2054 332 : break;
2055 : }
2056 : }
2057 332 : if (last < posb)
2058 272 : posb = last;
2059 : }
2060 :
2061 : /*
2062 : * We could end up with posb to the left of pos, in case some phrase
2063 : * match crosses pos. Try the match starting at pos anyway, since the
2064 : * result of TS_execute_locations is imprecise for phrase matches OR'd
2065 : * with plain matches; that is, if the query is "(A <-> B) | C" then C
2066 : * could match at pos even though the phrase match would have to
2067 : * extend to the left of pos.
2068 : */
2069 254 : posb = Max(posb, pos);
2070 :
2071 : /* This test probably always succeeds, but be paranoid */
2072 254 : if (posb <= pose)
2073 : {
2074 : /*
2075 : * posb .. pose is now the shortest, earliest-after-pos range of
2076 : * lexeme positions containing all the query terms. It will
2077 : * contain all phrase matches, too, except in the corner case
2078 : * described just above.
2079 : *
2080 : * Now convert these lexeme positions to indexes in prs->words[].
2081 : */
2082 254 : int idxb = -1;
2083 254 : int idxe = -1;
2084 :
2085 11624 : for (int i = 0; i < prs->curwords; i++)
2086 : {
2087 11496 : if (prs->words[i].item == NULL)
2088 10612 : continue;
2089 884 : if (idxb < 0 && prs->words[i].pos >= posb)
2090 254 : idxb = i;
2091 884 : if (prs->words[i].pos <= pose)
2092 758 : idxe = i;
2093 : else
2094 126 : break;
2095 : }
2096 :
2097 : /* This test probably always succeeds, but be paranoid */
2098 254 : if (idxb >= 0 && idxe >= idxb)
2099 : {
2100 : /*
2101 : * Finally, check that the selected range satisfies the query.
2102 : * This should succeed in all simple cases; but odd cases
2103 : * involving non-top-level NOT conditions or phrase matches
2104 : * OR'd with other things could fail, since the result of
2105 : * TS_execute_locations doesn't fully represent such things.
2106 : */
2107 : hlCheck ch;
2108 :
2109 254 : ch.words = &(prs->words[idxb]);
2110 254 : ch.len = idxe - idxb + 1;
2111 254 : if (TS_execute(GETQUERY(query), &ch,
2112 : TS_EXEC_EMPTY, checkcondition_HL))
2113 : {
2114 : /* Match! Advance *nextpos and return the word range. */
2115 194 : *nextpos = posb + 1;
2116 194 : *p = idxb;
2117 194 : *q = idxe;
2118 194 : return true;
2119 : }
2120 : }
2121 : }
2122 :
2123 : /*
2124 : * Advance pos and try again. Any later workable match must start
2125 : * beyond posb.
2126 : */
2127 60 : pos = posb + 1;
2128 : }
2129 : /* Can't get here, but stupider compilers complain if we leave it off */
2130 : return false;
2131 : }
2132 :
2133 : /*
2134 : * Apply suitable highlight marking to words selected by headline selector
2135 : *
2136 : * The words from startpos to endpos inclusive are marked per highlightall
2137 : */
2138 : static void
2139 386 : mark_fragment(HeadlineParsedText *prs, bool highlightall,
2140 : int startpos, int endpos)
2141 : {
2142 : int i;
2143 :
2144 5654 : for (i = startpos; i <= endpos; i++)
2145 : {
2146 5268 : if (prs->words[i].item)
2147 500 : prs->words[i].selected = 1;
2148 5268 : if (!highlightall)
2149 : {
2150 5022 : if (HLIDREPLACE(prs->words[i].type))
2151 0 : prs->words[i].replace = 1;
2152 5022 : else if (HLIDSKIP(prs->words[i].type))
2153 0 : prs->words[i].skip = 1;
2154 : }
2155 : else
2156 : {
2157 246 : if (XMLHLIDSKIP(prs->words[i].type))
2158 6 : prs->words[i].skip = 1;
2159 : }
2160 :
2161 5268 : prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2162 : }
2163 386 : }
2164 :
2165 : /*
2166 : * split a cover substring into fragments not longer than max_words
2167 : *
2168 : * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2169 : * substring. They are updated to hold the bounds of the next fragment.
2170 : *
2171 : * *curlen and *poslen are set to the fragment's length, in words and
2172 : * interesting words respectively.
2173 : */
2174 : static void
2175 36 : get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2176 : int *curlen, int *poslen, int max_words)
2177 : {
2178 : int i;
2179 :
2180 : /*
2181 : * Objective: select a fragment of words between startpos and endpos such
2182 : * that it has at most max_words and both ends have query words. If the
2183 : * startpos and endpos are the endpoints of the cover and the cover has
2184 : * fewer words than max_words, then this function should just return the
2185 : * cover
2186 : */
2187 : /* first move startpos to an item */
2188 888 : for (i = *startpos; i <= *endpos; i++)
2189 : {
2190 888 : *startpos = i;
2191 888 : if (INTERESTINGWORD(i))
2192 36 : break;
2193 : }
2194 : /* cut endpos to have only max_words */
2195 36 : *curlen = 0;
2196 36 : *poslen = 0;
2197 960 : for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2198 : {
2199 924 : if (!NONWORDTOKEN(prs->words[i].type))
2200 480 : *curlen += 1;
2201 924 : if (INTERESTINGWORD(i))
2202 54 : *poslen += 1;
2203 : }
2204 : /* if the cover was cut then move back endpos to a query item */
2205 36 : if (*endpos > i)
2206 : {
2207 12 : *endpos = i;
2208 840 : for (i = *endpos; i >= *startpos; i--)
2209 : {
2210 840 : *endpos = i;
2211 840 : if (INTERESTINGWORD(i))
2212 12 : break;
2213 828 : if (!NONWORDTOKEN(prs->words[i].type))
2214 408 : *curlen -= 1;
2215 : }
2216 : }
2217 36 : }
2218 :
2219 : /*
2220 : * Headline selector used when MaxFragments > 0
2221 : *
2222 : * Note: in this mode, highlightall is disregarded for phrase selection;
2223 : * it only controls presentation details.
2224 : */
2225 : static void
2226 30 : mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations,
2227 : bool highlightall,
2228 : int shortword, int min_words,
2229 : int max_words, int max_fragments)
2230 : {
2231 : int32 poslen,
2232 : curlen,
2233 : i,
2234 : f,
2235 30 : num_f = 0;
2236 : int32 stretch,
2237 : maxstretch,
2238 : posmarker;
2239 :
2240 30 : int32 startpos = 0,
2241 30 : endpos = 0,
2242 30 : nextpos = 0,
2243 30 : p = 0,
2244 30 : q = 0;
2245 :
2246 30 : int32 numcovers = 0,
2247 30 : maxcovers = 32;
2248 :
2249 : int32 minI,
2250 : minwords,
2251 : maxitems;
2252 : CoverPos *covers;
2253 :
2254 30 : covers = palloc(maxcovers * sizeof(CoverPos));
2255 :
2256 : /* get all covers */
2257 54 : while (hlCover(prs, query, locations, &nextpos, &p, &q))
2258 : {
2259 24 : startpos = p;
2260 24 : endpos = q;
2261 :
2262 : /*
2263 : * Break the cover into smaller fragments such that each fragment has
2264 : * at most max_words. Also ensure that each end of each fragment is a
2265 : * query word. This will allow us to stretch the fragment in either
2266 : * direction
2267 : */
2268 :
2269 60 : while (startpos <= endpos)
2270 : {
2271 36 : get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2272 36 : if (numcovers >= maxcovers)
2273 : {
2274 0 : maxcovers *= 2;
2275 0 : covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2276 : }
2277 36 : covers[numcovers].startpos = startpos;
2278 36 : covers[numcovers].endpos = endpos;
2279 36 : covers[numcovers].curlen = curlen;
2280 36 : covers[numcovers].poslen = poslen;
2281 36 : covers[numcovers].chosen = false;
2282 36 : covers[numcovers].excluded = false;
2283 36 : numcovers++;
2284 36 : startpos = endpos + 1;
2285 36 : endpos = q;
2286 : }
2287 : }
2288 :
2289 : /* choose best covers */
2290 66 : for (f = 0; f < max_fragments; f++)
2291 : {
2292 48 : maxitems = 0;
2293 48 : minwords = PG_INT32_MAX;
2294 48 : minI = -1;
2295 :
2296 : /*
2297 : * Choose the cover that contains max items. In case of tie choose the
2298 : * one with smaller number of words.
2299 : */
2300 114 : for (i = 0; i < numcovers; i++)
2301 : {
2302 66 : if (!covers[i].chosen && !covers[i].excluded &&
2303 48 : (maxitems < covers[i].poslen ||
2304 12 : (maxitems == covers[i].poslen &&
2305 12 : minwords > covers[i].curlen)))
2306 : {
2307 36 : maxitems = covers[i].poslen;
2308 36 : minwords = covers[i].curlen;
2309 36 : minI = i;
2310 : }
2311 : }
2312 : /* if a cover was found mark it */
2313 48 : if (minI >= 0)
2314 : {
2315 36 : covers[minI].chosen = true;
2316 : /* adjust the size of cover */
2317 36 : startpos = covers[minI].startpos;
2318 36 : endpos = covers[minI].endpos;
2319 36 : curlen = covers[minI].curlen;
2320 : /* stretch the cover if cover size is lower than max_words */
2321 36 : if (curlen < max_words)
2322 : {
2323 : /* divide the stretch on both sides of cover */
2324 36 : maxstretch = (max_words - curlen) / 2;
2325 :
2326 : /*
2327 : * first stretch the startpos stop stretching if 1. we hit the
2328 : * beginning of document 2. exceed maxstretch 3. we hit an
2329 : * already marked fragment
2330 : */
2331 36 : stretch = 0;
2332 36 : posmarker = startpos;
2333 600 : for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2334 : {
2335 564 : if (!NONWORDTOKEN(prs->words[i].type))
2336 : {
2337 270 : curlen++;
2338 270 : stretch++;
2339 : }
2340 564 : posmarker = i;
2341 : }
2342 : /* cut back startpos till we find a good endpoint */
2343 132 : for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2344 : {
2345 96 : if (!NONWORDTOKEN(prs->words[i].type))
2346 36 : curlen--;
2347 : }
2348 36 : startpos = i;
2349 : /* now stretch the endpos as much as possible */
2350 36 : posmarker = endpos;
2351 966 : for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2352 : {
2353 930 : if (!NONWORDTOKEN(prs->words[i].type))
2354 462 : curlen++;
2355 930 : posmarker = i;
2356 : }
2357 : /* cut back endpos till we find a good endpoint */
2358 90 : for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2359 : {
2360 54 : if (!NONWORDTOKEN(prs->words[i].type))
2361 24 : curlen--;
2362 : }
2363 36 : endpos = i;
2364 : }
2365 36 : covers[minI].startpos = startpos;
2366 36 : covers[minI].endpos = endpos;
2367 36 : covers[minI].curlen = curlen;
2368 : /* Mark the chosen fragments (covers) */
2369 36 : mark_fragment(prs, highlightall, startpos, endpos);
2370 36 : num_f++;
2371 : /* Exclude covers overlapping this one from future consideration */
2372 96 : for (i = 0; i < numcovers; i++)
2373 : {
2374 60 : if (i != minI &&
2375 24 : ((covers[i].startpos >= startpos &&
2376 12 : covers[i].startpos <= endpos) ||
2377 24 : (covers[i].endpos >= startpos &&
2378 12 : covers[i].endpos <= endpos) ||
2379 24 : (covers[i].startpos < startpos &&
2380 12 : covers[i].endpos > endpos)))
2381 0 : covers[i].excluded = true;
2382 : }
2383 : }
2384 : else
2385 12 : break; /* no selectable covers remain */
2386 : }
2387 :
2388 : /* show the first min_words words if we have not marked anything */
2389 30 : if (num_f <= 0)
2390 : {
2391 6 : startpos = curlen = 0;
2392 6 : endpos = -1;
2393 186 : for (i = 0; i < prs->curwords && curlen < min_words; i++)
2394 : {
2395 180 : if (!NONWORDTOKEN(prs->words[i].type))
2396 90 : curlen++;
2397 180 : endpos = i;
2398 : }
2399 6 : mark_fragment(prs, highlightall, startpos, endpos);
2400 : }
2401 :
2402 30 : pfree(covers);
2403 30 : }
2404 :
2405 : /*
2406 : * Headline selector used when MaxFragments == 0
2407 : */
2408 : static void
2409 344 : mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations,
2410 : bool highlightall,
2411 : int shortword, int min_words, int max_words)
2412 : {
2413 344 : int nextpos = 0,
2414 344 : p = 0,
2415 344 : q = 0;
2416 344 : int bestb = -1,
2417 344 : beste = -1;
2418 344 : int bestlen = -1;
2419 344 : bool bestcover = false;
2420 : int pose,
2421 : posb,
2422 : poslen,
2423 : curlen;
2424 : bool poscover;
2425 : int i;
2426 :
2427 344 : if (!highlightall)
2428 : {
2429 : /* examine all covers, select a headline using the best one */
2430 508 : while (hlCover(prs, query, locations, &nextpos, &p, &q))
2431 : {
2432 : /*
2433 : * Count words (curlen) and interesting words (poslen) within
2434 : * cover, but stop once we reach max_words. This step doesn't
2435 : * consider whether that's a good stopping point. posb and pose
2436 : * are set to the start and end indexes of the possible headline.
2437 : */
2438 170 : curlen = 0;
2439 170 : poslen = 0;
2440 170 : posb = pose = p;
2441 1456 : for (i = p; i <= q && curlen < max_words; i++)
2442 : {
2443 1286 : if (!NONWORDTOKEN(prs->words[i].type))
2444 728 : curlen++;
2445 1286 : if (INTERESTINGWORD(i))
2446 290 : poslen++;
2447 1286 : pose = i;
2448 : }
2449 :
2450 170 : if (curlen < max_words)
2451 : {
2452 : /*
2453 : * We have room to lengthen the headline, so search forward
2454 : * until it's full or we find a good stopping point. We'll
2455 : * reconsider the word at "q", then move forward.
2456 : */
2457 2938 : for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2458 : {
2459 2912 : if (i > q)
2460 : {
2461 2754 : if (!NONWORDTOKEN(prs->words[i].type))
2462 1374 : curlen++;
2463 2754 : if (INTERESTINGWORD(i))
2464 120 : poslen++;
2465 : }
2466 2912 : pose = i;
2467 2912 : if (BADENDPOINT(i))
2468 1944 : continue;
2469 968 : if (curlen >= min_words)
2470 132 : break;
2471 : }
2472 158 : if (curlen < min_words)
2473 : {
2474 : /*
2475 : * Reached end of text and our headline is still shorter
2476 : * than min_words, so try to extend it to the left.
2477 : */
2478 366 : for (i = p - 1; i >= 0; i--)
2479 : {
2480 364 : if (!NONWORDTOKEN(prs->words[i].type))
2481 182 : curlen++;
2482 364 : if (INTERESTINGWORD(i))
2483 6 : poslen++;
2484 364 : if (curlen >= max_words)
2485 0 : break;
2486 364 : if (BADENDPOINT(i))
2487 236 : continue;
2488 128 : if (curlen >= min_words)
2489 24 : break;
2490 : }
2491 26 : posb = (i >= 0) ? i : 0;
2492 : }
2493 : }
2494 : else
2495 : {
2496 : /*
2497 : * Can't make headline longer, so consider making it shorter
2498 : * if needed to avoid a bad endpoint.
2499 : */
2500 12 : if (i > q)
2501 6 : i = q;
2502 30 : for (; curlen > min_words; i--)
2503 : {
2504 30 : if (!BADENDPOINT(i))
2505 : break;
2506 18 : if (!NONWORDTOKEN(prs->words[i].type))
2507 6 : curlen--;
2508 18 : if (INTERESTINGWORD(i))
2509 0 : poslen--;
2510 18 : pose = i - 1;
2511 : }
2512 : }
2513 :
2514 : /*
2515 : * Check whether the proposed headline includes the original
2516 : * cover; it might not if we trimmed it due to max_words.
2517 : */
2518 170 : poscover = (posb <= p && pose >= q);
2519 :
2520 : /*
2521 : * Adopt this headline if it's better than the last one, giving
2522 : * highest priority to headlines including the cover, then to
2523 : * headlines with more interesting words, then to headlines with
2524 : * good stopping points. (Since bestlen is initially -1, we will
2525 : * certainly adopt the first headline.)
2526 : */
2527 170 : if (poscover > bestcover ||
2528 78 : (poscover == bestcover && poslen > bestlen) ||
2529 72 : (poscover == bestcover && poslen == bestlen &&
2530 12 : !BADENDPOINT(pose) && BADENDPOINT(beste)))
2531 : {
2532 98 : bestb = posb;
2533 98 : beste = pose;
2534 98 : bestlen = poslen;
2535 98 : bestcover = poscover;
2536 : }
2537 : }
2538 :
2539 : /*
2540 : * If we found nothing acceptable, select min_words words starting at
2541 : * the beginning.
2542 : */
2543 338 : if (bestlen < 0)
2544 : {
2545 240 : curlen = 0;
2546 240 : pose = -1;
2547 1038 : for (i = 0; i < prs->curwords && curlen < min_words; i++)
2548 : {
2549 798 : if (!NONWORDTOKEN(prs->words[i].type))
2550 516 : curlen++;
2551 798 : pose = i;
2552 : }
2553 240 : bestb = 0;
2554 240 : beste = pose;
2555 : }
2556 : }
2557 : else
2558 : {
2559 : /* highlightall mode: headline is whole document */
2560 6 : bestb = 0;
2561 6 : beste = prs->curwords - 1;
2562 : }
2563 :
2564 344 : mark_fragment(prs, highlightall, bestb, beste);
2565 344 : }
2566 :
2567 : /*
2568 : * Default parser's prsheadline function
2569 : */
2570 : Datum
2571 374 : prsd_headline(PG_FUNCTION_ARGS)
2572 : {
2573 374 : HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2574 374 : List *prsoptions = (List *) PG_GETARG_POINTER(1);
2575 374 : TSQuery query = PG_GETARG_TSQUERY(2);
2576 : List *locations;
2577 :
2578 : /* default option values: */
2579 374 : int min_words = 15;
2580 374 : int max_words = 35;
2581 374 : int shortword = 3;
2582 374 : int max_fragments = 0;
2583 374 : bool highlightall = false;
2584 : ListCell *l;
2585 :
2586 : /* Extract configuration option values */
2587 374 : prs->startsel = NULL;
2588 374 : prs->stopsel = NULL;
2589 374 : prs->fragdelim = NULL;
2590 728 : foreach(l, prsoptions)
2591 : {
2592 354 : DefElem *defel = (DefElem *) lfirst(l);
2593 354 : char *val = defGetString(defel);
2594 :
2595 354 : if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2596 36 : max_words = pg_strtoint32(val);
2597 318 : else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2598 36 : min_words = pg_strtoint32(val);
2599 282 : else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2600 0 : shortword = pg_strtoint32(val);
2601 282 : else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2602 30 : max_fragments = pg_strtoint32(val);
2603 252 : else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2604 120 : prs->startsel = pstrdup(val);
2605 132 : else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2606 120 : prs->stopsel = pstrdup(val);
2607 12 : else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2608 6 : prs->fragdelim = pstrdup(val);
2609 6 : else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2610 18 : highlightall = (pg_strcasecmp(val, "1") == 0 ||
2611 12 : pg_strcasecmp(val, "on") == 0 ||
2612 6 : pg_strcasecmp(val, "true") == 0 ||
2613 0 : pg_strcasecmp(val, "t") == 0 ||
2614 12 : pg_strcasecmp(val, "y") == 0 ||
2615 0 : pg_strcasecmp(val, "yes") == 0);
2616 : else
2617 0 : ereport(ERROR,
2618 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2619 : errmsg("unrecognized headline parameter: \"%s\"",
2620 : defel->defname)));
2621 : }
2622 :
2623 : /* in HighlightAll mode these parameters are ignored */
2624 374 : if (!highlightall)
2625 : {
2626 368 : if (min_words >= max_words)
2627 0 : ereport(ERROR,
2628 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2629 : errmsg("%s must be less than %s", "MinWords", "MaxWords")));
2630 368 : if (min_words <= 0)
2631 0 : ereport(ERROR,
2632 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2633 : errmsg("%s must be positive", "MinWords")));
2634 368 : if (shortword < 0)
2635 0 : ereport(ERROR,
2636 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2637 : errmsg("%s must be >= 0", "ShortWord")));
2638 368 : if (max_fragments < 0)
2639 0 : ereport(ERROR,
2640 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2641 : errmsg("%s must be >= 0", "MaxFragments")));
2642 : }
2643 :
2644 : /* Locate words and phrases matching the query */
2645 374 : if (query->size > 0)
2646 : {
2647 : hlCheck ch;
2648 :
2649 362 : ch.words = prs->words;
2650 362 : ch.len = prs->curwords;
2651 362 : locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
2652 : checkcondition_HL);
2653 : }
2654 : else
2655 12 : locations = NIL; /* empty query matches nothing */
2656 :
2657 : /* Apply appropriate headline selector */
2658 374 : if (max_fragments == 0)
2659 344 : mark_hl_words(prs, query, locations, highlightall, shortword,
2660 : min_words, max_words);
2661 : else
2662 30 : mark_hl_fragments(prs, query, locations, highlightall, shortword,
2663 : min_words, max_words, max_fragments);
2664 :
2665 : /* Fill in default values for string options */
2666 374 : if (!prs->startsel)
2667 254 : prs->startsel = pstrdup("<b>");
2668 374 : if (!prs->stopsel)
2669 254 : prs->stopsel = pstrdup("</b>");
2670 374 : if (!prs->fragdelim)
2671 368 : prs->fragdelim = pstrdup(" ... ");
2672 :
2673 : /* Caller will need these lengths, too */
2674 374 : prs->startsellen = strlen(prs->startsel);
2675 374 : prs->stopsellen = strlen(prs->stopsel);
2676 374 : prs->fragdelimlen = strlen(prs->fragdelim);
2677 :
2678 374 : PG_RETURN_POINTER(prs);
2679 : }
|