LCOV - code coverage report
Current view: top level - src/backend/tsearch - wparser_def.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 537 602 89.2 %
Date: 2025-11-22 06:18:03 Functions: 37 52 71.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * wparser_def.c
       4             :  *      Default text search parser
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/tsearch/wparser_def.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
      15             : #include "postgres.h"
      16             : 
      17             : #include <limits.h>
      18             : #include <wctype.h>
      19             : 
      20             : #include "commands/defrem.h"
      21             : #include "mb/pg_wchar.h"
      22             : #include "miscadmin.h"
      23             : #include "tsearch/ts_public.h"
      24             : #include "tsearch/ts_type.h"
      25             : #include "tsearch/ts_utils.h"
      26             : #include "utils/builtins.h"
      27             : #include "utils/pg_locale.h"
      28             : 
      29             : 
      30             : /* Define me to enable tracing of parser behavior */
      31             : /* #define WPARSER_TRACE */
      32             : 
      33             : 
      34             : /* Output token categories */
                       /*
                        * Each number is also the index of the category's short name in
                        * tok_alias[] and of its description in lex_descr[]; index 0 of both
                        * arrays is an empty placeholder.  LASTNUM is the highest category number.
                        */
      35             : 
      36             : #define ASCIIWORD       1
      37             : #define WORD_T          2
      38             : #define NUMWORD         3
      39             : #define EMAIL           4
      40             : #define URL_T           5
      41             : #define HOST            6
      42             : #define SCIENTIFIC      7
      43             : #define VERSIONNUMBER   8
      44             : #define NUMPARTHWORD    9
      45             : #define PARTHWORD       10
      46             : #define ASCIIPARTHWORD  11
      47             : #define SPACE           12
      48             : #define TAG_T           13
      49             : #define PROTOCOL        14
      50             : #define NUMHWORD        15
      51             : #define ASCIIHWORD      16
      52             : #define HWORD           17
      53             : #define URLPATH         18
      54             : #define FILEPATH        19
      55             : #define DECIMAL_T       20
      56             : #define SIGNEDINT       21
      57             : #define UNSIGNEDINT     22
      58             : #define XMLENTITY       23
      59             : 
      60             : #define LASTNUM         23
      61             : 
                       /*
                        * Short names of the token categories, in category-number order
                        * (tok_alias[ASCIIWORD] is "asciiword", ..., tok_alias[XMLENTITY] is
                        * "entity").  Entry 0 is an empty placeholder because category numbers
                        * start at 1.
                        */
      62             : static const char *const tok_alias[] = {
      63             :     "",
      64             :     "asciiword",
      65             :     "word",
      66             :     "numword",
      67             :     "email",
      68             :     "url",
      69             :     "host",
      70             :     "sfloat",
      71             :     "version",
      72             :     "hword_numpart",
      73             :     "hword_part",
      74             :     "hword_asciipart",
      75             :     "blank",
      76             :     "tag",
      77             :     "protocol",
      78             :     "numhword",
      79             :     "asciihword",
      80             :     "hword",
      81             :     "url_path",
      82             :     "file",
      83             :     "float",
      84             :     "int",
      85             :     "uint",
      86             :     "entity"
      87             : };
      88             : 
                       /*
                        * Human-readable descriptions of the token categories, in the same
                        * category-number order as tok_alias[].  Entry 0 is an empty placeholder.
                        */
      89             : static const char *const lex_descr[] = {
      90             :     "",
      91             :     "Word, all ASCII",
      92             :     "Word, all letters",
      93             :     "Word, letters and digits",
      94             :     "Email address",
      95             :     "URL",
      96             :     "Host",
      97             :     "Scientific notation",
      98             :     "Version number",
      99             :     "Hyphenated word part, letters and digits",
     100             :     "Hyphenated word part, all letters",
     101             :     "Hyphenated word part, all ASCII",
     102             :     "Space symbols",
     103             :     "XML tag",
     104             :     "Protocol head",
     105             :     "Hyphenated word, letters and digits",
     106             :     "Hyphenated word, all ASCII",
     107             :     "Hyphenated word, all letters",
     108             :     "URL path",
     109             :     "File or path name",
     110             :     "Decimal notation",
     111             :     "Signed integer",
     112             :     "Unsigned integer",
     113             :     "XML entity"
     114             : };
     115             : 
     116             : 
     117             : /* Parser states */
                       /*
                        * TPS_Base (0) is the state a parser starts in (see TParserInit and
                        * TParserCopyInit).  TPS_Null is not a real state; it only marks the end
                        * of the list.
                        */
     118             : 
     119             : typedef enum
     120             : {
     121             :     TPS_Base = 0,
     122             :     TPS_InNumWord,
     123             :     TPS_InAsciiWord,
     124             :     TPS_InWord,
     125             :     TPS_InUnsignedInt,
     126             :     TPS_InSignedIntFirst,
     127             :     TPS_InSignedInt,
     128             :     TPS_InSpace,
     129             :     TPS_InUDecimalFirst,
     130             :     TPS_InUDecimal,
     131             :     TPS_InDecimalFirst,
     132             :     TPS_InDecimal,
     133             :     TPS_InVerVersion,
     134             :     TPS_InSVerVersion,
     135             :     TPS_InVersionFirst,
     136             :     TPS_InVersion,
     137             :     TPS_InMantissaFirst,
     138             :     TPS_InMantissaSign,
     139             :     TPS_InMantissa,
     140             :     TPS_InXMLEntityFirst,
     141             :     TPS_InXMLEntity,
     142             :     TPS_InXMLEntityNumFirst,
     143             :     TPS_InXMLEntityNum,
     144             :     TPS_InXMLEntityHexNumFirst,
     145             :     TPS_InXMLEntityHexNum,
     146             :     TPS_InXMLEntityEnd,
     147             :     TPS_InTagFirst,
     148             :     TPS_InXMLBegin,
     149             :     TPS_InTagCloseFirst,
     150             :     TPS_InTagName,
     151             :     TPS_InTagBeginEnd,
     152             :     TPS_InTag,
     153             :     TPS_InTagEscapeK,
     154             :     TPS_InTagEscapeKK,
     155             :     TPS_InTagBackSleshed,
     156             :     TPS_InTagEnd,
     157             :     TPS_InCommentFirst,
     158             :     TPS_InCommentLast,
     159             :     TPS_InComment,
     160             :     TPS_InCloseCommentFirst,
     161             :     TPS_InCloseCommentLast,
     162             :     TPS_InCommentEnd,
     163             :     TPS_InHostFirstDomain,
     164             :     TPS_InHostDomainSecond,
     165             :     TPS_InHostDomain,
     166             :     TPS_InPortFirst,
     167             :     TPS_InPort,
     168             :     TPS_InHostFirstAN,
     169             :     TPS_InHost,
     170             :     TPS_InEmail,
     171             :     TPS_InFileFirst,
     172             :     TPS_InFileTwiddle,
     173             :     TPS_InPathFirst,
     174             :     TPS_InPathFirstFirst,
     175             :     TPS_InPathSecond,
     176             :     TPS_InFile,
     177             :     TPS_InFileNext,
     178             :     TPS_InURLPathFirst,
     179             :     TPS_InURLPathStart,
     180             :     TPS_InURLPath,
     181             :     TPS_InFURL,
     182             :     TPS_InProtocolFirst,
     183             :     TPS_InProtocolSecond,
     184             :     TPS_InProtocolEnd,
     185             :     TPS_InHyphenAsciiWordFirst,
     186             :     TPS_InHyphenAsciiWord,
     187             :     TPS_InHyphenWordFirst,
     188             :     TPS_InHyphenWord,
     189             :     TPS_InHyphenNumWordFirst,
     190             :     TPS_InHyphenNumWord,
     191             :     TPS_InHyphenDigitLookahead,
     192             :     TPS_InParseHyphen,
     193             :     TPS_InParseHyphenHyphen,
     194             :     TPS_InHyphenWordPart,
     195             :     TPS_InHyphenAsciiWordPart,
     196             :     TPS_InHyphenNumWordPart,
     197             :     TPS_InHyphenUnsignedInt,
     198             :     TPS_Null                    /* last state (fake value) */
     199             : } TParserState;
     200             : 
     201             : /* forward declaration */
     202             : struct TParser;
     203             : 
                       /*
                        * TParserCharTest is the signature shared by the p_is* character tests:
                        * they take only the parser and return an int.  p_iseq() needs an extra
                        * char argument and so does not fit; p_iseqC() wraps it using prs->c.
                        * TParserSpecial is the signature of the Special* handlers.
                        */
     204             : typedef int (*TParserCharTest) (struct TParser *);  /* any p_is* functions
     205             :                                                      * except p_iseq */
     206             : typedef void (*TParserSpecial) (struct TParser *);  /* special handler for
     207             :                                                      * special cases... */
     208             : 
                       /*
                        * One entry describing a character test and what goes with it.
                        *
                        * NOTE(review): the code that walks these entries is not shown next to
                        * this definition; the per-field notes marked "presumably" should be
                        * confirmed against TParserGet().
                        */
     209             : typedef struct
     210             : {
     211             :     TParserCharTest isclass;    /* character test for this entry */
     212             :     char        c;              /* presumably loaded into TParser.c for p_iseqC/p_isneC -- confirm */
     213             :     uint16      flags;          /* OR of the A_* flag bits */
     214             :     TParserState tostate;       /* presumably the state to switch to -- confirm */
     215             :     int         type;           /* presumably an output token category, or 0 -- confirm */
     216             :     TParserSpecial special;     /* optional Special* handler */
     217             : } TParserStateActionItem;
     218             : 
     219             : /* Flag bits in TParserStateActionItem.flags */
                       /*
                        * Every value except A_NEXT is a distinct single bit, so they can be
                        * OR'ed together.  A_NEXT is zero: it stands for "no flag set" rather
                        * than for a bit of its own.
                        */
     220             : #define A_NEXT      0x0000
     221             : #define A_BINGO     0x0001
     222             : #define A_POP       0x0002
     223             : #define A_PUSH      0x0004
     224             : #define A_RERUN     0x0008
     225             : #define A_CLEAR     0x0010
     226             : #define A_MERGE     0x0020
     227             : #define A_CLRALL    0x0040
     228             : 
                       /*
                        * Position and state of the parser at one point in the input.
                        *
                        * Entries are chained through prev: newTParserPosition() creates a new
                        * entry on top of an existing one, and TParserClose()/TParserCopyClose()
                        * free the chain starting from the top.
                        */
     229             : typedef struct TParserPosition
     230             : {
     231             :     int         posbyte;        /* position of parser in bytes */
     232             :     int         poschar;        /* position of parser in characters */
     233             :     int         charlen;        /* length of current char */
     234             :     int         lenbytetoken;   /* length of token-so-far in bytes */
     235             :     int         lenchartoken;   /* and in chars */
     236             :     TParserState state;
     237             :     struct TParserPosition *prev;   /* entry below this one, or NULL */
     238             :     const TParserStateActionItem *pushedAtAction;   /* reset to NULL by newTParserPosition() */
     239             : } TParserPosition;
     240             : 
                       /*
                        * A parser instance: the input text, the current position chain, a few
                        * mode flags, and the most recently produced token.
                        */
     241             : typedef struct TParser
     242             : {
     243             :     /* string and position information */
     244             :     char       *str;            /* multibyte string */
     245             :     int         lenstr;         /* length of str in bytes */
     246             :     pg_wchar   *pgwstr;         /* str as pg_wchars, indexed by poschar */
     247             : 
     248             :     /* State of parse */
     249             :     int         charmaxlen;     /* pg_database_encoding_max_length() */
     250             :     TParserPosition *state;     /* top of the position chain */
     251             :     bool        ignore;         /* toggled by SpecialTags, read by p_isignore */
     252             :     bool        wanthost;       /* set by SpecialFURL/p_ishost, consumed by p_isstophost */
     253             : 
     254             :     /* character that p_iseqC() and p_isneC() compare against */
     255             :     char        c;
     256             : 
     257             :     /* out */
     258             :     char       *token;
     259             :     int         lenbytetoken;
     260             :     int         lenchartoken;
     261             :     int         type;           /* token category, e.g. HOST or URLPATH */
     262             : } TParser;
     263             : 
     264             : 
     265             : /* forward decls here */
                       /* TParserGet() is called by p_ishost() and p_isURLPath() before it is defined. */
     266             : static bool TParserGet(TParser *prs);
     267             : 
     268             : 
                       /*
                        * Allocate a new TParserPosition with palloc().
                        *
                        * If prev is non-NULL the new entry starts as a byte-for-byte copy of it;
                        * otherwise it is zero-filled.  In both cases prev becomes the new
                        * entry's predecessor and pushedAtAction is reset to NULL.
                        *
                        * Returns the new entry.
                        */
     269             : static TParserPosition *
     270       10232 : newTParserPosition(TParserPosition *prev)
     271             : {
     272       10232 :     TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
     273             : 
     274       10232 :     if (prev)
     275        5238 :         memcpy(res, prev, sizeof(TParserPosition));
     276             :     else
     277        4994 :         memset(res, 0, sizeof(TParserPosition));
     278             : 
                           /* must come after the memcpy, which copied prev's own link */
     279       10232 :     res->prev = prev;
     280             : 
     281       10232 :     res->pushedAtAction = NULL;
     282             : 
     283       10232 :     return res;
     284             : }
     285             : 
                       /*
                        * Create a parser over the first len bytes of str.
                        *
                        * str is referenced, not copied.  A pg_wchar version of it is produced
                        * by pg_mb2wchar_with_len() into a newly palloc'd array of len + 1
                        * elements.  The parser starts with a single position entry in state
                        * TPS_Base.  The result is meant to be released with TParserClose().
                        */
     286             : static TParser *
     287        4754 : TParserInit(char *str, int len)
     288             : {
     289        4754 :     TParser    *prs = (TParser *) palloc0(sizeof(TParser));
     290             : 
     291        4754 :     prs->charmaxlen = pg_database_encoding_max_length();
     292        4754 :     prs->str = str;
     293        4754 :     prs->lenstr = len;
     294        4754 :     prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
     295        4754 :     pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
     296             : 
     297        4754 :     prs->state = newTParserPosition(NULL);
     298        4754 :     prs->state->state = TPS_Base;
     299             : 
     300             : #ifdef WPARSER_TRACE
     301             :     fprintf(stderr, "parsing \"%.*s\"\n", len, str);
     302             : #endif
     303             : 
     304        4754 :     return prs;
     305             : }
     306             : 
     307             : /*
     308             :  * As an alternative to a full TParserInit one can create a
     309             :  * TParserCopy which basically is a regular TParser without a private
     310             :  * copy of the string - instead it uses the one from another TParser.
     311             :  * This is useful because at some places TParsers are created
     312             :  * recursively and the repeated copying around of the strings can
     313             :  * cause major inefficiency if the source string is long.
     314             :  * The new parser starts parsing at the original's current position.
     315             :  *
     316             :  * Obviously one must not close the original TParser before the copy.
                        *
                        * The copy's str and pgwstr point into the original's buffers, offset by
                        * the original's current byte and character positions.  Release the copy
                        * with TParserCopyClose(), which leaves pgwstr alone, rather than with
                        * TParserClose(), which would pfree it.
     317             :  */
     318             : static TParser *
     319         240 : TParserCopyInit(const TParser *orig)
     320             : {
     321         240 :     TParser    *prs = (TParser *) palloc0(sizeof(TParser));
     322             : 
     323         240 :     prs->charmaxlen = orig->charmaxlen;
     324         240 :     prs->str = orig->str + orig->state->posbyte;
     325         240 :     prs->lenstr = orig->lenstr - orig->state->posbyte;
     326             : 
                           /* if orig has no wide string, pgwstr stays NULL from palloc0 */
     327         240 :     if (orig->pgwstr)
     328         240 :         prs->pgwstr = orig->pgwstr + orig->state->poschar;
     329             : 
     330         240 :     prs->state = newTParserPosition(NULL);
     331         240 :     prs->state->state = TPS_Base;
     332             : 
     333             : #ifdef WPARSER_TRACE
     334             :     fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
     335             : #endif
     336             : 
     337         240 :     return prs;
     338             : }
     339             : 
     340             : 
                       /*
                        * Close a parser created with TParserInit.
                        *
                        * Frees every TParserPosition on the state chain, the pgwstr array if
                        * there is one, and finally the TParser itself.  The str buffer is not
                        * freed.
                        */
     341             : static void
     342        4754 : TParserClose(TParser *prs)
     343             : {
     344        9508 :     while (prs->state)
     345             :     {
     346        4754 :         TParserPosition *ptr = prs->state->prev;
     347             : 
     348        4754 :         pfree(prs->state);
     349        4754 :         prs->state = ptr;
     350             :     }
     351             : 
     352        4754 :     if (prs->pgwstr)
     353        4754 :         pfree(prs->pgwstr);
     354             : 
     355             : #ifdef WPARSER_TRACE
     356             :     fprintf(stderr, "closing parser\n");
     357             : #endif
     358        4754 :     pfree(prs);
     359        4754 : }
     360             : 
     361             : /*
     362             :  * Close a parser created with TParserCopyInit
                        *
                        * Frees the state chain and the TParser itself.  pgwstr is deliberately
                        * not freed: in a copy it points into the original parser's array.
     363             :  */
     364             : static void
     365         240 : TParserCopyClose(TParser *prs)
     366             : {
     367         612 :     while (prs->state)
     368             :     {
     369         372 :         TParserPosition *ptr = prs->state->prev;
     370             : 
     371         372 :         pfree(prs->state);
     372         372 :         prs->state = ptr;
     373             :     }
     374             : 
     375             : #ifdef WPARSER_TRACE
     376             :     fprintf(stderr, "closing parser copy\n");
     377             : #endif
     378         240 :     pfree(prs);
     379         240 : }
     380             : 
     381             : 
     382             : /*
     383             :  * Character-type support functions using the database default locale. If the
     384             :  * encoding is multibyte, the locale's ctype is C, and the input character is
     385             :  * non-ascii, the value returned is the 'nonascii' macro argument.
                        *
                        * p_iswhat(type, nonascii) defines two functions: p_is<type>(), which
                        * classifies the wide character at the parser's current position with
                        * pg_isw<type>(), and p_isnot<type>(), its logical negation.  The locale
                        * is fetched once per call and used both for the C-ctype check and for
                        * the classification itself.
     386             :  */
     387             : 
     388             : #define p_iswhat(type, nonascii)                                            \
     389             :                                                                             \
     390             : static int                                                                  \
     391             : p_is##type(TParser *prs)                                                    \
     392             : {                                                                           \
     393             :     pg_locale_t locale = pg_database_locale();                              \
     394             :     pg_wchar    wc;                                                         \
     395             :     Assert(prs->state);                                                      \
     396             :     wc = prs->pgwstr[prs->state->poschar];                                 \
     397             :     if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f)             \
     398             :         return nonascii;                                                    \
     399             :     return pg_isw##type(wc, locale);                                        \
     400             : }                                                                           \
     401             :                                                                             \
     402             : static int                                                                  \
     403             : p_isnot##type(TParser *prs)                                                 \
     404             : {                                                                           \
     405             :     return !p_is##type(prs);                                                \
     406             : }
     407             : 
     408             : /*
     409             :  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
     410             :  * an alpha character, but not a member of other char classes.
                        *
                        * That is what the second argument encodes: 1 for alnum and alpha, 0 for
                        * every other class.  Each line below defines a p_is<class>() and a
                        * p_isnot<class>() function.
     411             :  */
     412       25122 : p_iswhat(alnum, 1)
     413       93856 : p_iswhat(alpha, 1)
     414       37132 : p_iswhat(digit, 0)
     415           0 : p_iswhat(lower, 0)
     416           0 : p_iswhat(print, 0)
     417           0 : p_iswhat(punct, 0)
     418         678 : p_iswhat(space, 0)
     419           0 : p_iswhat(upper, 0)
     420          18 : p_iswhat(xdigit, 0)
     421             : 
     422             : /* p_iseq should be used only for ascii symbols */
                       /*
                        * Returns 1 if the current character is exactly one byte long and that
                        * byte equals c, else 0.
                        */
     423             : 
     424             : static int
     425      231392 : p_iseq(TParser *prs, char c)
     426             : {
     427             :     Assert(prs->state);
     428      231392 :     return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
     429             : }
     430             : 
                       /*
                        * Returns 1 at end of input, i.e. when the byte position has reached
                        * lenstr or the current character length is 0; otherwise 0.
                        */
     431             : static int
     432      100158 : p_isEOF(TParser *prs)
     433             : {
     434             :     Assert(prs->state);
     435      100158 :     return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
     436             : }
     437             : 
                       /* Parser-only form of p_iseq(): compares the current character with prs->c. */
     438             : static int
     439      231392 : p_iseqC(TParser *prs)
     440             : {
     441      231392 :     return p_iseq(prs, prs->c);
     442             : }
     443             : 
                       /* Negation of p_iseqC(): nonzero when the current character is not prs->c. */
     444             : static int
     445           0 : p_isneC(TParser *prs)
     446             : {
     447           0 :     return !p_iseq(prs, prs->c);
     448             : }
     449             : 
                       /*
                        * Returns 1 if the current character is one byte long and that byte
                        * satisfies isascii(), else 0.
                        */
     450             : static int
     451       73544 : p_isascii(TParser *prs)
     452             : {
     453       73544 :     return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
     454             : }
     455             : 
                       /* Returns 1 if the current character is both ASCII and alphabetic, else 0. */
     456             : static int
     457       73544 : p_isasclet(TParser *prs)
     458             : {
     459       73544 :     return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
     460             : }
     461             : 
                       /*
                        * Returns 1 if the current character is acceptable inside a URL, else 0.
                        *
                        * Acceptable means: a single-byte character in the range 0x21..0x7E that
                        * is not one of the punctuation characters listed in the switch below.
                        */
     462             : static int
     463        2658 : p_isurlchar(TParser *prs)
     464             : {
     465             :     char        ch;
     466             : 
     467             :     /* no non-ASCII need apply: multibyte characters never qualify */
     468        2658 :     if (prs->state->charlen != 1)
     469           0 :         return 0;
     470        2658 :     ch = *(prs->str + prs->state->posbyte);
     471             :     /* no spaces or control characters; DEL and single-byte non-ASCII fail here too */
     472        2658 :     if (ch <= 0x20 || ch >= 0x7F)
     473         234 :         return 0;
     474             :     /* reject characters disallowed by RFC 3986 */
     475        2424 :     switch (ch)
     476             :     {
     477          24 :         case '"':
     478             :         case '<':
     479             :         case '>':
     480             :         case '\\':
     481             :         case '^':
     482             :         case '`':
     483             :         case '{':
     484             :         case '|':
     485             :         case '}':
     486          24 :             return 0;
     487             :     }
     488        2400 :     return 1;
     489             : }
     490             : 
     491             : 
     492             : /* deliberately suppress unused-function complaints for the above */
                       /*
                        * The only purpose of this function is to reference every generated
                        * p_is*/p_isnot* routine plus p_isEOF, p_iseqC and p_isneC.  It passes
                        * NULL parsers, which those routines would dereference, so it must never
                        * actually be called.
                        */
     493             : void        _make_compiler_happy(void);
     494             : void
     495           0 : _make_compiler_happy(void)
     496             : {
     497           0 :     p_isalnum(NULL);
     498           0 :     p_isnotalnum(NULL);
     499           0 :     p_isalpha(NULL);
     500           0 :     p_isnotalpha(NULL);
     501           0 :     p_isdigit(NULL);
     502           0 :     p_isnotdigit(NULL);
     503           0 :     p_islower(NULL);
     504           0 :     p_isnotlower(NULL);
     505           0 :     p_isprint(NULL);
     506           0 :     p_isnotprint(NULL);
     507           0 :     p_ispunct(NULL);
     508           0 :     p_isnotpunct(NULL);
     509           0 :     p_isspace(NULL);
     510           0 :     p_isnotspace(NULL);
     511           0 :     p_isupper(NULL);
     512           0 :     p_isnotupper(NULL);
     513           0 :     p_isxdigit(NULL);
     514           0 :     p_isnotxdigit(NULL);
     515           0 :     p_isEOF(NULL);
     516           0 :     p_iseqC(NULL);
     517           0 :     p_isneC(NULL);
     518           0 : }
     519             : 
     520             : 
                       /*
                        * Switch the parser's "ignore" flag on or off at script/style tags.
                        *
                        * The start of prs->token is compared case-insensitively, with the
                        * comparison chosen by the token-so-far length in characters:
                        * "<script" and "<style" set prs->ignore, "</script" and "</style"
                        * clear it.  Any other token leaves the flag unchanged.
                        */
     521             : static void
     522         252 : SpecialTags(TParser *prs)
     523             : {
     524         252 :     switch (prs->state->lenchartoken)
     525             :     {
     526           6 :         case 8:                 /* </script */
     527           6 :             if (pg_strncasecmp(prs->token, "</script", 8) == 0)
     528           6 :                 prs->ignore = false;
     529           6 :             break;
     530          24 :         case 7:                 /* <script || </style */
     531          24 :             if (pg_strncasecmp(prs->token, "</style", 7) == 0)
     532           0 :                 prs->ignore = false;
     533          24 :             else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
     534           6 :                 prs->ignore = true;
     535          24 :             break;
     536          18 :         case 6:                 /* <style */
     537          18 :             if (pg_strncasecmp(prs->token, "<style", 6) == 0)
     538           0 :                 prs->ignore = true;
     539          18 :             break;
     540         204 :         default:
     541         204 :             break;
     542             :     }
     543         252 : }
     544             : 
                       /*
                        * Set prs->wanthost and move the current position back by the length of
                        * the token so far (in both bytes and characters), i.e. back to where
                        * the token started.  The token lengths themselves are left unchanged.
                        */
     545             : static void
     546         132 : SpecialFURL(TParser *prs)
     547             : {
     548         132 :     prs->wanthost = true;
     549         132 :     prs->state->posbyte -= prs->state->lenbytetoken;
     550         132 :     prs->state->poschar -= prs->state->lenchartoken;
     551         132 : }
     552             : 
                       /*
                        * Move the current position back by the length of the token so far (in
                        * both bytes and characters).  The token lengths are left unchanged.
                        */
     553             : static void
     554          36 : SpecialHyphen(TParser *prs)
     555             : {
     556          36 :     prs->state->posbyte -= prs->state->lenbytetoken;
     557          36 :     prs->state->poschar -= prs->state->lenchartoken;
     558          36 : }
     559             : 
                       /*
                        * Move the current position back by the length of the token so far and
                        * then reset the token-so-far lengths (bytes and characters) to zero.
                        */
     560             : static void
     561           0 : SpecialVerVersion(TParser *prs)
     562             : {
     563           0 :     prs->state->posbyte -= prs->state->lenbytetoken;
     564           0 :     prs->state->poschar -= prs->state->lenchartoken;
     565           0 :     prs->state->lenbytetoken = 0;
     566           0 :     prs->state->lenchartoken = 0;
     567           0 : }
     568             : 
                       /*
                        * One-shot test of prs->wanthost: if the flag is set, clear it and
                        * return 1; otherwise return 0.  Note that this test has a side effect.
                        */
     569             : static int
     570         480 : p_isstophost(TParser *prs)
     571             : {
     572         480 :     if (prs->wanthost)
     573             :     {
     574         204 :         prs->wanthost = false;
     575         204 :         return 1;
     576             :     }
     577         276 :     return 0;
     578             : }
     579             : 
                       /* Returns 1 if prs->ignore is set (see SpecialTags), else 0. */
     580             : static int
     581       36086 : p_isignore(TParser *prs)
     582             : {
     583       36086 :     return (prs->ignore) ? 1 : 0;
     584             : }
     585             : 
                       /*
                        * Test whether a host token starts at the current position.
                        *
                        * A temporary copy parser with wanthost set is run for one token.  If it
                        * yields a token of type HOST, this parser's position and token-so-far
                        * lengths are advanced by that token's byte and character lengths, its
                        * charlen is taken over from the copy, and 1 is returned.  Otherwise 0
                        * is returned and this parser is left as it was.
                        */
     586             : static int
     587          90 : p_ishost(TParser *prs)
     588             : {
     589          90 :     TParser    *tmpprs = TParserCopyInit(prs);
     590          90 :     int         res = 0;
     591             : 
     592          90 :     tmpprs->wanthost = true;
     593             : 
     594             :     /*
     595             :      * Check stack depth before recursing.  (Since TParserGet() doesn't
     596             :      * normally recurse, we put the cost of checking here not there.)
     597             :      */
     598          90 :     check_stack_depth();
     599             : 
     600          90 :     if (TParserGet(tmpprs) && tmpprs->type == HOST)
     601             :     {
     602          72 :         prs->state->posbyte += tmpprs->lenbytetoken;
     603          72 :         prs->state->poschar += tmpprs->lenchartoken;
     604          72 :         prs->state->lenbytetoken += tmpprs->lenbytetoken;
     605          72 :         prs->state->lenchartoken += tmpprs->lenchartoken;
     606          72 :         prs->state->charlen = tmpprs->state->charlen;
     607          72 :         res = 1;
     608             :     }
     609          90 :     TParserCopyClose(tmpprs);
     610             : 
     611          90 :     return res;
     612             : }
     613             : 
                       /*
                        * Test whether a URL path token starts at the current position.
                        *
                        * A temporary copy parser is given a second position entry in state
                        * TPS_InURLPathFirst and run for one token.  If it yields a token of type
                        * URLPATH, this parser's position and token-so-far lengths are advanced
                        * by that token's byte and character lengths, its charlen is taken over
                        * from the copy, and 1 is returned.  Otherwise 0 is returned and this
                        * parser is left as it was.
                        */
     614             : static int
     615         150 : p_isURLPath(TParser *prs)
     616             : {
     617         150 :     TParser    *tmpprs = TParserCopyInit(prs);
     618         150 :     int         res = 0;
     619             : 
     620         150 :     tmpprs->state = newTParserPosition(tmpprs->state);
     621         150 :     tmpprs->state->state = TPS_InURLPathFirst;
     622             : 
     623             :     /*
     624             :      * Check stack depth before recursing.  (Since TParserGet() doesn't
     625             :      * normally recurse, we put the cost of checking here not there.)
     626             :      */
     627         150 :     check_stack_depth();
     628             : 
     629         150 :     if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
     630             :     {
     631         132 :         prs->state->posbyte += tmpprs->lenbytetoken;
     632         132 :         prs->state->poschar += tmpprs->lenchartoken;
     633         132 :         prs->state->lenbytetoken += tmpprs->lenbytetoken;
     634         132 :         prs->state->lenchartoken += tmpprs->lenchartoken;
     635         132 :         prs->state->charlen = tmpprs->state->charlen;
     636         132 :         res = 1;
     637             :     }
     638         150 :     TParserCopyClose(tmpprs);
     639             : 
     640         150 :     return res;
     641             : }
     642             : 
     643             : /*
     644             :  * Returns 1 if the current character has zero display length or, in a
     645             :  * UTF-8 database, is one of the combining signs listed below; else 0.
     646             :  * Such characters are not alphabetic, yet they must not break a word.
     647             :  * At the beginning of a word they are not considered part of it.
     648             :  */
     649             : static int
     650        8724 : p_isspecial(TParser *prs)
     651             : {
     652             :     /*
     653             :      * pg_dsplen may return -1 (error or control character); only 0 counts
     654             :      */
     655        8724 :     if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
     656           0 :         return 1;
     657             : 
     658             :     /*
     659             :      * Unicode characters in the 'Mark, Spacing Combining' category.  These
     660             :      * characters are not alphabetic, but they do not break words either.
     661             :      * Check this only in UTF-8, because other encodings of these characters
     662             :      * are not supported by Postgres or do not even exist.
     663             :      */
     664        8724 :     if (GetDatabaseEncoding() == PG_UTF8)
     665             :     {
     666             :         static const pg_wchar strange_letter[] = {
     667             :             /*
     668             :              * searched with the binary search below: keep in ascending order
     669             :              */
     670             :             0x0903,             /* DEVANAGARI SIGN VISARGA */
     671             :             0x093E,             /* DEVANAGARI VOWEL SIGN AA */
     672             :             0x093F,             /* DEVANAGARI VOWEL SIGN I */
     673             :             0x0940,             /* DEVANAGARI VOWEL SIGN II */
     674             :             0x0949,             /* DEVANAGARI VOWEL SIGN CANDRA O */
     675             :             0x094A,             /* DEVANAGARI VOWEL SIGN SHORT O */
     676             :             0x094B,             /* DEVANAGARI VOWEL SIGN O */
     677             :             0x094C,             /* DEVANAGARI VOWEL SIGN AU */
     678             :             0x0982,             /* BENGALI SIGN ANUSVARA */
     679             :             0x0983,             /* BENGALI SIGN VISARGA */
     680             :             0x09BE,             /* BENGALI VOWEL SIGN AA */
     681             :             0x09BF,             /* BENGALI VOWEL SIGN I */
     682             :             0x09C0,             /* BENGALI VOWEL SIGN II */
     683             :             0x09C7,             /* BENGALI VOWEL SIGN E */
     684             :             0x09C8,             /* BENGALI VOWEL SIGN AI */
     685             :             0x09CB,             /* BENGALI VOWEL SIGN O */
     686             :             0x09CC,             /* BENGALI VOWEL SIGN AU */
     687             :             0x09D7,             /* BENGALI AU LENGTH MARK */
     688             :             0x0A03,             /* GURMUKHI SIGN VISARGA */
     689             :             0x0A3E,             /* GURMUKHI VOWEL SIGN AA */
     690             :             0x0A3F,             /* GURMUKHI VOWEL SIGN I */
     691             :             0x0A40,             /* GURMUKHI VOWEL SIGN II */
     692             :             0x0A83,             /* GUJARATI SIGN VISARGA */
     693             :             0x0ABE,             /* GUJARATI VOWEL SIGN AA */
     694             :             0x0ABF,             /* GUJARATI VOWEL SIGN I */
     695             :             0x0AC0,             /* GUJARATI VOWEL SIGN II */
     696             :             0x0AC9,             /* GUJARATI VOWEL SIGN CANDRA O */
     697             :             0x0ACB,             /* GUJARATI VOWEL SIGN O */
     698             :             0x0ACC,             /* GUJARATI VOWEL SIGN AU */
     699             :             0x0B02,             /* ORIYA SIGN ANUSVARA */
     700             :             0x0B03,             /* ORIYA SIGN VISARGA */
     701             :             0x0B3E,             /* ORIYA VOWEL SIGN AA */
     702             :             0x0B40,             /* ORIYA VOWEL SIGN II */
     703             :             0x0B47,             /* ORIYA VOWEL SIGN E */
     704             :             0x0B48,             /* ORIYA VOWEL SIGN AI */
     705             :             0x0B4B,             /* ORIYA VOWEL SIGN O */
     706             :             0x0B4C,             /* ORIYA VOWEL SIGN AU */
     707             :             0x0B57,             /* ORIYA AU LENGTH MARK */
     708             :             0x0BBE,             /* TAMIL VOWEL SIGN AA */
     709             :             0x0BBF,             /* TAMIL VOWEL SIGN I */
     710             :             0x0BC1,             /* TAMIL VOWEL SIGN U */
     711             :             0x0BC2,             /* TAMIL VOWEL SIGN UU */
     712             :             0x0BC6,             /* TAMIL VOWEL SIGN E */
     713             :             0x0BC7,             /* TAMIL VOWEL SIGN EE */
     714             :             0x0BC8,             /* TAMIL VOWEL SIGN AI */
     715             :             0x0BCA,             /* TAMIL VOWEL SIGN O */
     716             :             0x0BCB,             /* TAMIL VOWEL SIGN OO */
     717             :             0x0BCC,             /* TAMIL VOWEL SIGN AU */
     718             :             0x0BD7,             /* TAMIL AU LENGTH MARK */
     719             :             0x0C01,             /* TELUGU SIGN CANDRABINDU */
     720             :             0x0C02,             /* TELUGU SIGN ANUSVARA */
     721             :             0x0C03,             /* TELUGU SIGN VISARGA */
     722             :             0x0C41,             /* TELUGU VOWEL SIGN U */
     723             :             0x0C42,             /* TELUGU VOWEL SIGN UU */
     724             :             0x0C43,             /* TELUGU VOWEL SIGN VOCALIC R */
     725             :             0x0C44,             /* TELUGU VOWEL SIGN VOCALIC RR */
     726             :             0x0C82,             /* KANNADA SIGN ANUSVARA */
     727             :             0x0C83,             /* KANNADA SIGN VISARGA */
     728             :             0x0CBE,             /* KANNADA VOWEL SIGN AA */
     729             :             0x0CC0,             /* KANNADA VOWEL SIGN II */
     730             :             0x0CC1,             /* KANNADA VOWEL SIGN U */
     731             :             0x0CC2,             /* KANNADA VOWEL SIGN UU */
     732             :             0x0CC3,             /* KANNADA VOWEL SIGN VOCALIC R */
     733             :             0x0CC4,             /* KANNADA VOWEL SIGN VOCALIC RR */
     734             :             0x0CC7,             /* KANNADA VOWEL SIGN EE */
     735             :             0x0CC8,             /* KANNADA VOWEL SIGN AI */
     736             :             0x0CCA,             /* KANNADA VOWEL SIGN O */
     737             :             0x0CCB,             /* KANNADA VOWEL SIGN OO */
     738             :             0x0CD5,             /* KANNADA LENGTH MARK */
     739             :             0x0CD6,             /* KANNADA AI LENGTH MARK */
     740             :             0x0D02,             /* MALAYALAM SIGN ANUSVARA */
     741             :             0x0D03,             /* MALAYALAM SIGN VISARGA */
     742             :             0x0D3E,             /* MALAYALAM VOWEL SIGN AA */
     743             :             0x0D3F,             /* MALAYALAM VOWEL SIGN I */
     744             :             0x0D40,             /* MALAYALAM VOWEL SIGN II */
     745             :             0x0D46,             /* MALAYALAM VOWEL SIGN E */
     746             :             0x0D47,             /* MALAYALAM VOWEL SIGN EE */
     747             :             0x0D48,             /* MALAYALAM VOWEL SIGN AI */
     748             :             0x0D4A,             /* MALAYALAM VOWEL SIGN O */
     749             :             0x0D4B,             /* MALAYALAM VOWEL SIGN OO */
     750             :             0x0D4C,             /* MALAYALAM VOWEL SIGN AU */
     751             :             0x0D57,             /* MALAYALAM AU LENGTH MARK */
     752             :             0x0D82,             /* SINHALA SIGN ANUSVARAYA */
     753             :             0x0D83,             /* SINHALA SIGN VISARGAYA */
     754             :             0x0DCF,             /* SINHALA VOWEL SIGN AELA-PILLA */
     755             :             0x0DD0,             /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
     756             :             0x0DD1,             /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
     757             :             0x0DD8,             /* SINHALA VOWEL SIGN GAETTA-PILLA */
     758             :             0x0DD9,             /* SINHALA VOWEL SIGN KOMBUVA */
     759             :             0x0DDA,             /* SINHALA VOWEL SIGN DIGA KOMBUVA */
     760             :             0x0DDB,             /* SINHALA VOWEL SIGN KOMBU DEKA */
     761             :             0x0DDC,             /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
     762             :             0x0DDD,             /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
     763             :                                  * AELA-PILLA */
     764             :             0x0DDE,             /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
     765             :             0x0DDF,             /* SINHALA VOWEL SIGN GAYANUKITTA */
     766             :             0x0DF2,             /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
     767             :             0x0DF3,             /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
     768             :             0x0F3E,             /* TIBETAN SIGN YAR TSHES */
     769             :             0x0F3F,             /* TIBETAN SIGN MAR TSHES */
     770             :             0x0F7F,             /* TIBETAN SIGN RNAM BCAD */
     771             :             0x102B,             /* MYANMAR VOWEL SIGN TALL AA */
     772             :             0x102C,             /* MYANMAR VOWEL SIGN AA */
     773             :             0x1031,             /* MYANMAR VOWEL SIGN E */
     774             :             0x1038,             /* MYANMAR SIGN VISARGA */
     775             :             0x103B,             /* MYANMAR CONSONANT SIGN MEDIAL YA */
     776             :             0x103C,             /* MYANMAR CONSONANT SIGN MEDIAL RA */
     777             :             0x1056,             /* MYANMAR VOWEL SIGN VOCALIC R */
     778             :             0x1057,             /* MYANMAR VOWEL SIGN VOCALIC RR */
     779             :             0x1062,             /* MYANMAR VOWEL SIGN SGAW KAREN EU */
     780             :             0x1063,             /* MYANMAR TONE MARK SGAW KAREN HATHI */
     781             :             0x1064,             /* MYANMAR TONE MARK SGAW KAREN KE PHO */
     782             :             0x1067,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
     783             :             0x1068,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
     784             :             0x1069,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
     785             :             0x106A,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
     786             :             0x106B,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
     787             :             0x106C,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
     788             :             0x106D,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
     789             :             0x1083,             /* MYANMAR VOWEL SIGN SHAN AA */
     790             :             0x1084,             /* MYANMAR VOWEL SIGN SHAN E */
     791             :             0x1087,             /* MYANMAR SIGN SHAN TONE-2 */
     792             :             0x1088,             /* MYANMAR SIGN SHAN TONE-3 */
     793             :             0x1089,             /* MYANMAR SIGN SHAN TONE-5 */
     794             :             0x108A,             /* MYANMAR SIGN SHAN TONE-6 */
     795             :             0x108B,             /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
     796             :             0x108C,             /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
     797             :             0x108F,             /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
     798             :             0x17B6,             /* KHMER VOWEL SIGN AA */
     799             :             0x17BE,             /* KHMER VOWEL SIGN OE */
     800             :             0x17BF,             /* KHMER VOWEL SIGN YA */
     801             :             0x17C0,             /* KHMER VOWEL SIGN IE */
     802             :             0x17C1,             /* KHMER VOWEL SIGN E */
     803             :             0x17C2,             /* KHMER VOWEL SIGN AE */
     804             :             0x17C3,             /* KHMER VOWEL SIGN AI */
     805             :             0x17C4,             /* KHMER VOWEL SIGN OO */
     806             :             0x17C5,             /* KHMER VOWEL SIGN AU */
     807             :             0x17C7,             /* KHMER SIGN REAHMUK */
     808             :             0x17C8,             /* KHMER SIGN YUUKALEAPINTU */
     809             :             0x1923,             /* LIMBU VOWEL SIGN EE */
     810             :             0x1924,             /* LIMBU VOWEL SIGN AI */
     811             :             0x1925,             /* LIMBU VOWEL SIGN OO */
     812             :             0x1926,             /* LIMBU VOWEL SIGN AU */
     813             :             0x1929,             /* LIMBU SUBJOINED LETTER YA */
     814             :             0x192A,             /* LIMBU SUBJOINED LETTER RA */
     815             :             0x192B,             /* LIMBU SUBJOINED LETTER WA */
     816             :             0x1930,             /* LIMBU SMALL LETTER KA */
     817             :             0x1931,             /* LIMBU SMALL LETTER NGA */
     818             :             0x1933,             /* LIMBU SMALL LETTER TA */
     819             :             0x1934,             /* LIMBU SMALL LETTER NA */
     820             :             0x1935,             /* LIMBU SMALL LETTER PA */
     821             :             0x1936,             /* LIMBU SMALL LETTER MA */
     822             :             0x1937,             /* LIMBU SMALL LETTER RA */
     823             :             0x1938,             /* LIMBU SMALL LETTER LA */
     824             :             0x19B0,             /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
     825             :             0x19B1,             /* NEW TAI LUE VOWEL SIGN AA */
     826             :             0x19B2,             /* NEW TAI LUE VOWEL SIGN II */
     827             :             0x19B3,             /* NEW TAI LUE VOWEL SIGN U */
     828             :             0x19B4,             /* NEW TAI LUE VOWEL SIGN UU */
     829             :             0x19B5,             /* NEW TAI LUE VOWEL SIGN E */
     830             :             0x19B6,             /* NEW TAI LUE VOWEL SIGN AE */
     831             :             0x19B7,             /* NEW TAI LUE VOWEL SIGN O */
     832             :             0x19B8,             /* NEW TAI LUE VOWEL SIGN OA */
     833             :             0x19B9,             /* NEW TAI LUE VOWEL SIGN UE */
     834             :             0x19BA,             /* NEW TAI LUE VOWEL SIGN AY */
     835             :             0x19BB,             /* NEW TAI LUE VOWEL SIGN AAY */
     836             :             0x19BC,             /* NEW TAI LUE VOWEL SIGN UY */
     837             :             0x19BD,             /* NEW TAI LUE VOWEL SIGN OY */
     838             :             0x19BE,             /* NEW TAI LUE VOWEL SIGN OAY */
     839             :             0x19BF,             /* NEW TAI LUE VOWEL SIGN UEY */
     840             :             0x19C0,             /* NEW TAI LUE VOWEL SIGN IY */
     841             :             0x19C8,             /* NEW TAI LUE TONE MARK-1 */
     842             :             0x19C9,             /* NEW TAI LUE TONE MARK-2 */
     843             :             0x1A19,             /* BUGINESE VOWEL SIGN E */
     844             :             0x1A1A,             /* BUGINESE VOWEL SIGN O */
     845             :             0x1A1B,             /* BUGINESE VOWEL SIGN AE */
     846             :             0x1B04,             /* BALINESE SIGN BISAH */
     847             :             0x1B35,             /* BALINESE VOWEL SIGN TEDUNG */
     848             :             0x1B3B,             /* BALINESE VOWEL SIGN RA REPA TEDUNG */
     849             :             0x1B3D,             /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
     850             :             0x1B3E,             /* BALINESE VOWEL SIGN TALING */
     851             :             0x1B3F,             /* BALINESE VOWEL SIGN TALING REPA */
     852             :             0x1B40,             /* BALINESE VOWEL SIGN TALING TEDUNG */
     853             :             0x1B41,             /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
     854             :             0x1B43,             /* BALINESE VOWEL SIGN PEPET TEDUNG */
     855             :             0x1B44,             /* BALINESE ADEG ADEG */
     856             :             0x1B82,             /* SUNDANESE SIGN PANGWISAD */
     857             :             0x1BA1,             /* SUNDANESE CONSONANT SIGN PAMINGKAL */
     858             :             0x1BA6,             /* SUNDANESE VOWEL SIGN PANAELAENG */
     859             :             0x1BA7,             /* SUNDANESE VOWEL SIGN PANOLONG */
     860             :             0x1BAA,             /* SUNDANESE SIGN PAMAAEH */
     861             :             0x1C24,             /* LEPCHA SUBJOINED LETTER YA */
     862             :             0x1C25,             /* LEPCHA SUBJOINED LETTER RA */
     863             :             0x1C26,             /* LEPCHA VOWEL SIGN AA */
     864             :             0x1C27,             /* LEPCHA VOWEL SIGN I */
     865             :             0x1C28,             /* LEPCHA VOWEL SIGN O */
     866             :             0x1C29,             /* LEPCHA VOWEL SIGN OO */
     867             :             0x1C2A,             /* LEPCHA VOWEL SIGN U */
     868             :             0x1C2B,             /* LEPCHA VOWEL SIGN UU */
     869             :             0x1C34,             /* LEPCHA CONSONANT SIGN NYIN-DO */
     870             :             0x1C35,             /* LEPCHA CONSONANT SIGN KANG */
     871             :             0xA823,             /* SYLOTI NAGRI VOWEL SIGN A */
     872             :             0xA824,             /* SYLOTI NAGRI VOWEL SIGN I */
     873             :             0xA827,             /* SYLOTI NAGRI VOWEL SIGN OO */
     874             :             0xA880,             /* SAURASHTRA SIGN ANUSVARA */
     875             :             0xA881,             /* SAURASHTRA SIGN VISARGA */
     876             :             0xA8B4,             /* SAURASHTRA CONSONANT SIGN HAARU */
     877             :             0xA8B5,             /* SAURASHTRA VOWEL SIGN AA */
     878             :             0xA8B6,             /* SAURASHTRA VOWEL SIGN I */
     879             :             0xA8B7,             /* SAURASHTRA VOWEL SIGN II */
     880             :             0xA8B8,             /* SAURASHTRA VOWEL SIGN U */
     881             :             0xA8B9,             /* SAURASHTRA VOWEL SIGN UU */
     882             :             0xA8BA,             /* SAURASHTRA VOWEL SIGN VOCALIC R */
     883             :             0xA8BB,             /* SAURASHTRA VOWEL SIGN VOCALIC RR */
     884             :             0xA8BC,             /* SAURASHTRA VOWEL SIGN VOCALIC L */
     885             :             0xA8BD,             /* SAURASHTRA VOWEL SIGN VOCALIC LL */
     886             :             0xA8BE,             /* SAURASHTRA VOWEL SIGN E */
     887             :             0xA8BF,             /* SAURASHTRA VOWEL SIGN EE */
     888             :             0xA8C0,             /* SAURASHTRA VOWEL SIGN AI */
     889             :             0xA8C1,             /* SAURASHTRA VOWEL SIGN O */
     890             :             0xA8C2,             /* SAURASHTRA VOWEL SIGN OO */
     891             :             0xA8C3,             /* SAURASHTRA VOWEL SIGN AU */
     892             :             0xA952,             /* REJANG CONSONANT SIGN H */
     893             :             0xA953,             /* REJANG VIRAMA */
     894             :             0xAA2F,             /* CHAM VOWEL SIGN O */
     895             :             0xAA30,             /* CHAM VOWEL SIGN AI */
     896             :             0xAA33,             /* CHAM CONSONANT SIGN YA */
     897             :             0xAA34,             /* CHAM CONSONANT SIGN RA */
     898             :             0xAA4D              /* CHAM CONSONANT SIGN FINAL H */
     899             :         };
     900        8724 :         const pg_wchar *StopLow = strange_letter,
     901        8724 :                    *StopHigh = strange_letter + lengthof(strange_letter),
     902             :                    *StopMiddle;
     903             :         pg_wchar    c;
     904             : 
     905        8724 :         c = *(prs->pgwstr + prs->state->poschar);
     906             : 
     907       78516 :         while (StopLow < StopHigh)  /* binary search over the half-open range [StopLow, StopHigh) */
     908             :         {
     909       69792 :             StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
     910       69792 :             if (*StopMiddle == c)
     911           0 :                 return 1;
     912       69792 :             else if (*StopMiddle < c)
     913           0 :                 StopLow = StopMiddle + 1;
     914             :             else
     915       69792 :                 StopHigh = StopMiddle;
     916             :         }
     917             :     }
     918             : 
     919        8724 :     return 0;
     920             : }
     921             : 
     922             : /*
     923             :  * State/action tables of the parser (one array per TPS_* state)
     924             :  */
     925             : 
     926             : static const TParserStateActionItem actionTPS_Base[] = {    /* TPS_Base: no row names a token type; p_isspecial is not tested here (its header says special signs do not begin a word) */
     927             :     {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
     928             :     {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
     929             :     {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
     930             :     {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
     931             :     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
     932             :     {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
     933             :     {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
     934             :     {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
     935             :     {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
     936             :     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
     937             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
     938             :     {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
     939             :     {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
     940             : };
     941             : 
     942             : 
     943             : static const TParserStateActionItem actionTPS_InNumWord[] = {   /* TPS_InNumWord: p_isalnum and p_isspecial keep this state; both A_BINGO rows name NUMWORD */
     944             :     {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
     945             :     {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     946             :     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     947             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
     948             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
     949             :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
     950             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
     951             :     {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
     952             : };
     953             : 
     954             : static const TParserStateActionItem actionTPS_InAsciiWord[] = { /* TPS_InAsciiWord: both A_BINGO rows name ASCIIWORD; '.', '-' and p_isdigit are each tested by two rows */
     955             :     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
     956             :     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
     957             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
     958             :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},    /* same test as the row above; NOTE(review): presumably tried only if that A_PUSH path is abandoned - confirm in the driver */
     959             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
     960             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
     961             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
     962             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
     963             :     {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
     964             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
     965             :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
     966             :     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     967             :     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
     968             :     {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
     969             :     {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
     970             : };
     971             : 
     972             : static const TParserStateActionItem actionTPS_InWord[] = {  /* TPS_InWord: both A_BINGO rows name WORD_T; a digit moves to TPS_InNumWord, '-' pushes TPS_InHyphenWordFirst */
     973             :     {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
     974             :     {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
     975             :     {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
     976             :     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     977             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
     978             :     {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
     979             : };
     980             : 
     981             : static const TParserStateActionItem actionTPS_InUnsignedInt[] = {   /* TPS_InUnsignedInt: both A_BINGO rows name UNSIGNEDINT; '.' is tested twice (TPS_InHostFirstDomain, then TPS_InUDecimalFirst) */
     982             :     {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
     983             :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
     984             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
     985             :     {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
     986             :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
     987             :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
     988             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
     989             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
     990             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
     991             :     {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
     992             :     {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     993             :     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     994             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
     995             :     {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
     996             : };
     997             : 
     998             : static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {    /* TPS_InSignedIntFirst (pushed from TPS_Base on '-' or '+'): a digit leads to TPS_InSignedInt, anything else is A_POP */
     999             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1000             :     {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
    1001             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1002             : };
    1003             : 
    1004             : static const TParserStateActionItem actionTPS_InSignedInt[] = { /* TPS_InSignedInt: both A_BINGO rows name SIGNEDINT; '.' pushes TPS_InDecimalFirst, 'e'/'E' push TPS_InMantissaFirst */
    1005             :     {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
    1006             :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
    1007             :     {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
    1008             :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1009             :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1010             :     {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
    1011             : };
    1012             : 
    1013             : static const TParserStateActionItem actionTPS_InSpace[] = { /* TPS_InSpace: every A_BINGO row names SPACE; '<', '-', '+', '&' and '/' end the run, and each is a character TPS_Base has an A_PUSH row for */
    1014             :     {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
    1015             :     {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
    1016             :     {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
    1017             :     {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
    1018             :     {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
    1019             :     {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
    1020             :     {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
    1021             :     {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
    1022             :     {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
    1023             : };
    1024             : 
    1025             : static const TParserStateActionItem actionTPS_InUDecimalFirst[] = { /* TPS_InUDecimalFirst (pushed from TPS_InUnsignedInt on '.'): a digit leads to TPS_InUDecimal with A_CLEAR, anything else is A_POP */
    1026             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1027             :     {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
    1028             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1029             : };
    1030             : 
    1031             : static const TParserStateActionItem actionTPS_InUDecimal[] = {  /* TPS_InUDecimal: both A_BINGO rows name DECIMAL_T; a further '.' pushes TPS_InVersionFirst */
    1032             :     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
    1033             :     {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
    1034             :     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
    1035             :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1036             :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1037             :     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
    1038             : };
    1039             : 
    1040             : static const TParserStateActionItem actionTPS_InDecimalFirst[] = {  /* TPS_InDecimalFirst (pushed from TPS_InSignedInt on '.'): a digit leads to TPS_InDecimal with A_CLEAR, anything else is A_POP */
    1041             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1042             :     {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
    1043             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1044             : };
    1045             : 
    1046             : static const TParserStateActionItem actionTPS_InDecimal[] = {   /* TPS_InDecimal: both A_BINGO rows name DECIMAL_T; a further '.' pushes TPS_InVerVersion */
    1047             :     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
    1048             :     {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
    1049             :     {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
    1050             :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1051             :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1052             :     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
    1053             : };
    1054             : 
    1055             : static const TParserStateActionItem actionTPS_InVerVersion[] = {    /* TPS_InVerVersion (pushed from TPS_InDecimal on '.'): the digit row is A_RERUN with SpecialVerVersion as its last field; anything else is A_POP */
    1056             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1057             :     {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
    1058             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1059             : };
    1060             : 
    1061             : static const TParserStateActionItem actionTPS_InSVerVersion[] = {   /* TPS_InSVerVersion: the digit row combines A_BINGO | A_CLRALL, names SPACE and lists TPS_InUnsignedInt as its state */
    1062             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1063             :     {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
    1064             :     {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
    1065             : };
    1066             : 
    1067             : 
    1068             : static const TParserStateActionItem actionTPS_InVersionFirst[] = {  /* TPS_InVersionFirst (pushed on '.' from TPS_InUDecimal and TPS_InVersion): a digit leads to TPS_InVersion with A_CLEAR, anything else is A_POP */
    1069             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1070             :     {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
    1071             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1072             : };
    1073             : 
    1074             : static const TParserStateActionItem actionTPS_InVersion[] = {   /* TPS_InVersion: both A_BINGO rows name VERSIONNUMBER; each further '.' pushes TPS_InVersionFirst again */
    1075             :     {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
    1076             :     {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
    1077             :     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
    1078             :     {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
    1079             : };
    1080             : 
    1081             : static const TParserStateActionItem actionTPS_InMantissaFirst[] = { /* TPS_InMantissaFirst (pushed on 'e' or 'E'): a digit leads to TPS_InMantissa, '+' or '-' to TPS_InMantissaSign, anything else is A_POP */
    1082             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1083             :     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
    1084             :     {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
    1085             :     {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
    1086             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1087             : };
    1088             : 
    1089             : static const TParserStateActionItem actionTPS_InMantissaSign[] = { /* rows for state TPS_InMantissaSign (reached from TPS_InMantissaFirst on '+' or '-') */
    1090             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1091             :     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL}, /* digit: A_CLEAR and on to TPS_InMantissa */
    1092             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1093             : };
    1094             : 
    1095             : static const TParserStateActionItem actionTPS_InMantissa[] = { /* rows for state TPS_InMantissa: loops on digits, finishes with category SCIENTIFIC */
    1096             :     {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}, /* EOF: A_BINGO as SCIENTIFIC, to TPS_Base */
    1097             :     {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
    1098             :     {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL} /* NULL-test row: same outcome as the EOF row */
    1099             : };
    1100             : 
    1101             : static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = { /* rows for state TPS_InXMLEntityFirst: picks the numeric or the named entity branch */
    1102             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1103             :     {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL}, /* '#': numeric branch */
    1104             :     {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL}, /* p_isasclet, ':' or '_': named branch (TPS_InXMLEntity) */
    1105             :     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1106             :     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1107             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1108             : };
    1109             : 
    1110             : static const TParserStateActionItem actionTPS_InXMLEntity[] = { /* rows for state TPS_InXMLEntity: loops on name characters until ';' */
    1111             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1112             :     {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL}, /* p_isalnum, ':', '_', '.', '-': stay in TPS_InXMLEntity */
    1113             :     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1114             :     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1115             :     {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1116             :     {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1117             :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, /* ';': move to TPS_InXMLEntityEnd */
    1118             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1119             : };
    1120             : 
    1121             : static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = { /* rows for state TPS_InXMLEntityNumFirst (reached from TPS_InXMLEntityFirst on '#') */
    1122             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1123             :     {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL}, /* 'x' or 'X': hexadecimal branch */
    1124             :     {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
    1125             :     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL}, /* digit: decimal branch */
    1126             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1127             : };
    1128             : 
    1129             : static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = { /* rows for state TPS_InXMLEntityHexNumFirst (reached after 'x'/'X') */
    1130             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1131             :     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL}, /* a p_isxdigit match is the only way on to TPS_InXMLEntityHexNum */
    1132             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1133             : };
    1134             : 
    1135             : static const TParserStateActionItem actionTPS_InXMLEntityNum[] = { /* rows for state TPS_InXMLEntityNum: loops on digits until ';' */
    1136             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1137             :     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
    1138             :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, /* ';': move to TPS_InXMLEntityEnd */
    1139             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1140             : };
    1141             : 
    1142             : static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = { /* rows for state TPS_InXMLEntityHexNum: loops on p_isxdigit until ';' */
    1143             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1144             :     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
    1145             :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, /* ';': move to TPS_InXMLEntityEnd */
    1146             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1147             : };
    1148             : 
    1149             : static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = { /* rows for state TPS_InXMLEntityEnd (reached on ';' from the entity states) */
    1150             :     {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL} /* single NULL-test row: A_BINGO | A_CLEAR with category XMLENTITY, to TPS_Base */
    1151             : };
    1152             : 
    1153             : static const TParserStateActionItem actionTPS_InTagFirst[] = { /* rows for state TPS_InTagFirst: dispatches on one character; every continuing row uses A_PUSH */
    1154             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1155             :     {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL}, /* '/': TPS_InTagCloseFirst */
    1156             :     {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL}, /* '!': TPS_InCommentFirst */
    1157             :     {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL}, /* '?': TPS_InXMLBegin */
    1158             :     {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL}, /* p_isasclet, ':' or '_': TPS_InTagName */
    1159             :     {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
    1160             :     {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
    1161             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1162             : };
    1163             : 
    1164             : static const TParserStateActionItem actionTPS_InXMLBegin[] = { /* rows for state TPS_InXMLBegin (pushed from TPS_InTagFirst on '?') */
    1165             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1166             :     /* <?xml ... */
    1167             :     /* XXX do we want states for the m and l?  Right now this accepts <?xZ */
    1168             :     {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL}, /* only the 'x' is checked before switching to TPS_InTag */
    1169             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1170             : };
    1171             : 
    1172             : static const TParserStateActionItem actionTPS_InTagCloseFirst[] = { /* rows for state TPS_InTagCloseFirst (pushed from TPS_InTagFirst on '/') */
    1173             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1174             :     {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL}, /* only a p_isasclet match continues, into TPS_InTagName (':' and '_' are not accepted here) */
    1175             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1176             : };
    1177             : 
    1178             : static const TParserStateActionItem actionTPS_InTagName[] = { /* rows for state TPS_InTagName: consumes the tag name and detects its end */
    1179             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1180             :     /* <br/> case */
    1181             :     {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
    1182             :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags}, /* '>' ends the tag; SpecialTags is attached */
    1183             :     {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags}, /* whitespace ends the name and moves to TPS_InTag; SpecialTags is attached */
    1184             :     {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL}, /* name characters: A_NEXT with TPS_Null target (presumably "stay in this state" - confirm in the table walker) */
    1185             :     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
    1186             :     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
    1187             :     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
    1188             :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
    1189             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1190             : };
    1191             : 
    1192             : static const TParserStateActionItem actionTPS_InTagBeginEnd[] = { /* rows for state TPS_InTagBeginEnd (reached from TPS_InTagName on '/') */
    1193             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1194             :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL}, /* the '/' must be followed directly by '>' */
    1195             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1196             : };
    1197             : 
    1198             : static const TParserStateActionItem actionTPS_InTag[] = { /* rows for state TPS_InTag: the part of a tag after its name, up to '>' */
    1199             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1200             :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags}, /* '>' ends the tag; SpecialTags is attached */
    1201             :     {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL}, /* single quote: TPS_InTagEscapeK */
    1202             :     {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL}, /* double quote: TPS_InTagEscapeKK */
    1203             :     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL}, /* the rows below list the characters accepted unquoted; all use A_NEXT with TPS_Null target */
    1204             :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
    1205             :     {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
    1206             :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
    1207             :     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
    1208             :     {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
    1209             :     {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
    1210             :     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
    1211             :     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
    1212             :     {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
    1213             :     {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
    1214             :     {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
    1215             :     {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
    1216             :     {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags}, /* whitespace is accepted too, with SpecialTags attached */
    1217             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1218             : };
    1219             : 
    1220             : static const TParserStateActionItem actionTPS_InTagEscapeK[] = { /* rows for state TPS_InTagEscapeK: inside a single-quoted string in a tag */
    1221             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1222             :     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL}, /* backslash: A_PUSH into TPS_InTagBackSleshed */
    1223             :     {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL}, /* closing single quote: back to TPS_InTag */
    1224             :     {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL} /* NULL-test row: A_NEXT, remaining in TPS_InTagEscapeK */
    1225             : };
    1226             : 
    1227             : static const TParserStateActionItem actionTPS_InTagEscapeKK[] = { /* rows for state TPS_InTagEscapeKK: inside a double-quoted string in a tag */
    1228             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1229             :     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL}, /* backslash: A_PUSH into TPS_InTagBackSleshed */
    1230             :     {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL}, /* closing double quote: back to TPS_InTag */
    1231             :     {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL} /* NULL-test row: A_NEXT, remaining in TPS_InTagEscapeKK */
    1232             : };
    1233             : 
    1234             : static const TParserStateActionItem actionTPS_InTagBackSleshed[] = { /* rows for state TPS_InTagBackSleshed (pushed on a backslash from both quoted-string states) */
    1235             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1236             :     {NULL, 0, A_MERGE, TPS_Null, 0, NULL} /* NULL-test row: A_MERGE, whatever the character after the backslash is */
    1237             : };
    1238             : 
    1239             : static const TParserStateActionItem actionTPS_InTagEnd[] = { /* rows for state TPS_InTagEnd (reached on '>' from the tag states) */
    1240             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL} /* single NULL-test row: A_BINGO | A_CLRALL with category TAG_T, to TPS_Base */
    1241             : };
    1242             : 
    1243             : static const TParserStateActionItem actionTPS_InCommentFirst[] = { /* rows for state TPS_InCommentFirst (pushed from TPS_InTagFirst on '!') */
    1244             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1245             :     {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL}, /* first '-' after the '!': TPS_InCommentLast expects the second */
    1246             :     /* <!DOCTYPE ...> */
    1247             :     {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL}, /* only the first letter ('D' or 'd') is checked before switching to TPS_InTag */
    1248             :     {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
    1249             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1250             : };
    1251             : 
    1252             : static const TParserStateActionItem actionTPS_InCommentLast[] = { /* rows for state TPS_InCommentLast (reached from TPS_InCommentFirst on '-') */
    1253             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1254             :     {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL}, /* second '-': now inside the comment body (TPS_InComment) */
    1255             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1256             : };
    1257             : 
    1258             : static const TParserStateActionItem actionTPS_InComment[] = { /* rows for state TPS_InComment: comment body, watching for '-' */
    1259             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1260             :     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL}, /* '-': possible start of the closing sequence */
    1261             :     {NULL, 0, A_NEXT, TPS_Null, 0, NULL} /* NULL-test row: A_NEXT with TPS_Null target */
    1262             : };
    1263             : 
    1264             : static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = { /* rows for state TPS_InCloseCommentFirst (one '-' seen inside a comment) */
    1265             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1266             :     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL}, /* second '-': TPS_InCloseCommentLast */
    1267             :     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL} /* NULL-test row: back to TPS_InComment */
    1268             : };
    1269             : 
    1270             : static const TParserStateActionItem actionTPS_InCloseCommentLast[] = { /* rows for state TPS_InCloseCommentLast (two '-' seen inside a comment) */
    1271             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1272             :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL}, /* further '-': A_NEXT with TPS_Null target */
    1273             :     {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL}, /* '>': TPS_InCommentEnd */
    1274             :     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL} /* NULL-test row: back to TPS_InComment */
    1275             : };
    1276             : 
    1277             : static const TParserStateActionItem actionTPS_InCommentEnd[] = { /* rows for state TPS_InCommentEnd (reached from TPS_InCloseCommentLast on '>') */
    1278             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL} /* single NULL-test row: A_BINGO | A_CLRALL with category TAG_T (same category as a tag), to TPS_Base */
    1279             : };
    1280             : 
    1281             : static const TParserStateActionItem actionTPS_InHostFirstDomain[] = { /* rows for state TPS_InHostFirstDomain (pushed on '.' from the host states) */
    1282             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1283             :     {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL}, /* p_isasclet: TPS_InHostDomainSecond */
    1284             :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, /* digit: TPS_InHost */
    1285             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1286             : };
    1287             : 
    1288             : static const TParserStateActionItem actionTPS_InHostDomainSecond[] = { /* rows for state TPS_InHostDomainSecond (one p_isasclet character seen after a '.') */
    1289             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1290             :     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, /* second p_isasclet match: TPS_InHostDomain */
    1291             :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, /* all remaining continuing rows use A_PUSH */
    1292             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1293             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1294             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    1295             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1296             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP (unlike TPS_InHostDomain, no row here uses A_BINGO) */
    1297             : };
    1298             : 
    1299             : static const TParserStateActionItem actionTPS_InHostDomain[] = { /* rows for state TPS_InHostDomain: finishes with category HOST unless a longer form is found */
    1300             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}, /* EOF: A_BINGO | A_CLRALL as HOST, to TPS_Base */
    1301             :     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
    1302             :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
    1303             :     {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL}, /* ':': try a port (TPS_InPortFirst) */
    1304             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1305             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1306             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    1307             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1308             :     {p_isdigit, 0, A_POP, TPS_Null, 0, NULL}, /* NOTE(review): second p_isdigit row; presumably only reached when matching resumes after the A_PUSH row above - confirm in the table walker */
    1309             :     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL}, /* p_isstophost: A_BINGO | A_CLRALL as HOST, next state TPS_InURLPathStart */
    1310             :     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL}, /* '/': try TPS_InFURL */
    1311             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL} /* NULL-test row: same outcome as the EOF row */
    1312             : };
    1313             : 
    1314             : static const TParserStateActionItem actionTPS_InPortFirst[] = { /* rows for state TPS_InPortFirst (pushed from TPS_InHostDomain on ':') */
    1315             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1316             :     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL}, /* a digit is required to enter TPS_InPort */
    1317             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1318             : };
    1319             : 
    1320             : static const TParserStateActionItem actionTPS_InPort[] = { /* rows for state TPS_InPort: loops on digits; the finishing rows use category HOST */
    1321             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}, /* EOF: A_BINGO | A_CLRALL as HOST, to TPS_Base */
    1322             :     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
    1323             :     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL}, /* p_isstophost: A_BINGO | A_CLRALL as HOST, next state TPS_InURLPathStart */
    1324             :     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL}, /* '/': try TPS_InFURL */
    1325             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL} /* NULL-test row: same outcome as the EOF row */
    1326             : };
    1327             : 
    1328             : static const TParserStateActionItem actionTPS_InHostFirstAN[] = { /* rows for state TPS_InHostFirstAN (pushed on '-' or '_' from the host states) */
    1329             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1330             :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, /* a digit or p_isasclet match must follow; both go to TPS_InHost */
    1331             :     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
    1332             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1333             : };
    1334             : 
    1335             : static const TParserStateActionItem actionTPS_InHost[] = { /* rows for state TPS_InHost: loops on digits and p_isasclet; no row here uses A_BINGO */
    1336             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1337             :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
    1338             :     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
    1339             :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, /* '@': try TPS_InEmail */
    1340             :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, /* '.': try TPS_InHostFirstDomain */
    1341             :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, /* '-' or '_': try TPS_InHostFirstAN */
    1342             :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1343             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1344             : };
    1345             : 
    1346             : static const TParserStateActionItem actionTPS_InEmail[] = { /* rows for state TPS_InEmail (pushed on '@' from the host states); note there is no p_isEOF row */
    1347             :     {p_isstophost, 0, A_POP, TPS_Null, 0, NULL}, /* p_isstophost is tested first and takes A_POP */
    1348             :     {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL}, /* p_ishost: A_BINGO | A_CLRALL with category EMAIL, to TPS_Base */
    1349             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1350             : };
    1351             : 
    1352             : static const TParserStateActionItem actionTPS_InFileFirst[] = { /* rows for state TPS_InFileFirst: first character of a file-path component */
    1353             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1354             :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL}, /* p_isasclet, digit or '_': TPS_InFile */
    1355             :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1356             :     {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL}, /* '.': TPS_InPathFirst */
    1357             :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1358             :     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL}, /* '~': A_PUSH into TPS_InFileTwiddle */
    1359             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1360             : };
    1361             : 
    1362             : static const TParserStateActionItem actionTPS_InFileTwiddle[] = { /* rows for state TPS_InFileTwiddle (pushed from TPS_InFileFirst on '~') */
    1363             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1364             :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL}, /* p_isasclet, digit or '_': TPS_InFile */
    1365             :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1366             :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1367             :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL}, /* '/': start the next component in TPS_InFileFirst */
    1368             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1369             : };
    1370             : 
    1371             : static const TParserStateActionItem actionTPS_InPathFirst[] = { /* rows for state TPS_InPathFirst (reached from TPS_InFileFirst on '.') */
    1372             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1373             :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL}, /* p_isasclet, digit or '_': TPS_InFile */
    1374             :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1375             :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1376             :     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL}, /* second '.': TPS_InPathSecond */
    1377             :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL}, /* '/': start the next component in TPS_InFileFirst */
    1378             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1379             : };
    1380             : 
    1381             : static const TParserStateActionItem actionTPS_InPathFirstFirst[] = { /* rows for state TPS_InPathFirstFirst: like TPS_InPathFirst but without the rows leading to TPS_InFile */
    1382             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1383             :     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL}, /* '.': TPS_InPathSecond */
    1384             :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL}, /* '/': TPS_InFileFirst */
    1385             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1386             : };
    1387             : 
    1388             : static const TParserStateActionItem actionTPS_InPathSecond[] = { /* rows for state TPS_InPathSecond (reached on a second '.'); finishing rows use category FILEPATH */
    1389             :     {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL}, /* EOF: A_BINGO | A_CLEAR as FILEPATH, to TPS_Base */
    1390             :     {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL}, /* '/': first try continuing the path in TPS_InFileFirst */
    1391             :     {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL}, /* NOTE(review): second '/' row; presumably only reached when matching resumes after the A_PUSH row above - confirm in the table walker */
    1392             :     {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL}, /* whitespace: same outcome as the EOF row */
    1393             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1394             : };
    1395             : 
    1396             : static const TParserStateActionItem actionTPS_InFile[] = { /* rows for state TPS_InFile: loops on file-name characters, finishes with category FILEPATH */
    1397             :     {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL}, /* EOF: A_BINGO as FILEPATH, to TPS_Base */
    1398             :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
    1399             :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1400             :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, /* '.': A_PUSH into TPS_InFileNext */
    1401             :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1402             :     {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
    1403             :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, /* '/': A_PUSH into TPS_InFileFirst for the next component */
    1404             :     {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL} /* NULL-test row: same outcome as the EOF row */
    1405             : };
    1406             : 
    1407             : static const TParserStateActionItem actionTPS_InFileNext[] = { /* rows for state TPS_InFileNext (pushed from TPS_InFile on '.') */
    1408             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1409             :     {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL}, /* p_isasclet, digit or '_' after the '.': A_CLEAR and back to TPS_InFile */
    1410             :     {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
    1411             :     {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
    1412             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1413             : };
    1414             : 
    1415             : static const TParserStateActionItem actionTPS_InURLPathFirst[] = { /* rows for state TPS_InURLPathFirst: requires one p_isurlchar match */
    1416             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1417             :     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL}, /* p_isurlchar: on to TPS_InURLPath */
    1418             :     {NULL, 0, A_POP, TPS_Null, 0, NULL}, /* NULL-test row: A_POP */
    1419             : };
    1420             : 
    1421             : static const TParserStateActionItem actionTPS_InURLPathStart[] = { /* rows for state TPS_InURLPathStart (entered from the p_isstophost rows of TPS_InHostDomain and TPS_InPort) */
    1422             :     {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL} /* single NULL-test row: A_NEXT into TPS_InURLPath with no character test */
    1423             : };
    1424             : 
    1425             : static const TParserStateActionItem actionTPS_InURLPath[] = { /* rows for state TPS_InURLPath: loops on p_isurlchar, finishes with category URLPATH */
    1426             :     {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL}, /* EOF: A_BINGO as URLPATH, to TPS_Base */
    1427             :     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
    1428             :     {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL} /* NULL-test row: same outcome as the EOF row */
    1429             : };
    1430             : 
    1431             : static const TParserStateActionItem actionTPS_InFURL[] = { /* rows for state TPS_InFURL (pushed on '/' from TPS_InHostDomain and TPS_InPort) */
    1432             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1433             :     {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL}, /* p_isURLPath: A_BINGO | A_CLRALL with category URL_T, to TPS_Base; SpecialFURL is attached */
    1434             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1435             : };
    1436             : 
    1437             : static const TParserStateActionItem actionTPS_InProtocolFirst[] = { /* rows for state TPS_InProtocolFirst: expects the first '/' */
    1438             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1439             :     {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL}, /* first '/': TPS_InProtocolSecond */
    1440             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1441             : };
    1442             : 
    1443             : static const TParserStateActionItem actionTPS_InProtocolSecond[] = { /* rows for state TPS_InProtocolSecond: expects the second '/' */
    1444             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1445             :     {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL}, /* second '/': TPS_InProtocolEnd */
    1446             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1447             : };
    1448             : 
    1449             : static const TParserStateActionItem actionTPS_InProtocolEnd[] = { /* rows for state TPS_InProtocolEnd (reached after two '/' characters) */
    1450             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL} /* single NULL-test row: A_BINGO | A_CLRALL with category PROTOCOL, to TPS_Base */
    1451             : };
    1452             : 
    1453             : static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = { /* rows for state TPS_InHyphenAsciiWordFirst (pushed from TPS_InHyphenAsciiWord on '-') */
    1454             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1455             :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL}, /* p_isasclet is tested before p_isalpha, so it takes precedence when both would match */
    1456             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1457             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL}, /* digit: TPS_InHyphenDigitLookahead */
    1458             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1459             : };
    1460             : 
    1461             : static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = { /* rows for state TPS_InHyphenAsciiWord: finishing rows use category ASCIIHWORD and SpecialHyphen */
    1462             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}, /* EOF: A_BINGO | A_CLRALL as ASCIIHWORD, next state TPS_InParseHyphen */
    1463             :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
    1464             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL}, /* p_isalpha or p_isspecial: switch to TPS_InHyphenWord */
    1465             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1466             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, /* digit: switch to TPS_InHyphenNumWord */
    1467             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL}, /* another '-': A_PUSH into TPS_InHyphenAsciiWordFirst */
    1468             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen} /* NULL-test row: same outcome as the EOF row */
    1469             : };
    1470             : 
    1471             : static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = { /* rows for state TPS_InHyphenWordFirst (pushed from TPS_InHyphenWord on '-') */
    1472             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1473             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL}, /* p_isalpha: TPS_InHyphenWord */
    1474             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL}, /* digit: TPS_InHyphenDigitLookahead */
    1475             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1476             : };
    1477             : 
    1478             : static const TParserStateActionItem actionTPS_InHyphenWord[] = { /* rows for state TPS_InHyphenWord: finishing rows use category HWORD and SpecialHyphen */
    1479             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}, /* EOF: A_BINGO | A_CLRALL as HWORD, next state TPS_InParseHyphen */
    1480             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1481             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1482             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, /* digit: switch to TPS_InHyphenNumWord */
    1483             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL}, /* another '-': A_PUSH into TPS_InHyphenWordFirst */
    1484             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen} /* NULL-test row: same outcome as the EOF row */
    1485             : };
    1486             : 
    1487             : static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = { /* rows for state TPS_InHyphenNumWordFirst (pushed from TPS_InHyphenNumWord on '-') */
    1488             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1489             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, /* p_isalpha: TPS_InHyphenNumWord */
    1490             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL}, /* digit: TPS_InHyphenDigitLookahead */
    1491             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1492             : };
    1493             : 
    1494             : static const TParserStateActionItem actionTPS_InHyphenNumWord[] = { /* rows for state TPS_InHyphenNumWord: finishing rows use category NUMHWORD and SpecialHyphen */
    1495             :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}, /* EOF: A_BINGO | A_CLRALL as NUMHWORD, next state TPS_InParseHyphen */
    1496             :     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1497             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1498             :     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL}, /* another '-': A_PUSH into TPS_InHyphenNumWordFirst */
    1499             :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen} /* NULL-test row: same outcome as the EOF row */
    1500             : };
    1501             : 
    1502             : static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = { /* rows for state TPS_InHyphenDigitLookahead: digits after a '-' only count if a p_isalpha/p_isspecial character follows */
    1503             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1504             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL}, /* keep consuming digits */
    1505             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, /* p_isalpha or p_isspecial: on to TPS_InHyphenNumWord */
    1506             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1507             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP (digits alone do not reach TPS_InHyphenNumWord) */
    1508             : };
    1509             : 
    1510             : static const TParserStateActionItem actionTPS_InParseHyphen[] = { /* rows for state TPS_InParseHyphen: entered after the A_BINGO rows of the hyphenated-word states; dispatches to the "Part" states */
    1511             :     {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL}, /* EOF: A_RERUN in TPS_Base */
    1512             :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL}, /* p_isasclet is tested before p_isalpha */
    1513             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1514             :     {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL}, /* digit: A_PUSH into TPS_InHyphenUnsignedInt */
    1515             :     {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL}, /* '-': A_PUSH into TPS_InParseHyphenHyphen */
    1516             :     {NULL, 0, A_RERUN, TPS_Base, 0, NULL} /* NULL-test row: same outcome as the EOF row */
    1517             : };
    1518             : 
    1519             : static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = { /* rows for state TPS_InParseHyphenHyphen (pushed from TPS_InParseHyphen on '-') */
    1520             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1521             :     {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL}, /* p_isalnum or p_isspecial: A_BINGO | A_CLEAR with category SPACE, back to TPS_InParseHyphen */
    1522             :     {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
    1523             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1524             : };
    1525             : 
    1526             : static const TParserStateActionItem actionTPS_InHyphenWordPart[] = { /* rows for state TPS_InHyphenWordPart: finishing rows use category PARTHWORD */
    1527             :     {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL}, /* EOF: A_BINGO as PARTHWORD, to TPS_Base */
    1528             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1529             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1530             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL}, /* digit: switch to TPS_InHyphenNumWordPart */
    1531             :     {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL} /* NULL-test row: A_BINGO as PARTHWORD, but back to TPS_InParseHyphen rather than TPS_Base */
    1532             : };
    1533             : 
    1534             : static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = { /* rows for state TPS_InHyphenAsciiWordPart: finishing rows use category ASCIIPARTHWORD */
    1535             :     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL}, /* EOF: A_BINGO as ASCIIPARTHWORD, to TPS_Base */
    1536             :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
    1537             :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL}, /* p_isalpha or p_isspecial: switch to TPS_InHyphenWordPart */
    1538             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1539             :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL}, /* digit: switch to TPS_InHyphenNumWordPart */
    1540             :     {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL} /* NULL-test row: A_BINGO as ASCIIPARTHWORD, but back to TPS_InParseHyphen rather than TPS_Base */
    1541             : };
    1542             : 
    1543             : static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = { /* rows for state TPS_InHyphenNumWordPart: finishing rows use category NUMPARTHWORD */
    1544             :     {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL}, /* EOF: A_BINGO as NUMPARTHWORD, to TPS_Base */
    1545             :     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1546             :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1547             :     {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL} /* NULL-test row: A_BINGO as NUMPARTHWORD, but back to TPS_InParseHyphen rather than TPS_Base */
    1548             : };
    1549             : 
    1550             : static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = { /* rows for state TPS_InHyphenUnsignedInt (pushed from TPS_InParseHyphen on a digit) */
    1551             :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1552             :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL}, /* further digits: A_NEXT with TPS_Null target */
    1553             :     {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL}, /* p_isalpha or p_isspecial: A_CLEAR and on to TPS_InHyphenNumWordPart */
    1554             :     {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
    1555             :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL-test row: A_POP */
    1556             : };
    1557             : 
    1558             : 
    1559             : /*
    1560             :  * main table of per-state parser actions
    1561             :  */
    1562             : typedef struct /* one entry of Actions[]: ties a parser state to its table of action rows */
    1563             : {
    1564             :     const TParserStateActionItem *action;   /* the actual state info: this state's action-row array */
    1565             :     TParserState state;         /* only for Assert crosscheck */
    1566             : #ifdef WPARSER_TRACE
    1567             :     const char *state_name;     /* only for debug printout; exists only when WPARSER_TRACE is defined */
    1568             : #endif
    1569             : } TParserStateAction;
    1570             : 
    1571             : #ifdef WPARSER_TRACE /* TPARSERSTATEACTION(state) builds one TParserStateAction initializer; tracing variant also stores the state's name */
    1572             : #define TPARSERSTATEACTION(state) \
    1573             :     { CppConcat(action,state), state, CppAsString(state) }
    1574             : #else /* !WPARSER_TRACE: initializer is just { action##state table, state } */
    1575             : #define TPARSERSTATEACTION(state) \
    1576             :     { CppConcat(action,state), state }
    1577             : #endif /* WPARSER_TRACE */
    1578             : 
    1579             : /*
    1580             :  * order must be the same as in typedef enum {} TParserState!!
                     :  *
                     :  * TParserGet() indexes Actions[] directly with prs->state->state, and an
                     :  * Assert there cross-checks that each entry's state field equals its
                     :  * index, so a misplaced entry is caught only in assert-enabled builds.
    1581             :  */
    1582             : 
    1583             : static const TParserStateAction Actions[] = {
    1584             :     TPARSERSTATEACTION(TPS_Base),
    1585             :     TPARSERSTATEACTION(TPS_InNumWord),
    1586             :     TPARSERSTATEACTION(TPS_InAsciiWord),
    1587             :     TPARSERSTATEACTION(TPS_InWord),
    1588             :     TPARSERSTATEACTION(TPS_InUnsignedInt),
    1589             :     TPARSERSTATEACTION(TPS_InSignedIntFirst),
    1590             :     TPARSERSTATEACTION(TPS_InSignedInt),
    1591             :     TPARSERSTATEACTION(TPS_InSpace),
    1592             :     TPARSERSTATEACTION(TPS_InUDecimalFirst),
    1593             :     TPARSERSTATEACTION(TPS_InUDecimal),
    1594             :     TPARSERSTATEACTION(TPS_InDecimalFirst),
    1595             :     TPARSERSTATEACTION(TPS_InDecimal),
    1596             :     TPARSERSTATEACTION(TPS_InVerVersion),
    1597             :     TPARSERSTATEACTION(TPS_InSVerVersion),
    1598             :     TPARSERSTATEACTION(TPS_InVersionFirst),
    1599             :     TPARSERSTATEACTION(TPS_InVersion),
    1600             :     TPARSERSTATEACTION(TPS_InMantissaFirst),
    1601             :     TPARSERSTATEACTION(TPS_InMantissaSign),
    1602             :     TPARSERSTATEACTION(TPS_InMantissa),
    1603             :     TPARSERSTATEACTION(TPS_InXMLEntityFirst),
    1604             :     TPARSERSTATEACTION(TPS_InXMLEntity),
    1605             :     TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
    1606             :     TPARSERSTATEACTION(TPS_InXMLEntityNum),
    1607             :     TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
    1608             :     TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
    1609             :     TPARSERSTATEACTION(TPS_InXMLEntityEnd),
    1610             :     TPARSERSTATEACTION(TPS_InTagFirst),
    1611             :     TPARSERSTATEACTION(TPS_InXMLBegin),
    1612             :     TPARSERSTATEACTION(TPS_InTagCloseFirst),
    1613             :     TPARSERSTATEACTION(TPS_InTagName),
    1614             :     TPARSERSTATEACTION(TPS_InTagBeginEnd),
    1615             :     TPARSERSTATEACTION(TPS_InTag),
    1616             :     TPARSERSTATEACTION(TPS_InTagEscapeK),
    1617             :     TPARSERSTATEACTION(TPS_InTagEscapeKK),
    1618             :     TPARSERSTATEACTION(TPS_InTagBackSleshed),
    1619             :     TPARSERSTATEACTION(TPS_InTagEnd),
    1620             :     TPARSERSTATEACTION(TPS_InCommentFirst),
    1621             :     TPARSERSTATEACTION(TPS_InCommentLast),
    1622             :     TPARSERSTATEACTION(TPS_InComment),
    1623             :     TPARSERSTATEACTION(TPS_InCloseCommentFirst),
    1624             :     TPARSERSTATEACTION(TPS_InCloseCommentLast),
    1625             :     TPARSERSTATEACTION(TPS_InCommentEnd),
    1626             :     TPARSERSTATEACTION(TPS_InHostFirstDomain),
    1627             :     TPARSERSTATEACTION(TPS_InHostDomainSecond),
    1628             :     TPARSERSTATEACTION(TPS_InHostDomain),
    1629             :     TPARSERSTATEACTION(TPS_InPortFirst),
    1630             :     TPARSERSTATEACTION(TPS_InPort),
    1631             :     TPARSERSTATEACTION(TPS_InHostFirstAN),
    1632             :     TPARSERSTATEACTION(TPS_InHost),
    1633             :     TPARSERSTATEACTION(TPS_InEmail),
    1634             :     TPARSERSTATEACTION(TPS_InFileFirst),
    1635             :     TPARSERSTATEACTION(TPS_InFileTwiddle),
    1636             :     TPARSERSTATEACTION(TPS_InPathFirst),
    1637             :     TPARSERSTATEACTION(TPS_InPathFirstFirst),
    1638             :     TPARSERSTATEACTION(TPS_InPathSecond),
    1639             :     TPARSERSTATEACTION(TPS_InFile),
    1640             :     TPARSERSTATEACTION(TPS_InFileNext),
    1641             :     TPARSERSTATEACTION(TPS_InURLPathFirst),
    1642             :     TPARSERSTATEACTION(TPS_InURLPathStart),
    1643             :     TPARSERSTATEACTION(TPS_InURLPath),
    1644             :     TPARSERSTATEACTION(TPS_InFURL),
    1645             :     TPARSERSTATEACTION(TPS_InProtocolFirst),
    1646             :     TPARSERSTATEACTION(TPS_InProtocolSecond),
    1647             :     TPARSERSTATEACTION(TPS_InProtocolEnd),
    1648             :     TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
    1649             :     TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
    1650             :     TPARSERSTATEACTION(TPS_InHyphenWordFirst),
    1651             :     TPARSERSTATEACTION(TPS_InHyphenWord),
    1652             :     TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
    1653             :     TPARSERSTATEACTION(TPS_InHyphenNumWord),
    1654             :     TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
    1655             :     TPARSERSTATEACTION(TPS_InParseHyphen),
    1656             :     TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
    1657             :     TPARSERSTATEACTION(TPS_InHyphenWordPart),
    1658             :     TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
    1659             :     TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
    1660             :     TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
    1661             : };
    1662             : 
    1663             : /*
                     :  * TParserGet: scan the next token of prs->str.
                     :  *
                     :  * Returns false if the current position is already at or past the end of
                     :  * the input, or if the scan loop ends on a rule without A_BINGO.  Returns
                     :  * true when a rule flagged A_BINGO fired; in that case prs->token points
                     :  * at the byte where scanning started, prs->lenbytetoken and
                     :  * prs->lenchartoken give the token's length, and prs->type is the
                     :  * matching rule's token type.
                     :  *
                     :  * Each loop iteration looks at one character (or end of input) in the
                     :  * current state: it picks the first applicable rule of that state, runs
                     :  * the rule's special handler if any, applies its flags and state change,
                     :  * and then, unless the rule asks otherwise, advances past the character.
                     :  */
    1664             : static bool
    1665       28924 : TParserGet(TParser *prs)
    1666             : {
    1667       28924 :     const TParserStateActionItem *item = NULL;
    1668             : 
    1669       28924 :     CHECK_FOR_INTERRUPTS();
    1670             : 
    1671             :     Assert(prs->state);
    1672             : 
    1673       28924 :     if (prs->state->posbyte >= prs->lenstr)
    1674        4754 :         return false;
    1675             : 
    1676       24170 :     prs->token = prs->str + prs->state->posbyte;
    1677       24170 :     prs->state->pushedAtAction = NULL;
    1678             : 
    1679             :     /* look at string; the iteration at posbyte == lenstr still runs the
                     :      * rules, with charlen set to 0 */
    1680      103278 :     while (prs->state->posbyte <= prs->lenstr)
    1681             :     {
    1682      103278 :         if (prs->state->posbyte == prs->lenstr)
    1683        4904 :             prs->state->charlen = 0;
    1684             :         else
    1685      196748 :             prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
    1686       98374 :                 pg_mblen(prs->str + prs->state->posbyte);
    1687             : 
    1688             :         Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
    1689             :         Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
    1690             :         Assert(Actions[prs->state->state].state == prs->state->state);
    1691             : 
    1692      103278 :         if (prs->state->pushedAtAction)
    1693             :         {
    1694             :             /* After a POP, pick up at the next test; pushedAtAction was
                     :              * recorded on this position by the A_PUSH branch below */
    1695        2592 :             item = prs->state->pushedAtAction + 1;
    1696        2592 :             prs->state->pushedAtAction = NULL;
    1697             :         }
    1698             :         else
    1699             :         {
    1700      100686 :             item = Actions[prs->state->state].action;
    1701             :             Assert(item != NULL);
    1702             :         }
    1703             : 
    1704             :         /* find action by character class: take the first rule whose isclass
                     :          * test returns nonzero; a rule with isclass == NULL ends the scan
                     :          * and is used as is */
    1705      555708 :         while (item->isclass)
    1706             :         {
    1707      524364 :             prs->c = item->c;
    1708      524364 :             if (item->isclass(prs) != 0)
    1709       71934 :                 break;
    1710      452430 :             item++;
    1711             :         }
    1712             : 
    1713             : #ifdef WPARSER_TRACE
    1714             :         {
    1715             :             TParserPosition *ptr;
    1716             : 
    1717             :             fprintf(stderr, "state ");
    1718             :             /* indent according to stack depth */
    1719             :             for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
    1720             :                 fprintf(stderr, "  ");
    1721             :             fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
    1722             :             if (prs->state->posbyte < prs->lenstr)
    1723             :                 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
    1724             :             else
    1725             :                 fprintf(stderr, "at EOF");
    1726             :             fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
    1727             :                     (int) (item - Actions[prs->state->state].action),
    1728             :                     (item->flags & A_BINGO) ? " BINGO" : "",
    1729             :                     (item->flags & A_POP) ? " POP" : "",
    1730             :                     (item->flags & A_PUSH) ? " PUSH" : "",
    1731             :                     (item->flags & A_RERUN) ? " RERUN" : "",
    1732             :                     (item->flags & A_CLEAR) ? " CLEAR" : "",
    1733             :                     (item->flags & A_MERGE) ? " MERGE" : "",
    1734             :                     (item->flags & A_CLRALL) ? " CLRALL" : "",
    1735             :                     (item->tostate != TPS_Null) ? " tostate " : "",
    1736             :                     (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
    1737             :                     (item->type > 0) ? " type " : "",
    1738             :                     tok_alias[item->type]);
    1739             :         }
    1740             : #endif
    1741             : 
    1742             :         /* call special handler if exists */
    1743      103278 :         if (item->special)
    1744         420 :             item->special(prs);
    1745             : 
    1746             :         /* BINGO, token is found: publish its length and type in prs, and
                     :          * reset the current position's token-length counters */
    1747      103278 :         if (item->flags & A_BINGO)
    1748             :         {
    1749             :             Assert(item->type > 0);
    1750       24170 :             prs->lenbytetoken = prs->state->lenbytetoken;
    1751       24170 :             prs->lenchartoken = prs->state->lenchartoken;
    1752       24170 :             prs->state->lenbytetoken = prs->state->lenchartoken = 0;
    1753       24170 :             prs->type = item->type;
    1754             :         }
    1755             : 
    1756             :         /* do various actions by flags; the else-if chain means at most one
                     :          * stack action runs per rule, tested in the order POP, PUSH, CLEAR,
                     :          * CLRALL, MERGE */
    1757      103278 :         if (item->flags & A_POP)
    1758             :         {                       /* pop stored state in stack */
    1759        2610 :             TParserPosition *ptr = prs->state->prev;
    1760             : 
    1761        2610 :             pfree(prs->state);
    1762        2610 :             prs->state = ptr;
    1763             :             Assert(prs->state);
    1764             :         }
    1765      100668 :         else if (item->flags & A_PUSH)
    1766             :         {                       /* push (store) state in stack */
    1767        5088 :             prs->state->pushedAtAction = item;    /* remember where we push */
    1768        5088 :             prs->state = newTParserPosition(prs->state);
    1769             :         }
    1770       95580 :         else if (item->flags & A_CLEAR)
    1771             :         {                       /* clear previous pushed state */
    1772             :             TParserPosition *ptr;
    1773             : 
    1774             :             Assert(prs->state->prev);
    1775         498 :             ptr = prs->state->prev->prev;
    1776         498 :             pfree(prs->state->prev);
    1777         498 :             prs->state->prev = ptr;
    1778             :         }
    1779       95082 :         else if (item->flags & A_CLRALL)
    1780             :         {                       /* clear all previous pushed state */
    1781             :             TParserPosition *ptr;
    1782             : 
    1783        2778 :             while (prs->state->prev)
    1784             :             {
    1785        1998 :                 ptr = prs->state->prev->prev;
    1786        1998 :                 pfree(prs->state->prev);
    1787        1998 :                 prs->state->prev = ptr;
    1788             :             }
    1789             :         }
    1790       94302 :         else if (item->flags & A_MERGE)
    1791             :         {                       /* merge posinfo with current and pushed state */
    1792           0 :             TParserPosition *ptr = prs->state;
    1793             : 
    1794             :             Assert(prs->state->prev);
    1795           0 :             prs->state = prs->state->prev;
    1796             : 
    1797           0 :             prs->state->posbyte = ptr->posbyte;
    1798           0 :             prs->state->poschar = ptr->poschar;
    1799           0 :             prs->state->charlen = ptr->charlen;
    1800           0 :             prs->state->lenbytetoken = ptr->lenbytetoken;
    1801           0 :             prs->state->lenchartoken = ptr->lenchartoken;
    1802           0 :             pfree(ptr);
    1803             :         }
    1804             : 
    1805             :         /* set new state if pointed; this applies to whichever position is
                     :          * current after the stack action above */
    1806      103278 :         if (item->tostate != TPS_Null)
    1807       66202 :             prs->state->state = item->tostate;
    1808             : 
    1809             :         /* check for go away: stop once a token was produced, or when we are
                     :          * at end of input and the rule does not ask for a RERUN */
    1810      103278 :         if ((item->flags & A_BINGO) ||
    1811       79108 :             (prs->state->posbyte >= prs->lenstr &&
    1812           0 :              (item->flags & A_RERUN) == 0))
    1813             :             break;
    1814             : 
    1815             :         /* go to beginning of loop if we should rerun or we just restore state;
                     :          * either way the current character is not consumed */
    1816       79108 :         if (item->flags & (A_RERUN | A_POP))
    1817        2634 :             continue;
    1818             : 
    1819             :         /* move forward: consume the current character (charlen was set to 0
                     :          * above when positioned at end of input) */
    1820       76474 :         if (prs->state->charlen)
    1821             :         {
    1822       76474 :             prs->state->posbyte += prs->state->charlen;
    1823       76474 :             prs->state->lenbytetoken += prs->state->charlen;
    1824       76474 :             prs->state->poschar++;
    1825       76474 :             prs->state->lenchartoken++;
    1826             :         }
    1827             :     }
    1828             : 
    1829       24170 :     return (item && (item->flags & A_BINGO));
    1830             : }
    1831             : /*
                     :  * prsd_lextype: return a palloc'd array of LexDescr, one entry for each
                     :  * token type 1..LASTNUM (lexid, a copy of tok_alias[i] and a copy of
                     :  * lex_descr[i]), stored at index i - 1, followed by a terminating entry
                     :  * whose lexid is 0.  Only lexid is set in the terminating entry.
                     :  */
    1832             : Datum
    1833        9078 : prsd_lextype(PG_FUNCTION_ARGS)
    1834             : {
    1835        9078 :     LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
    1836             :     int         i;
    1837             : 
    1838      217872 :     for (i = 1; i <= LASTNUM; i++)
    1839             :     {
    1840      208794 :         descr[i - 1].lexid = i;
    1841      208794 :         descr[i - 1].alias = pstrdup(tok_alias[i]);
    1842      208794 :         descr[i - 1].descr = pstrdup(lex_descr[i]);
    1843             :     }
    1844             : 
    1845        9078 :     descr[LASTNUM].lexid = 0;
    1846             : 
    1847        9078 :     PG_RETURN_POINTER(descr);
    1848             : }
    1849             : /*
                     :  * prsd_start: create a parser with TParserInit() and return it as a
                     :  * pointer Datum.  Argument 0 is the input string pointer; argument 1 is an
                     :  * int32 passed straight through to TParserInit().
                     :  *
                     :  * NOTE(review): argument 1 is presumably the string's length in bytes --
                     :  * confirm against TParserInit, which is outside this chunk.
                     :  */
    1850             : Datum
    1851        4754 : prsd_start(PG_FUNCTION_ARGS)
    1852             : {
    1853        4754 :     PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
    1854             : }
    1855             : /*
                     :  * prsd_nexttoken: fetch the next token with TParserGet().
                     :  *
                     :  * Argument 0 is the TParser, argument 1 a char ** that receives the token
                     :  * start, argument 2 an int * that receives the token length in bytes.
                     :  * Returns the token type as int32, or 0 when TParserGet() reports no
                     :  * token; in the latter case the two output locations are left untouched.
                     :  */
    1856             : Datum
    1857       28684 : prsd_nexttoken(PG_FUNCTION_ARGS)
    1858             : {
    1859       28684 :     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
    1860       28684 :     char      **t = (char **) PG_GETARG_POINTER(1);
    1861       28684 :     int        *tlen = (int *) PG_GETARG_POINTER(2);
    1862             : 
    1863       28684 :     if (!TParserGet(p))
    1864        4754 :         PG_RETURN_INT32(0);
    1865             : 
    1866       23930 :     *t = p->token;
    1867       23930 :     *tlen = p->lenbytetoken;
    1868             : 
    1869       23930 :     PG_RETURN_INT32(p->type);
    1870             : }
    1871             : /*
                     :  * prsd_end: hand the TParser passed as argument 0 to TParserClose().
                     :  * Returns void.
                     :  */
    1872             : Datum
    1873        4754 : prsd_end(PG_FUNCTION_ARGS)
    1874             : {
    1875        4754 :     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
    1876             : 
    1877        4754 :     TParserClose(p);
    1878        4754 :     PG_RETURN_VOID();
    1879             : }
    1880             : 
    1881             : 
    1882             : /*
    1883             :  * ts_headline support begins here
    1884             :  */
    1885             : 
    1886             : /* token type classification macros
                     :  *
                     :  * HLIDREPLACE: types that mark_fragment flags "replace" (only when
                     :  * highlightall is false).
                     :  * HLIDSKIP / XMLHLIDSKIP: types that mark_fragment flags "skip" when
                     :  * highlightall is false / true; the two currently expand to the same test.
                     :  * NONWORDTOKEN: SPACE plus everything matched by HLIDREPLACE or HLIDSKIP.
                     :  * NOENDTOKEN: NONWORDTOKEN, the numeric types listed, and TS_IDIGNORE;
                     :  * BADENDPOINT uses it to reject fragment endpoints.
                     :  *
                     :  * Each macro evaluates its argument several times, so pass an expression
                     :  * without side effects.
                     :  */
    1887             : #define TS_IDIGNORE(x)  ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
    1888             : #define HLIDREPLACE(x)  ( (x)==TAG_T )
    1889             : #define HLIDSKIP(x)     ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
    1890             : #define XMLHLIDSKIP(x)  ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
    1891             : #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
    1892             : #define NOENDTOKEN(x)   ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
    1893             : 
    1894             : /*
    1895             :  * Macros useful in headline selection.  These rely on availability of
    1896             :  * "HeadlineParsedText *prs" describing some text, and "int shortword"
    1897             :  * describing the "short word" length parameter.
                     :  *
                     :  * The argument j is substituted textually, unparenthesized and more than
                     :  * once, so pass a plain index expression without side effects.
    1898             :  */
    1899             : 
    1900             : /* Interesting words are non-repeated search terms, i.e. words[j].item is
                     :  * set and words[j].repeated is not */
    1901             : #define INTERESTINGWORD(j) \
    1902             :     (prs->words[j].item && !prs->words[j].repeated)
    1903             : 
    1904             : /* Don't want to end at a non-word or a short word, unless interesting;
                     :  * "short" means len <= shortword, "non-word" means NOENDTOKEN(type) */
    1905             : #define BADENDPOINT(j) \
    1906             :     ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
    1907             :      !INTERESTINGWORD(j))
    1908             : 
    1909             : typedef struct
    1910             : {
    1911             :     /* one cover (well, really one fragment) for mark_hl_fragments;
                     :      * NOTE(review): the code filling and reading these fields is outside
                     :      * this chunk, so the field notes below are taken on trust */
    1912             :     int32       startpos;       /* fragment's starting word index */
    1913             :     int32       endpos;         /* ending word index (inclusive) */
    1914             :     int32       poslen;         /* number of interesting words; presumably
                     :                                  * as counted by INTERESTINGWORD */
    1915             :     int32       curlen;         /* total number of words */
    1916             :     bool        chosen;         /* chosen? */
    1917             :     bool        excluded;       /* excluded? */
    1918             : } CoverPos;
    1919             : 
    1920             : typedef struct
    1921             : {
    1922             :     /* callback data for checkcondition_HL; hlCover fills it with the word
                     :      * range idxb..idxe it wants tested */
    1923             :     HeadlineWordEntry *words;   /* first word of the range to scan */
    1924             :     int         len;            /* number of words in the range */
    1925             : } hlCheck;
    1926             : 
    1927             : 
    1928             : /*
    1929             :  * TS_execute callback for matching a tsquery operand to headline words
    1930             :  *
                     :  * opaque is an hlCheck giving the word range to scan; a word matches when
                     :  * its item pointer equals val.  If data is NULL we return TS_YES at the
                     :  * first match.  Otherwise each matching word's pos is appended to
                     :  * data->pos (allocated here on first match, sized for checkval->len
                     :  * entries, with data->allocated set) provided it is greater than the last
                     :  * position stored, and we return TS_YES if data->npos is > 0 at the end.
                     :  * In every other case the result is TS_NO.
                     :  *
                     :  * NOTE(review): the final test reads data->npos even when nothing matched
                     :  * here, so it relies on the caller having initialized it -- confirm in
                     :  * TS_execute.
                     :  *
    1931             :  * Note: it's tempting to report words[] indexes as pos values to save
    1932             :  * searching in hlCover; but that would screw up phrase matching, which
    1933             :  * expects to measure distances in lexemes not tokens.
    1934             :  */
    1935             : static TSTernaryValue
    1936        1000 : checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
    1937             : {
    1938        1000 :     hlCheck    *checkval = (hlCheck *) opaque;
    1939             :     int         i;
    1940             : 
    1941             :     /* scan words array for matching items */
    1942       25450 :     for (i = 0; i < checkval->len; i++)
    1943             :     {
    1944       24650 :         if (checkval->words[i].item == val)
    1945             :         {
    1946             :             /* if data == NULL, don't need to report positions */
    1947         874 :             if (!data)
    1948         200 :                 return TS_YES;
    1949             : 
    1950         674 :             if (!data->pos)
    1951             :             {
    1952         476 :                 data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
    1953         476 :                 data->allocated = true;
    1954         476 :                 data->npos = 1;
    1955         476 :                 data->pos[0] = checkval->words[i].pos;
    1956             :             }
    1957         198 :             else if (data->pos[data->npos - 1] < checkval->words[i].pos)
    1958             :             {
    1959         198 :                 data->pos[data->npos++] = checkval->words[i].pos;
    1960             :             }
    1961             :         }
    1962             :     }
    1963             : 
    1964         800 :     if (data && data->npos > 0)
    1965         476 :         return TS_YES;
    1966             : 
    1967         324 :     return TS_NO;
    1968             : }
    1969             : 
    1970             : /*
    1971             :  * hlCover: try to find a substring of prs' word list that satisfies query
    1972             :  *
    1973             :  * locations is the result of TS_execute_locations() for the query.
    1974             :  * We use this to identify plausible subranges of the query.
    1975             :  *
    1976             :  * *nextpos is the lexeme position (NOT word index) to start the search
    1977             :  * at.  Caller should initialize this to zero.  If successful, we'll
    1978             :  * advance it to the next place to search at.
    1979             :  *
    1980             :  * On success, sets *p to first word index and *q to last word index of the
    1981             :  * cover substring, and returns true.
    1982             :  *
                     :  * Returns false, leaving *nextpos, *p and *q untouched, as soon as some
                     :  * entry of locations has no position at or after the search point, or if
                     :  * locations is empty.
                     :  *
    1983             :  * The result is a minimal cover, in the sense that both *p and *q will be
    1984             :  * words used in the query.
    1985             :  */
    1986             : static bool
    1987         562 : hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
    1988             :         int *nextpos, int *p, int *q)
    1989             : {
    1990         562 :     int         pos = *nextpos;
    1991             : 
    1992             :     /* This loop repeats when our selected word-range fails the query */
    1993             :     for (;;)
    1994          60 :     {
    1995             :         int         posb,
    1996             :                     pose;
    1997             :         ListCell   *lc;
    1998             : 
    1999             :         /*
    2000             :          * For each AND'ed query term or phrase, find its first occurrence at
    2001             :          * or after pos; set pose to the maximum of those positions.
    2002             :          *
    2003             :          * We need not consider ORs or NOTs here; see the comments for
    2004             :          * TS_execute_locations().  Rechecking the match with TS_execute(),
    2005             :          * below, will deal with any ensuing imprecision.
    2006             :          */
    2007         622 :         pose = -1;
    2008         966 :         foreach(lc, locations)
    2009             :         {
    2010         466 :             ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
    2011         466 :             int         first = -1;
    2012             : 
    2013         792 :             for (int i = 0; i < pdata->npos; i++)
    2014             :             {
    2015             :                 /* For phrase matches, use the ending lexeme */
    2016         670 :                 int         endp = pdata->pos[i];
    2017             : 
    2018         670 :                 if (endp >= pos)
    2019             :                 {
    2020         344 :                     first = endp;
    2021         344 :                     break;
    2022             :                 }
    2023             :             }
    2024         466 :             if (first < 0)
    2025         122 :                 return false;   /* no more matches for this term */
    2026         344 :             if (first > pose)
    2027         326 :                 pose = first;
    2028             :         }
    2029             : 
    2030         500 :         if (pose < 0)
    2031         246 :             return false;       /* we only get here if empty list */
    2032             : 
    2033             :         /*
    2034             :          * Now, for each AND'ed query term or phrase, find its last occurrence
    2035             :          * at or before pose; set posb to the minimum of those positions.
    2036             :          *
    2037             :          * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
    2038             :          * posb + 1 below.
    2039             :          */
    2040         254 :         posb = INT_MAX - 1;
    2041         586 :         foreach(lc, locations)
    2042             :         {
    2043         332 :             ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
    2044         332 :             int         last = -1;
    2045             : 
    2046         494 :             for (int i = pdata->npos - 1; i >= 0; i--)
    2047             :             {
    2048             :                 /* For phrase matches, use the starting lexeme */
    2049         494 :                 int         startp = pdata->pos[i] - pdata->width;
    2050             : 
    2051         494 :                 if (startp <= pose)
    2052             :                 {
    2053         332 :                     last = startp;
    2054         332 :                     break;
    2055             :                 }
    2056             :             }
    2057         332 :             if (last < posb)
    2058         272 :                 posb = last;
    2059             :         }
    2060             : 
    2061             :         /*
    2062             :          * We could end up with posb to the left of pos, in case some phrase
    2063             :          * match crosses pos.  Try the match starting at pos anyway, since the
    2064             :          * result of TS_execute_locations is imprecise for phrase matches OR'd
    2065             :          * with plain matches; that is, if the query is "(A <-> B) | C" then C
    2066             :          * could match at pos even though the phrase match would have to
    2067             :          * extend to the left of pos.
    2068             :          */
    2069         254 :         posb = Max(posb, pos);
    2070             : 
    2071             :         /* This test probably always succeeds, but be paranoid */
    2072         254 :         if (posb <= pose)
    2073             :         {
    2074             :             /*
    2075             :              * posb .. pose is now the shortest, earliest-after-pos range of
    2076             :              * lexeme positions containing all the query terms.  It will
    2077             :              * contain all phrase matches, too, except in the corner case
    2078             :              * described just above.
    2079             :              *
    2080             :              * Now convert these lexeme positions to indexes in prs->words[].
                     :              * Only words with a non-NULL item are considered: idxb becomes
                     :              * the first such word with pos >= posb, idxe the last one with
                     :              * pos <= pose, and the scan stops at the first one past pose.
    2081             :              */
    2082         254 :             int         idxb = -1;
    2083         254 :             int         idxe = -1;
    2084             : 
    2085       11624 :             for (int i = 0; i < prs->curwords; i++)
    2086             :             {
    2087       11496 :                 if (prs->words[i].item == NULL)
    2088       10612 :                     continue;
    2089         884 :                 if (idxb < 0 && prs->words[i].pos >= posb)
    2090         254 :                     idxb = i;
    2091         884 :                 if (prs->words[i].pos <= pose)
    2092         758 :                     idxe = i;
    2093             :                 else
    2094         126 :                     break;
    2095             :             }
    2096             : 
    2097             :             /* This test probably always succeeds, but be paranoid */
    2098         254 :             if (idxb >= 0 && idxe >= idxb)
    2099             :             {
    2100             :                 /*
    2101             :                  * Finally, check that the selected range satisfies the query.
    2102             :                  * This should succeed in all simple cases; but odd cases
    2103             :                  * involving non-top-level NOT conditions or phrase matches
    2104             :                  * OR'd with other things could fail, since the result of
    2105             :                  * TS_execute_locations doesn't fully represent such things.
    2106             :                  */
    2107             :                 hlCheck     ch;
    2108             : 
    2109         254 :                 ch.words = &(prs->words[idxb]);
    2110         254 :                 ch.len = idxe - idxb + 1;
    2111         254 :                 if (TS_execute(GETQUERY(query), &ch,
    2112             :                                TS_EXEC_EMPTY, checkcondition_HL))
    2113             :                 {
    2114             :                     /* Match!  Advance *nextpos and return the word range. */
    2115         194 :                     *nextpos = posb + 1;
    2116         194 :                     *p = idxb;
    2117         194 :                     *q = idxe;
    2118         194 :                     return true;
    2119             :                 }
    2120             :             }
    2121             :         }
    2122             : 
    2123             :         /*
    2124             :          * Advance pos and try again.  Any later workable match must start
    2125             :          * beyond posb.
    2126             :          */
    2127          60 :         pos = posb + 1;
    2128             :     }
    2129             :     /* Can't get here, but stupider compilers complain if we leave it off */
    2130             :     return false;
    2131             : }
    2132             : 
    2133             : /*
    2134             :  * Apply suitable highlight marking to words selected by headline selector
    2135             :  *
    2136             :  * The words from startpos to endpos inclusive are marked per highlightall
                     :  *
                     :  * For each word in the range: "selected" is set if the word has a non-NULL
                     :  * item, and "in" is set to 1 unless the word is flagged repeated, else 0.
                     :  * When highlightall is false, HLIDREPLACE token types get "replace" and
                     :  * HLIDSKIP types get "skip"; when it is true, only "skip" is set, for
                     :  * XMLHLIDSKIP types.  Nothing is done if startpos > endpos.
    2137             :  */
    2138             : static void
    2139         386 : mark_fragment(HeadlineParsedText *prs, bool highlightall,
    2140             :               int startpos, int endpos)
    2141             : {
    2142             :     int         i;
    2143             : 
    2144        5654 :     for (i = startpos; i <= endpos; i++)
    2145             :     {
    2146        5268 :         if (prs->words[i].item)
    2147         500 :             prs->words[i].selected = 1;
    2148        5268 :         if (!highlightall)
    2149             :         {
    2150        5022 :             if (HLIDREPLACE(prs->words[i].type))
    2151           0 :                 prs->words[i].replace = 1;
    2152        5022 :             else if (HLIDSKIP(prs->words[i].type))
    2153           0 :                 prs->words[i].skip = 1;
    2154             :         }
    2155             :         else
    2156             :         {
    2157         246 :             if (XMLHLIDSKIP(prs->words[i].type))
    2158           6 :                 prs->words[i].skip = 1;
    2159             :         }
    2160             : 
    2161        5268 :         prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
    2162             :     }
    2163         386 : }
    2164             : 
    2165             : /*
    2166             :  * Split a cover substring into fragments not longer than max_words;
    2167             :  * each call produces the next fragment.
    2168             :  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
    2169             :  * substring.  They are updated to hold the bounds of the next fragment.
    2170             :  * Word classification relies on the NONWORDTOKEN and INTERESTINGWORD macros.
    2171             :  * *curlen and *poslen are set to the fragment's length, in words and
    2172             :  * interesting words respectively.
    2173             :  */
    2174             : static void
    2175          36 : get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
    2176             :                   int *curlen, int *poslen, int max_words)
    2177             : {
    2178             :     int         i;
    2179             : 
    2180             :     /*
    2181             :      * Objective: select a fragment of words between startpos and endpos such
    2182             :      * that it has at most max_words words and both ends have query words. If
    2183             :      * startpos and endpos are the endpoints of the cover and the cover has
    2184             :      * fewer words than max_words, then this function should just return the
    2185             :      * cover.
    2186             :      */
    2187             :     /* first move startpos forward to an interesting word (it ends at endpos if there is none) */
    2188         888 :     for (i = *startpos; i <= *endpos; i++)
    2189             :     {
    2190         888 :         *startpos = i;
    2191         888 :         if (INTERESTINGWORD(i))
    2192          36 :             break;
    2193             :     }
    2194             :     /* count words from startpos, stopping once max_words have been seen */
    2195          36 :     *curlen = 0;
    2196          36 :     *poslen = 0;
    2197         960 :     for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
    2198             :     {
    2199         924 :         if (!NONWORDTOKEN(prs->words[i].type))
    2200         480 :             *curlen += 1;
    2201         924 :         if (INTERESTINGWORD(i))
    2202          54 :             *poslen += 1;
    2203             :     }
    2204             :     /* if the cover was cut then move endpos back to an interesting word */
    2205          36 :     if (*endpos > i)                /* i is the first index the counting loop did not examine */
    2206             :     {
    2207          12 :         *endpos = i;                /* NOTE(review): word i was never counted in *curlen/*poslen -- confirm intended */
    2208         840 :         for (i = *endpos; i >= *startpos; i--)
    2209             :         {
    2210         840 :             *endpos = i;
    2211         840 :             if (INTERESTINGWORD(i))
    2212          12 :                 break;
    2213         828 :             if (!NONWORDTOKEN(prs->words[i].type))
    2214         408 :                 *curlen -= 1;       /* word dropped from the fragment's tail */
    2215             :         }
    2216             :     }
    2217          36 : }
    2218             : 
    2219             : /*
    2220             :  * Headline selector used when MaxFragments > 0: marks up to max_fragments
    2221             :  * fragments; if none gets marked, the first min_words words are marked.
    2222             :  * Note: in this mode, highlightall is disregarded for phrase selection;
    2223             :  * it only controls presentation details.
    2224             :  */
    2225             : static void
    2226          30 : mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations,
    2227             :                   bool highlightall,
    2228             :                   int shortword, int min_words,     /* shortword is not named below; presumably read by BADENDPOINT -- confirm */
    2229             :                   int max_words, int max_fragments)
    2230             : {
    2231             :     int32       poslen,
    2232             :                 curlen,
    2233             :                 i,
    2234             :                 f,
    2235          30 :                 num_f = 0;          /* number of fragments marked so far */
    2236             :     int32       stretch,
    2237             :                 maxstretch,
    2238             :                 posmarker;
    2239             : 
    2240          30 :     int32       startpos = 0,
    2241          30 :                 endpos = 0,
    2242          30 :                 nextpos = 0,
    2243          30 :                 p = 0,
    2244          30 :                 q = 0;
    2245             : 
    2246          30 :     int32       numcovers = 0,
    2247          30 :                 maxcovers = 32;     /* initial capacity of covers[] */
    2248             : 
    2249             :     int32       minI,
    2250             :                 minwords,
    2251             :                 maxitems;
    2252             :     CoverPos   *covers;
    2253             : 
    2254          30 :     covers = palloc(maxcovers * sizeof(CoverPos));  /* doubled below whenever it fills up */
    2255             : 
    2256             :     /* get all covers; hlCover reports each cover's bounds in p and q */
    2257          54 :     while (hlCover(prs, query, locations, &nextpos, &p, &q))
    2258             :     {
    2259          24 :         startpos = p;
    2260          24 :         endpos = q;
    2261             : 
    2262             :         /*
    2263             :          * Break the cover into smaller fragments such that each fragment has
    2264             :          * at most max_words. Also ensure that each end of each fragment is a
    2265             :          * query word. This will allow us to stretch the fragment in either
    2266             :          * direction.
    2267             :          */
    2268             : 
    2269          60 :         while (startpos <= endpos)
    2270             :         {
    2271          36 :             get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
    2272          36 :             if (numcovers >= maxcovers)
    2273             :             {
    2274           0 :                 maxcovers *= 2;
    2275           0 :                 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
    2276             :             }
    2277          36 :             covers[numcovers].startpos = startpos;
    2278          36 :             covers[numcovers].endpos = endpos;
    2279          36 :             covers[numcovers].curlen = curlen;
    2280          36 :             covers[numcovers].poslen = poslen;
    2281          36 :             covers[numcovers].chosen = false;
    2282          36 :             covers[numcovers].excluded = false;
    2283          36 :             numcovers++;
    2284          36 :             startpos = endpos + 1;  /* next fragment starts right after this one ... */
    2285          36 :             endpos = q;             /* ... and may extend to the end of the same cover */
    2286             :         }
    2287             :     }
    2288             : 
    2289             :     /* choose best covers, at most one per iteration */
    2290          66 :     for (f = 0; f < max_fragments; f++)
    2291             :     {
    2292          48 :         maxitems = 0;
    2293          48 :         minwords = PG_INT32_MAX;
    2294          48 :         minI = -1;                  /* -1 means no selectable cover was found */
    2295             : 
    2296             :         /*
    2297             :          * Choose the cover that contains max items. In case of tie choose the
    2298             :          * one with smaller number of words.
    2299             :          */
    2300         114 :         for (i = 0; i < numcovers; i++)
    2301             :         {
    2302          66 :             if (!covers[i].chosen && !covers[i].excluded &&
    2303          48 :                 (maxitems < covers[i].poslen ||
    2304          12 :                  (maxitems == covers[i].poslen &&
    2305          12 :                   minwords > covers[i].curlen)))
    2306             :             {
    2307          36 :                 maxitems = covers[i].poslen;
    2308          36 :                 minwords = covers[i].curlen;
    2309          36 :                 minI = i;
    2310             :             }
    2311             :         }
    2312             :         /* if a cover was found mark it */
    2313          48 :         if (minI >= 0)
    2314             :         {
    2315          36 :             covers[minI].chosen = true;
    2316             :             /* adjust the size of cover */
    2317          36 :             startpos = covers[minI].startpos;
    2318          36 :             endpos = covers[minI].endpos;
    2319          36 :             curlen = covers[minI].curlen;
    2320             :             /* stretch the cover if cover size is lower than max_words */
    2321          36 :             if (curlen < max_words)
    2322             :             {
    2323             :                 /* divide the stretch on both sides of cover */
    2324          36 :                 maxstretch = (max_words - curlen) / 2;
    2325             : 
    2326             :                 /*
    2327             :                  * First stretch startpos leftwards; stop stretching when (1)
    2328             :                  * we hit the beginning of the document, (2) we reach
    2329             :                  * maxstretch, or (3) we hit an already marked fragment.
    2330             :                  */
    2331          36 :                 stretch = 0;
    2332          36 :                 posmarker = startpos;
    2333         600 :                 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
    2334             :                 {
    2335         564 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2336             :                     {
    2337         270 :                         curlen++;
    2338         270 :                         stretch++;
    2339             :                     }
    2340         564 :                     posmarker = i;  /* leftmost position reached so far */
    2341             :                 }
    2342             :                 /* cut back startpos till we find a good endpoint */
    2343         132 :                 for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
    2344             :                 {
    2345          96 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2346          36 :                         curlen--;
    2347             :                 }
    2348          36 :                 startpos = i;
    2349             :                 /* now stretch the endpos as much as possible, up to max_words in total */
    2350          36 :                 posmarker = endpos;
    2351         966 :                 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
    2352             :                 {
    2353         930 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2354         462 :                         curlen++;
    2355         930 :                     posmarker = i;  /* rightmost position reached so far */
    2356             :                 }
    2357             :                 /* cut back endpos till we find a good endpoint */
    2358          90 :                 for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
    2359             :                 {
    2360          54 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2361          24 :                         curlen--;
    2362             :                 }
    2363          36 :                 endpos = i;
    2364             :             }
    2365          36 :             covers[minI].startpos = startpos;   /* store the adjusted bounds and length back */
    2366          36 :             covers[minI].endpos = endpos;
    2367          36 :             covers[minI].curlen = curlen;
    2368             :             /* Mark the chosen fragments (covers) */
    2369          36 :             mark_fragment(prs, highlightall, startpos, endpos);
    2370          36 :             num_f++;
    2371             :             /* Exclude covers overlapping this one from future consideration */
    2372          96 :             for (i = 0; i < numcovers; i++)
    2373             :             {
    2374          60 :                 if (i != minI &&
    2375          24 :                     ((covers[i].startpos >= startpos &&     /* its start lies inside the fragment */
    2376          12 :                       covers[i].startpos <= endpos) ||
    2377          24 :                      (covers[i].endpos >= startpos &&       /* its end lies inside the fragment */
    2378          12 :                       covers[i].endpos <= endpos) ||
    2379          24 :                      (covers[i].startpos < startpos &&      /* it encloses the whole fragment */
    2380          12 :                       covers[i].endpos > endpos)))
    2381           0 :                     covers[i].excluded = true;
    2382             :             }
    2383             :         }
    2384             :         else
    2385          12 :             break;              /* no selectable covers remain */
    2386             :     }
    2387             : 
    2388             :     /* show the first min_words words if we have not marked anything */
    2389          30 :     if (num_f <= 0)
    2390             :     {
    2391           6 :         startpos = curlen = 0;
    2392           6 :         endpos = -1;            /* stays -1 if there are no words, so nothing gets marked */
    2393         186 :         for (i = 0; i < prs->curwords && curlen < min_words; i++)
    2394             :         {
    2395         180 :             if (!NONWORDTOKEN(prs->words[i].type))
    2396          90 :                 curlen++;
    2397         180 :             endpos = i;
    2398             :         }
    2399           6 :         mark_fragment(prs, highlightall, startpos, endpos);
    2400             :     }
    2401             : 
    2402          30 :     pfree(covers);
    2403          30 : }
    2404             : 
    2405             : /*
    2406             :  * Headline selector used when MaxFragments == 0: marks a single range of words.
    2407             :  */
    2408             : static void
    2409         344 : mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations,
    2410             :               bool highlightall,
    2411             :               int shortword, int min_words, int max_words)  /* shortword is not named below; presumably read by BADENDPOINT -- confirm */
    2412             : {
    2413         344 :     int         nextpos = 0,
    2414         344 :                 p = 0,
    2415         344 :                 q = 0;
    2416         344 :     int         bestb = -1,         /* start and end indexes of the best headline so far */
    2417         344 :                 beste = -1;
    2418         344 :     int         bestlen = -1;       /* its count of interesting words; -1 = none adopted yet */
    2419         344 :     bool        bestcover = false;  /* whether it includes its whole cover */
    2420             :     int         pose,
    2421             :                 posb,
    2422             :                 poslen,
    2423             :                 curlen;
    2424             :     bool        poscover;
    2425             :     int         i;
    2426             : 
    2427         344 :     if (!highlightall)
    2428             :     {
    2429             :         /* examine all covers, select a headline using the best one */
    2430         508 :         while (hlCover(prs, query, locations, &nextpos, &p, &q))
    2431             :         {
    2432             :             /*
    2433             :              * Count words (curlen) and interesting words (poslen) within
    2434             :              * cover, but stop once we reach max_words.  This step doesn't
    2435             :              * consider whether that's a good stopping point.  posb and pose
    2436             :              * are set to the start and end indexes of the possible headline.
    2437             :              */
    2438         170 :             curlen = 0;
    2439         170 :             poslen = 0;
    2440         170 :             posb = pose = p;
    2441        1456 :             for (i = p; i <= q && curlen < max_words; i++)
    2442             :             {
    2443        1286 :                 if (!NONWORDTOKEN(prs->words[i].type))
    2444         728 :                     curlen++;
    2445        1286 :                 if (INTERESTINGWORD(i))
    2446         290 :                     poslen++;
    2447        1286 :                 pose = i;
    2448             :             }
    2449             : 
    2450         170 :             if (curlen < max_words)
    2451             :             {
    2452             :                 /*
    2453             :                  * We have room to lengthen the headline, so search forward
    2454             :                  * until it's full or we find a good stopping point.  We'll
    2455             :                  * reconsider the word at "q", then move forward.
    2456             :                  */
    2457        2938 :                 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
    2458             :                 {
    2459        2912 :                     if (i > q)      /* words up to q were already counted above */
    2460             :                     {
    2461        2754 :                         if (!NONWORDTOKEN(prs->words[i].type))
    2462        1374 :                             curlen++;
    2463        2754 :                         if (INTERESTINGWORD(i))
    2464         120 :                             poslen++;
    2465             :                     }
    2466        2912 :                     pose = i;
    2467        2912 :                     if (BADENDPOINT(i))
    2468        1944 :                         continue;
    2469         968 :                     if (curlen >= min_words)
    2470         132 :                         break;
    2471             :                 }
    2472         158 :                 if (curlen < min_words)
    2473             :                 {
    2474             :                     /*
    2475             :                      * Reached end of text and our headline is still shorter
    2476             :                      * than min_words, so try to extend it to the left.
    2477             :                      */
    2478         366 :                     for (i = p - 1; i >= 0; i--)
    2479             :                     {
    2480         364 :                         if (!NONWORDTOKEN(prs->words[i].type))
    2481         182 :                             curlen++;
    2482         364 :                         if (INTERESTINGWORD(i))
    2483           6 :                             poslen++;
    2484         364 :                         if (curlen >= max_words)
    2485           0 :                             break;
    2486         364 :                         if (BADENDPOINT(i))
    2487         236 :                             continue;
    2488         128 :                         if (curlen >= min_words)
    2489          24 :                             break;
    2490             :                     }
    2491          26 :                     posb = (i >= 0) ? i : 0;    /* i is -1 if the loop ran off the start of the text */
    2492             :                 }
    2493             :             }
    2494             :             else
    2495             :             {
    2496             :                 /*
    2497             :                  * Can't make headline longer, so consider making it shorter
    2498             :                  * if needed to avoid a bad endpoint.
    2499             :                  */
    2500          12 :                 if (i > q)          /* counting loop ran past q; start trimming at q */
    2501           6 :                     i = q;
    2502          30 :                 for (; curlen > min_words; i--)
    2503             :                 {
    2504          30 :                     if (!BADENDPOINT(i))
    2505             :                         break;
    2506          18 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2507           6 :                         curlen--;
    2508          18 :                     if (INTERESTINGWORD(i))
    2509           0 :                         poslen--;
    2510          18 :                     pose = i - 1;   /* word i is dropped from the headline */
    2511             :                 }
    2512             :             }
    2513             : 
    2514             :             /*
    2515             :              * Check whether the proposed headline includes the original
    2516             :              * cover; it might not if we trimmed it due to max_words.
    2517             :              */
    2518         170 :             poscover = (posb <= p && pose >= q);
    2519             : 
    2520             :             /*
    2521             :              * Adopt this headline if it's better than the last one, giving
    2522             :              * highest priority to headlines including the cover, then to
    2523             :              * headlines with more interesting words, then to headlines with
    2524             :              * good stopping points.  (Since bestlen is initially -1, we will
    2525             :              * certainly adopt the first headline.)
    2526             :              */
    2527         170 :             if (poscover > bestcover ||     /* i.e. poscover is true and bestcover is false */
    2528          78 :                 (poscover == bestcover && poslen > bestlen) ||
    2529          72 :                 (poscover == bestcover && poslen == bestlen &&
    2530          12 :                  !BADENDPOINT(pose) && BADENDPOINT(beste)))
    2531             :             {
    2532          98 :                 bestb = posb;
    2533          98 :                 beste = pose;
    2534          98 :                 bestlen = poslen;
    2535          98 :                 bestcover = poscover;
    2536             :             }
    2537             :         }
    2538             : 
    2539             :         /*
    2540             :          * If we found nothing acceptable, select min_words words starting at
    2541             :          * the beginning.
    2542             :          */
    2543         338 :         if (bestlen < 0)            /* still -1: no headline was adopted above */
    2544             :         {
    2545         240 :             curlen = 0;
    2546         240 :             pose = -1;              /* stays -1 if there are no words, so nothing gets marked */
    2547        1038 :             for (i = 0; i < prs->curwords && curlen < min_words; i++)
    2548             :             {
    2549         798 :                 if (!NONWORDTOKEN(prs->words[i].type))
    2550         516 :                     curlen++;
    2551         798 :                 pose = i;
    2552             :             }
    2553         240 :             bestb = 0;
    2554         240 :             beste = pose;
    2555             :         }
    2556             :     }
    2557             :     else
    2558             :     {
    2559             :         /* highlightall mode: headline is whole document */
    2560           6 :         bestb = 0;
    2561           6 :         beste = prs->curwords - 1;
    2562             :     }
    2563             : 
    2564         344 :     mark_fragment(prs, highlightall, bestb, beste);
    2565         344 : }
    2566             : 
    2567             : /*
    2568             :  * Default parser's prsheadline function: reads options, locates query matches, marks headline words in prs.
    2569             :  */
    2570             : Datum
    2571         374 : prsd_headline(PG_FUNCTION_ARGS)
    2572             : {
    2573         374 :     HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0); /* arg 0: parsed text, updated in place and returned */
    2574         374 :     List       *prsoptions = (List *) PG_GETARG_POINTER(1);    /* arg 1: list of DefElem option settings */
    2575         374 :     TSQuery     query = PG_GETARG_TSQUERY(2);                  /* arg 2: the query */
    2576             :     List       *locations;
    2577             : 
    2578             :     /* default option values: */
    2579         374 :     int         min_words = 15;
    2580         374 :     int         max_words = 35;
    2581         374 :     int         shortword = 3;
    2582         374 :     int         max_fragments = 0;
    2583         374 :     bool        highlightall = false;
    2584             :     ListCell   *l;
    2585             : 
    2586             :     /* Extract configuration option values; string options stay NULL until set */
    2587         374 :     prs->startsel = NULL;
    2588         374 :     prs->stopsel = NULL;
    2589         374 :     prs->fragdelim = NULL;
    2590         728 :     foreach(l, prsoptions)
    2591             :     {
    2592         354 :         DefElem    *defel = (DefElem *) lfirst(l);
    2593         354 :         char       *val = defGetString(defel);
    2594             : 
    2595         354 :         if (pg_strcasecmp(defel->defname, "MaxWords") == 0)    /* option names are compared with pg_strcasecmp */
    2596          36 :             max_words = pg_strtoint32(val);
    2597         318 :         else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
    2598          36 :             min_words = pg_strtoint32(val);
    2599         282 :         else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
    2600           0 :             shortword = pg_strtoint32(val);
    2601         282 :         else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
    2602          30 :             max_fragments = pg_strtoint32(val);
    2603         252 :         else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
    2604         120 :             prs->startsel = pstrdup(val);
    2605         132 :         else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
    2606         120 :             prs->stopsel = pstrdup(val);
    2607          12 :         else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
    2608           6 :             prs->fragdelim = pstrdup(val);
    2609           6 :         else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)   /* any value other than those listed below yields false */
    2610          18 :             highlightall = (pg_strcasecmp(val, "1") == 0 ||
    2611          12 :                             pg_strcasecmp(val, "on") == 0 ||
    2612           6 :                             pg_strcasecmp(val, "true") == 0 ||
    2613           0 :                             pg_strcasecmp(val, "t") == 0 ||
    2614          12 :                             pg_strcasecmp(val, "y") == 0 ||
    2615           0 :                             pg_strcasecmp(val, "yes") == 0);
    2616             :         else
    2617           0 :             ereport(ERROR,
    2618             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2619             :                      errmsg("unrecognized headline parameter: \"%s\"",
    2620             :                             defel->defname)));
    2621             :     }
    2622             : 
    2623             :     /* in HighlightAll mode these parameters are ignored, so they are not validated */
    2624         374 :     if (!highlightall)
    2625             :     {
    2626         368 :         if (min_words >= max_words)
    2627           0 :             ereport(ERROR,
    2628             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2629             :                      errmsg("%s must be less than %s", "MinWords", "MaxWords")));
    2630         368 :         if (min_words <= 0)
    2631           0 :             ereport(ERROR,
    2632             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2633             :                      errmsg("%s must be positive", "MinWords")));
    2634         368 :         if (shortword < 0)
    2635           0 :             ereport(ERROR,
    2636             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2637             :                      errmsg("%s must be >= 0", "ShortWord")));
    2638         368 :         if (max_fragments < 0)
    2639           0 :             ereport(ERROR,
    2640             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2641             :                      errmsg("%s must be >= 0", "MaxFragments")));
    2642             :     }
    2643             : 
    2644             :     /* Locate words and phrases matching the query */
    2645         374 :     if (query->size > 0)
    2646             :     {
    2647             :         hlCheck     ch;
    2648             : 
    2649         362 :         ch.words = prs->words;
    2650         362 :         ch.len = prs->curwords;
    2651         362 :         locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
    2652             :                                          checkcondition_HL);
    2653             :     }
    2654             :     else
    2655          12 :         locations = NIL;        /* empty query matches nothing */
    2656             : 
    2657             :     /* Apply appropriate headline selector; the choice depends only on max_fragments */
    2658         374 :     if (max_fragments == 0)
    2659         344 :         mark_hl_words(prs, query, locations, highlightall, shortword,
    2660             :                       min_words, max_words);
    2661             :     else
    2662          30 :         mark_hl_fragments(prs, query, locations, highlightall, shortword,
    2663             :                           min_words, max_words, max_fragments);
    2664             : 
    2665             :     /* Fill in default values for string options that were not supplied */
    2666         374 :     if (!prs->startsel)
    2667         254 :         prs->startsel = pstrdup("<b>");
    2668         374 :     if (!prs->stopsel)
    2669         254 :         prs->stopsel = pstrdup("</b>");
    2670         374 :     if (!prs->fragdelim)
    2671         368 :         prs->fragdelim = pstrdup(" ... ");
    2672             : 
    2673             :     /* Caller will need these lengths, too */
    2674         374 :     prs->startsellen = strlen(prs->startsel);
    2675         374 :     prs->stopsellen = strlen(prs->stopsel);
    2676         374 :     prs->fragdelimlen = strlen(prs->fragdelim);
    2677             : 
    2678         374 :     PG_RETURN_POINTER(prs);
    2679             : }

Generated by: LCOV version 1.16