LCOV - code coverage report
Current view: top level - src/backend/tsearch - wparser_def.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 89.2 % 603 538
Test Date: 2026-02-17 17:20:33 Functions: 71.2 % 52 37
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * wparser_def.c
       4              :  *      Default text search parser
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  *
       9              :  * IDENTIFICATION
      10              :  *    src/backend/tsearch/wparser_def.c
      11              :  *
      12              :  *-------------------------------------------------------------------------
      13              :  */
      14              : 
      15              : #include "postgres.h"
      16              : 
      17              : #include <limits.h>
      18              : #include <wctype.h>
      19              : 
      20              : #include "commands/defrem.h"
      21              : #include "mb/pg_wchar.h"
      22              : #include "miscadmin.h"
      23              : #include "tsearch/ts_public.h"
      24              : #include "tsearch/ts_type.h"
      25              : #include "tsearch/ts_utils.h"
      26              : #include "utils/builtins.h"
      27              : #include "utils/pg_locale.h"
      28              : 
      29              : 
      30              : /* Define me to enable tracing of parser behavior */
      31              : /* #define WPARSER_TRACE */
      32              : 
      33              : 
      34              : /* Output token categories */
      35              : 
      36              : #define ASCIIWORD       1
      37              : #define WORD_T          2
      38              : #define NUMWORD         3
      39              : #define EMAIL           4
      40              : #define URL_T           5
      41              : #define HOST            6
      42              : #define SCIENTIFIC      7
      43              : #define VERSIONNUMBER   8
      44              : #define NUMPARTHWORD    9
      45              : #define PARTHWORD       10
      46              : #define ASCIIPARTHWORD  11
      47              : #define SPACE           12
      48              : #define TAG_T           13
      49              : #define PROTOCOL        14
      50              : #define NUMHWORD        15
      51              : #define ASCIIHWORD      16
      52              : #define HWORD           17
      53              : #define URLPATH         18
      54              : #define FILEPATH        19
      55              : #define DECIMAL_T       20
      56              : #define SIGNEDINT       21
      57              : #define UNSIGNEDINT     22
      58              : #define XMLENTITY       23
      59              : 
      60              : #define LASTNUM         23
      61              : 
      62              : static const char *const tok_alias[] = {
      63              :     "",
      64              :     "asciiword",
      65              :     "word",
      66              :     "numword",
      67              :     "email",
      68              :     "url",
      69              :     "host",
      70              :     "sfloat",
      71              :     "version",
      72              :     "hword_numpart",
      73              :     "hword_part",
      74              :     "hword_asciipart",
      75              :     "blank",
      76              :     "tag",
      77              :     "protocol",
      78              :     "numhword",
      79              :     "asciihword",
      80              :     "hword",
      81              :     "url_path",
      82              :     "file",
      83              :     "float",
      84              :     "int",
      85              :     "uint",
      86              :     "entity"
      87              : };
      88              : 
      89              : static const char *const lex_descr[] = {
      90              :     "",
      91              :     "Word, all ASCII",
      92              :     "Word, all letters",
      93              :     "Word, letters and digits",
      94              :     "Email address",
      95              :     "URL",
      96              :     "Host",
      97              :     "Scientific notation",
      98              :     "Version number",
      99              :     "Hyphenated word part, letters and digits",
     100              :     "Hyphenated word part, all letters",
     101              :     "Hyphenated word part, all ASCII",
     102              :     "Space symbols",
     103              :     "XML tag",
     104              :     "Protocol head",
     105              :     "Hyphenated word, letters and digits",
     106              :     "Hyphenated word, all ASCII",
     107              :     "Hyphenated word, all letters",
     108              :     "URL path",
     109              :     "File or path name",
     110              :     "Decimal notation",
     111              :     "Signed integer",
     112              :     "Unsigned integer",
     113              :     "XML entity"
     114              : };
     115              : 
     116              : 
     117              : /* Parser states */
     118              : 
     119              : typedef enum
     120              : {
     121              :     TPS_Base = 0,
     122              :     TPS_InNumWord,
     123              :     TPS_InAsciiWord,
     124              :     TPS_InWord,
     125              :     TPS_InUnsignedInt,
     126              :     TPS_InSignedIntFirst,
     127              :     TPS_InSignedInt,
     128              :     TPS_InSpace,
     129              :     TPS_InUDecimalFirst,
     130              :     TPS_InUDecimal,
     131              :     TPS_InDecimalFirst,
     132              :     TPS_InDecimal,
     133              :     TPS_InVerVersion,
     134              :     TPS_InSVerVersion,
     135              :     TPS_InVersionFirst,
     136              :     TPS_InVersion,
     137              :     TPS_InMantissaFirst,
     138              :     TPS_InMantissaSign,
     139              :     TPS_InMantissa,
     140              :     TPS_InXMLEntityFirst,
     141              :     TPS_InXMLEntity,
     142              :     TPS_InXMLEntityNumFirst,
     143              :     TPS_InXMLEntityNum,
     144              :     TPS_InXMLEntityHexNumFirst,
     145              :     TPS_InXMLEntityHexNum,
     146              :     TPS_InXMLEntityEnd,
     147              :     TPS_InTagFirst,
     148              :     TPS_InXMLBegin,
     149              :     TPS_InTagCloseFirst,
     150              :     TPS_InTagName,
     151              :     TPS_InTagBeginEnd,
     152              :     TPS_InTag,
     153              :     TPS_InTagEscapeK,
     154              :     TPS_InTagEscapeKK,
     155              :     TPS_InTagBackSleshed,
     156              :     TPS_InTagEnd,
     157              :     TPS_InCommentFirst,
     158              :     TPS_InCommentLast,
     159              :     TPS_InComment,
     160              :     TPS_InCloseCommentFirst,
     161              :     TPS_InCloseCommentLast,
     162              :     TPS_InCommentEnd,
     163              :     TPS_InHostFirstDomain,
     164              :     TPS_InHostDomainSecond,
     165              :     TPS_InHostDomain,
     166              :     TPS_InPortFirst,
     167              :     TPS_InPort,
     168              :     TPS_InHostFirstAN,
     169              :     TPS_InHost,
     170              :     TPS_InEmail,
     171              :     TPS_InFileFirst,
     172              :     TPS_InFileTwiddle,
     173              :     TPS_InPathFirst,
     174              :     TPS_InPathFirstFirst,
     175              :     TPS_InPathSecond,
     176              :     TPS_InFile,
     177              :     TPS_InFileNext,
     178              :     TPS_InURLPathFirst,
     179              :     TPS_InURLPathStart,
     180              :     TPS_InURLPath,
     181              :     TPS_InFURL,
     182              :     TPS_InProtocolFirst,
     183              :     TPS_InProtocolSecond,
     184              :     TPS_InProtocolEnd,
     185              :     TPS_InHyphenAsciiWordFirst,
     186              :     TPS_InHyphenAsciiWord,
     187              :     TPS_InHyphenWordFirst,
     188              :     TPS_InHyphenWord,
     189              :     TPS_InHyphenNumWordFirst,
     190              :     TPS_InHyphenNumWord,
     191              :     TPS_InHyphenDigitLookahead,
     192              :     TPS_InParseHyphen,
     193              :     TPS_InParseHyphenHyphen,
     194              :     TPS_InHyphenWordPart,
     195              :     TPS_InHyphenAsciiWordPart,
     196              :     TPS_InHyphenNumWordPart,
     197              :     TPS_InHyphenUnsignedInt,
     198              :     TPS_Null                    /* last state (fake value) */
     199              : } TParserState;
     200              : 
     201              : /* forward declaration */
     202              : struct TParser;
     203              : 
     204              : typedef int (*TParserCharTest) (struct TParser *);  /* any p_is* functions
     205              :                                                      * except p_iseq */
     206              : typedef void (*TParserSpecial) (struct TParser *);  /* special handler for
     207              :                                                      * special cases... */
     208              : 
     209              : typedef struct
     210              : {
     211              :     TParserCharTest isclass;
     212              :     char        c;
     213              :     uint16      flags;
     214              :     TParserState tostate;
     215              :     int         type;
     216              :     TParserSpecial special;
     217              : } TParserStateActionItem;
     218              : 
     219              : /* Flag bits in TParserStateActionItem.flags */
     220              : #define A_NEXT      0x0000
     221              : #define A_BINGO     0x0001
     222              : #define A_POP       0x0002
     223              : #define A_PUSH      0x0004
     224              : #define A_RERUN     0x0008
     225              : #define A_CLEAR     0x0010
     226              : #define A_MERGE     0x0020
     227              : #define A_CLRALL    0x0040
     228              : 
     229              : typedef struct TParserPosition
     230              : {
     231              :     int         posbyte;        /* position of parser in bytes */
     232              :     int         poschar;        /* position of parser in characters */
     233              :     int         charlen;        /* length of current char */
     234              :     int         lenbytetoken;   /* length of token-so-far in bytes */
     235              :     int         lenchartoken;   /* and in chars */
     236              :     TParserState state;
     237              :     struct TParserPosition *prev;
     238              :     const TParserStateActionItem *pushedAtAction;
     239              : } TParserPosition;
     240              : 
     241              : typedef struct TParser
     242              : {
     243              :     /* string and position information */
     244              :     char       *str;            /* multibyte string */
     245              :     int         lenstr;         /* length of mbstring */
     246              :     pg_wchar   *pgwstr;         /* wide character string for C-locale */
     247              : 
     248              :     /* State of parse */
     249              :     int         charmaxlen;
     250              :     TParserPosition *state;
     251              :     bool        ignore;
     252              :     bool        wanthost;
     253              : 
     254              :     /* char that p_iseqC() and p_isneC() compare against */
     255              :     char        c;
     256              : 
     257              :     /* out */
     258              :     char       *token;
     259              :     int         lenbytetoken;
     260              :     int         lenchartoken;
     261              :     int         type;
     262              : } TParser;
     263              : 
     264              : 
     265              : /* forward decls here */
     266              : static bool TParserGet(TParser *prs);
     267              : 
     268              : 
     269              : static TParserPosition *
     270         5116 : newTParserPosition(TParserPosition *prev)
     271              : {
     272         5116 :     TParserPosition *res = palloc_object(TParserPosition);
     273              : 
     274         5116 :     if (prev)
     275         2619 :         memcpy(res, prev, sizeof(TParserPosition));
     276              :     else
     277         2497 :         memset(res, 0, sizeof(TParserPosition));
     278              : 
     279         5116 :     res->prev = prev;
     280              : 
     281         5116 :     res->pushedAtAction = NULL;
     282              : 
     283         5116 :     return res;
     284              : }
     285              : 
     286              : static TParser *
     287         2377 : TParserInit(char *str, int len)
     288              : {
     289         2377 :     TParser    *prs = palloc0_object(TParser);
     290              : 
     291         2377 :     prs->charmaxlen = pg_database_encoding_max_length();
     292         2377 :     prs->str = str;
     293         2377 :     prs->lenstr = len;
     294         2377 :     prs->pgwstr = palloc_array(pg_wchar, prs->lenstr + 1);
     295         2377 :     pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
     296              : 
     297         2377 :     prs->state = newTParserPosition(NULL);
     298         2377 :     prs->state->state = TPS_Base;
     299              : 
     300              : #ifdef WPARSER_TRACE
     301              :     fprintf(stderr, "parsing \"%.*s\"\n", len, str);
     302              : #endif
     303              : 
     304         2377 :     return prs;
     305              : }
     306              : 
     307              : /*
     308              :  * As an alternative to a full TParserInit, one can use TParserCopyInit
     309              :  * to create a regular TParser that has no private copy of the
     310              :  * string - instead it uses the one from another TParser.
     311              :  * This is useful because in some places TParsers are created
     312              :  * recursively, and repeatedly copying the string around can
     313              :  * cause major inefficiency if the source string is long.
     314              :  * The new parser starts parsing at the original's current position.
     315              :  *
     316              :  * Obviously one must not close the original TParser before the copy.
     317              :  */
     318              : static TParser *
     319          120 : TParserCopyInit(const TParser *orig)
     320              : {
     321          120 :     TParser    *prs = palloc0_object(TParser);
     322              : 
     323          120 :     prs->charmaxlen = orig->charmaxlen;
     324          120 :     prs->str = orig->str + orig->state->posbyte;
     325          120 :     prs->lenstr = orig->lenstr - orig->state->posbyte;
     326              : 
     327          120 :     if (orig->pgwstr)
     328          120 :         prs->pgwstr = orig->pgwstr + orig->state->poschar;
     329              : 
     330          120 :     prs->state = newTParserPosition(NULL);
     331          120 :     prs->state->state = TPS_Base;
     332              : 
     333              : #ifdef WPARSER_TRACE
     334              :     fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
     335              : #endif
     336              : 
     337          120 :     return prs;
     338              : }
     339              : 
     340              : 
     341              : static void
     342         2377 : TParserClose(TParser *prs)
     343              : {
     344         4754 :     while (prs->state)
     345              :     {
     346         2377 :         TParserPosition *ptr = prs->state->prev;
     347              : 
     348         2377 :         pfree(prs->state);
     349         2377 :         prs->state = ptr;
     350              :     }
     351              : 
     352         2377 :     if (prs->pgwstr)
     353         2377 :         pfree(prs->pgwstr);
     354              : 
     355              : #ifdef WPARSER_TRACE
     356              :     fprintf(stderr, "closing parser\n");
     357              : #endif
     358         2377 :     pfree(prs);
     359         2377 : }
     360              : 
     361              : /*
     362              :  * Close a parser created with TParserCopyInit
     363              :  */
     364              : static void
     365          120 : TParserCopyClose(TParser *prs)
     366              : {
     367          306 :     while (prs->state)
     368              :     {
     369          186 :         TParserPosition *ptr = prs->state->prev;
     370              : 
     371          186 :         pfree(prs->state);
     372          186 :         prs->state = ptr;
     373              :     }
     374              : 
     375              : #ifdef WPARSER_TRACE
     376              :     fprintf(stderr, "closing parser copy\n");
     377              : #endif
     378          120 :     pfree(prs);
     379          120 : }
     380              : 
     381              : 
     382              : /*
     383              :  * Character-type support functions using the database default locale. If the
     384              :  * locale is C, and the input character is non-ascii, the value to be returned
     385              :  * is determined by the 'nonascii' macro argument.
     386              :  */
     387              : 
     388              : #define p_iswhat(type, nonascii)                                            \
     389              :                                                                             \
     390              : static int                                                                  \
     391              : p_is##type(TParser *prs)                                                    \
     392              : {                                                                           \
     393              :     pg_locale_t locale = pg_database_locale();                              \
     394              :     pg_wchar    wc;                                                         \
     395              :     Assert(prs->state);                                                      \
     396              :     wc = prs->pgwstr[prs->state->poschar];                                 \
     397              :     if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f)             \
     398              :         return nonascii;                                                    \
     399              :     return pg_isw##type(wc, pg_database_locale());                      \
     400              : }                                                                           \
     401              :                                                                             \
     402              : static int                                                                  \
     403              : p_isnot##type(TParser *prs)                                                 \
     404              : {                                                                           \
     405              :     return !p_is##type(prs);                                                \
     406              : }
     407              : 
     408              : /*
     409              :  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
     410              :  * an alpha character, but not a member of other char classes.
     411              :  */
     412        12561 : p_iswhat(alnum, 1)
     413        46928 : p_iswhat(alpha, 1)
     414        18566 : p_iswhat(digit, 0)
     415            0 : p_iswhat(lower, 0)
     416            0 : p_iswhat(print, 0)
     417            0 : p_iswhat(punct, 0)
     418          339 : p_iswhat(space, 0)
     419            0 : p_iswhat(upper, 0)
     420            9 : p_iswhat(xdigit, 0)
     421              : 
     422              : /* p_iseq should be used only for ascii symbols */
     423              : 
     424              : static int
     425       115696 : p_iseq(TParser *prs, char c)
     426              : {
     427              :     Assert(prs->state);
     428       115696 :     return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
     429              : }
     430              : 
     431              : static int
     432        50079 : p_isEOF(TParser *prs)
     433              : {
     434              :     Assert(prs->state);
     435        50079 :     return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
     436              : }
     437              : 
     438              : static int
     439       115696 : p_iseqC(TParser *prs)
     440              : {
     441       115696 :     return p_iseq(prs, prs->c);
     442              : }
     443              : 
     444              : static int
     445            0 : p_isneC(TParser *prs)
     446              : {
     447            0 :     return !p_iseq(prs, prs->c);
     448              : }
     449              : 
     450              : static int
     451        36772 : p_isascii(TParser *prs)
     452              : {
     453        36772 :     return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
     454              : }
     455              : 
     456              : static int
     457        36772 : p_isasclet(TParser *prs)
     458              : {
     459        36772 :     return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
     460              : }
     461              : 
     462              : static int
     463         1329 : p_isurlchar(TParser *prs)
     464              : {
     465              :     char        ch;
     466              : 
     467              :     /* no non-ASCII need apply */
     468         1329 :     if (prs->state->charlen != 1)
     469            0 :         return 0;
     470         1329 :     ch = *(prs->str + prs->state->posbyte);
     471              :     /* no spaces or control characters */
     472         1329 :     if (ch <= 0x20 || ch >= 0x7F)
     473          117 :         return 0;
     474              :     /* reject characters disallowed by RFC 3986 */
     475         1212 :     switch (ch)
     476              :     {
     477           12 :         case '"':
     478              :         case '<':
     479              :         case '>':
     480              :         case '\\':
     481              :         case '^':
     482              :         case '`':
     483              :         case '{':
     484              :         case '|':
     485              :         case '}':
     486           12 :             return 0;
     487              :     }
     488         1200 :     return 1;
     489              : }
     490              : 
     491              : 
     492              : /* deliberately suppress unused-function complaints for the above */
     493              : void        _make_compiler_happy(void);
     494              : void
     495            0 : _make_compiler_happy(void)
     496              : {
     497            0 :     p_isalnum(NULL);
     498            0 :     p_isnotalnum(NULL);
     499            0 :     p_isalpha(NULL);
     500            0 :     p_isnotalpha(NULL);
     501            0 :     p_isdigit(NULL);
     502            0 :     p_isnotdigit(NULL);
     503            0 :     p_islower(NULL);
     504            0 :     p_isnotlower(NULL);
     505            0 :     p_isprint(NULL);
     506            0 :     p_isnotprint(NULL);
     507            0 :     p_ispunct(NULL);
     508            0 :     p_isnotpunct(NULL);
     509            0 :     p_isspace(NULL);
     510            0 :     p_isnotspace(NULL);
     511            0 :     p_isupper(NULL);
     512            0 :     p_isnotupper(NULL);
     513            0 :     p_isxdigit(NULL);
     514            0 :     p_isnotxdigit(NULL);
     515            0 :     p_isEOF(NULL);
     516            0 :     p_iseqC(NULL);
     517            0 :     p_isneC(NULL);
     518            0 : }
     519              : 
     520              : 
     521              : static void
     522          126 : SpecialTags(TParser *prs)
     523              : {
     524          126 :     switch (prs->state->lenchartoken)
     525              :     {
     526            3 :         case 8:                 /* </script */
     527            3 :             if (pg_strncasecmp(prs->token, "</script", 8) == 0)
     528            3 :                 prs->ignore = false;
     529            3 :             break;
     530           12 :         case 7:                 /* <script || </style */
     531           12 :             if (pg_strncasecmp(prs->token, "</style", 7) == 0)
     532            0 :                 prs->ignore = false;
     533           12 :             else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
     534            3 :                 prs->ignore = true;
     535           12 :             break;
     536            9 :         case 6:                 /* <style */
     537            9 :             if (pg_strncasecmp(prs->token, "<style", 6) == 0)
     538            0 :                 prs->ignore = true;
     539            9 :             break;
     540          102 :         default:
     541          102 :             break;
     542              :     }
     543          126 : }
     544              : 
     545              : static void
     546           66 : SpecialFURL(TParser *prs)
     547              : {
     548           66 :     prs->wanthost = true;
     549           66 :     prs->state->posbyte -= prs->state->lenbytetoken;
     550           66 :     prs->state->poschar -= prs->state->lenchartoken;
     551           66 : }
     552              : 
     553              : static void
     554           18 : SpecialHyphen(TParser *prs)
     555              : {
     556           18 :     prs->state->posbyte -= prs->state->lenbytetoken;
     557           18 :     prs->state->poschar -= prs->state->lenchartoken;
     558           18 : }
     559              : 
     560              : static void
     561            0 : SpecialVerVersion(TParser *prs)
     562              : {
     563            0 :     prs->state->posbyte -= prs->state->lenbytetoken;
     564            0 :     prs->state->poschar -= prs->state->lenchartoken;
     565            0 :     prs->state->lenbytetoken = 0;
     566            0 :     prs->state->lenchartoken = 0;
     567            0 : }
     568              : 
     569              : static int
     570          240 : p_isstophost(TParser *prs)
     571              : {
     572          240 :     if (prs->wanthost)
     573              :     {
     574          102 :         prs->wanthost = false;
     575          102 :         return 1;
     576              :     }
     577          138 :     return 0;
     578              : }
     579              : 
     580              : static int
     581        18043 : p_isignore(TParser *prs)
     582              : {
     583        18043 :     return (prs->ignore) ? 1 : 0;
     584              : }
     585              : 
     586              : static int
     587           45 : p_ishost(TParser *prs)
     588              : {
     589           45 :     TParser    *tmpprs = TParserCopyInit(prs);
     590           45 :     int         res = 0;
     591              : 
     592           45 :     tmpprs->wanthost = true;
     593              : 
     594              :     /*
     595              :      * Check stack depth before recursing.  (Since TParserGet() doesn't
     596              :      * normally recurse, we put the cost of checking here not there.)
     597              :      */
     598           45 :     check_stack_depth();
     599              : 
     600           45 :     if (TParserGet(tmpprs) && tmpprs->type == HOST)
     601              :     {
     602           36 :         prs->state->posbyte += tmpprs->lenbytetoken;
     603           36 :         prs->state->poschar += tmpprs->lenchartoken;
     604           36 :         prs->state->lenbytetoken += tmpprs->lenbytetoken;
     605           36 :         prs->state->lenchartoken += tmpprs->lenchartoken;
     606           36 :         prs->state->charlen = tmpprs->state->charlen;
     607           36 :         res = 1;
     608              :     }
     609           45 :     TParserCopyClose(tmpprs);
     610              : 
     611           45 :     return res;
     612              : }
     613              : 
     614              : static int
     615           75 : p_isURLPath(TParser *prs)
     616              : {
     617           75 :     TParser    *tmpprs = TParserCopyInit(prs);
     618           75 :     int         res = 0;
     619              : 
     620           75 :     tmpprs->state = newTParserPosition(tmpprs->state);
     621           75 :     tmpprs->state->state = TPS_InURLPathFirst;
     622              : 
     623              :     /*
     624              :      * Check stack depth before recursing.  (Since TParserGet() doesn't
     625              :      * normally recurse, we put the cost of checking here not there.)
     626              :      */
     627           75 :     check_stack_depth();
     628              : 
     629           75 :     if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
     630              :     {
     631           66 :         prs->state->posbyte += tmpprs->lenbytetoken;
     632           66 :         prs->state->poschar += tmpprs->lenchartoken;
     633           66 :         prs->state->lenbytetoken += tmpprs->lenbytetoken;
     634           66 :         prs->state->lenchartoken += tmpprs->lenchartoken;
     635           66 :         prs->state->charlen = tmpprs->state->charlen;
     636           66 :         res = 1;
     637              :     }
     638           75 :     TParserCopyClose(tmpprs);
     639              : 
     640           75 :     return res;
     641              : }
     642              : 
     643              : /*
     644              :  * Returns true if the current character has zero display length or
     645              :  * is a special sign used in several languages.  Such characters do
     646              :  * not break a word even though they are not alphabetic; at the
     647              :  * beginning of a word, however, they are not part of it.
     648              :  */
     649              : static int
     650         4362 : p_isspecial(TParser *prs)
     651              : {
     652              :     /*
     653              :      * pg_dsplen could return -1 which means error or control character
     654              :      */
     655         4362 :     if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
     656            0 :         return 1;
     657              : 
     658              :     /*
     659              :      * Unicode Characters in the 'Mark, Spacing Combining' Category That
     660              :      * characters are not alpha although they are not breakers of word too.
     661              :      * Check that only in utf encoding, because other encodings aren't
     662              :      * supported by postgres or even exists.
     663              :      */
     664         4362 :     if (GetDatabaseEncoding() == PG_UTF8)
     665              :     {
     666              :         static const pg_wchar strange_letter[] = {
     667              :             /*
     668              :              * use binary search, so elements should be ordered
     669              :              */
     670              :             0x0903,             /* DEVANAGARI SIGN VISARGA */
     671              :             0x093E,             /* DEVANAGARI VOWEL SIGN AA */
     672              :             0x093F,             /* DEVANAGARI VOWEL SIGN I */
     673              :             0x0940,             /* DEVANAGARI VOWEL SIGN II */
     674              :             0x0949,             /* DEVANAGARI VOWEL SIGN CANDRA O */
     675              :             0x094A,             /* DEVANAGARI VOWEL SIGN SHORT O */
     676              :             0x094B,             /* DEVANAGARI VOWEL SIGN O */
     677              :             0x094C,             /* DEVANAGARI VOWEL SIGN AU */
     678              :             0x0982,             /* BENGALI SIGN ANUSVARA */
     679              :             0x0983,             /* BENGALI SIGN VISARGA */
     680              :             0x09BE,             /* BENGALI VOWEL SIGN AA */
     681              :             0x09BF,             /* BENGALI VOWEL SIGN I */
     682              :             0x09C0,             /* BENGALI VOWEL SIGN II */
     683              :             0x09C7,             /* BENGALI VOWEL SIGN E */
     684              :             0x09C8,             /* BENGALI VOWEL SIGN AI */
     685              :             0x09CB,             /* BENGALI VOWEL SIGN O */
     686              :             0x09CC,             /* BENGALI VOWEL SIGN AU */
     687              :             0x09D7,             /* BENGALI AU LENGTH MARK */
     688              :             0x0A03,             /* GURMUKHI SIGN VISARGA */
     689              :             0x0A3E,             /* GURMUKHI VOWEL SIGN AA */
     690              :             0x0A3F,             /* GURMUKHI VOWEL SIGN I */
     691              :             0x0A40,             /* GURMUKHI VOWEL SIGN II */
     692              :             0x0A83,             /* GUJARATI SIGN VISARGA */
     693              :             0x0ABE,             /* GUJARATI VOWEL SIGN AA */
     694              :             0x0ABF,             /* GUJARATI VOWEL SIGN I */
     695              :             0x0AC0,             /* GUJARATI VOWEL SIGN II */
     696              :             0x0AC9,             /* GUJARATI VOWEL SIGN CANDRA O */
     697              :             0x0ACB,             /* GUJARATI VOWEL SIGN O */
     698              :             0x0ACC,             /* GUJARATI VOWEL SIGN AU */
     699              :             0x0B02,             /* ORIYA SIGN ANUSVARA */
     700              :             0x0B03,             /* ORIYA SIGN VISARGA */
     701              :             0x0B3E,             /* ORIYA VOWEL SIGN AA */
     702              :             0x0B40,             /* ORIYA VOWEL SIGN II */
     703              :             0x0B47,             /* ORIYA VOWEL SIGN E */
     704              :             0x0B48,             /* ORIYA VOWEL SIGN AI */
     705              :             0x0B4B,             /* ORIYA VOWEL SIGN O */
     706              :             0x0B4C,             /* ORIYA VOWEL SIGN AU */
     707              :             0x0B57,             /* ORIYA AU LENGTH MARK */
     708              :             0x0BBE,             /* TAMIL VOWEL SIGN AA */
     709              :             0x0BBF,             /* TAMIL VOWEL SIGN I */
     710              :             0x0BC1,             /* TAMIL VOWEL SIGN U */
     711              :             0x0BC2,             /* TAMIL VOWEL SIGN UU */
     712              :             0x0BC6,             /* TAMIL VOWEL SIGN E */
     713              :             0x0BC7,             /* TAMIL VOWEL SIGN EE */
     714              :             0x0BC8,             /* TAMIL VOWEL SIGN AI */
     715              :             0x0BCA,             /* TAMIL VOWEL SIGN O */
     716              :             0x0BCB,             /* TAMIL VOWEL SIGN OO */
     717              :             0x0BCC,             /* TAMIL VOWEL SIGN AU */
     718              :             0x0BD7,             /* TAMIL AU LENGTH MARK */
     719              :             0x0C01,             /* TELUGU SIGN CANDRABINDU */
     720              :             0x0C02,             /* TELUGU SIGN ANUSVARA */
     721              :             0x0C03,             /* TELUGU SIGN VISARGA */
     722              :             0x0C41,             /* TELUGU VOWEL SIGN U */
     723              :             0x0C42,             /* TELUGU VOWEL SIGN UU */
     724              :             0x0C43,             /* TELUGU VOWEL SIGN VOCALIC R */
     725              :             0x0C44,             /* TELUGU VOWEL SIGN VOCALIC RR */
     726              :             0x0C82,             /* KANNADA SIGN ANUSVARA */
     727              :             0x0C83,             /* KANNADA SIGN VISARGA */
     728              :             0x0CBE,             /* KANNADA VOWEL SIGN AA */
     729              :             0x0CC0,             /* KANNADA VOWEL SIGN II */
     730              :             0x0CC1,             /* KANNADA VOWEL SIGN U */
     731              :             0x0CC2,             /* KANNADA VOWEL SIGN UU */
     732              :             0x0CC3,             /* KANNADA VOWEL SIGN VOCALIC R */
     733              :             0x0CC4,             /* KANNADA VOWEL SIGN VOCALIC RR */
     734              :             0x0CC7,             /* KANNADA VOWEL SIGN EE */
     735              :             0x0CC8,             /* KANNADA VOWEL SIGN AI */
     736              :             0x0CCA,             /* KANNADA VOWEL SIGN O */
     737              :             0x0CCB,             /* KANNADA VOWEL SIGN OO */
     738              :             0x0CD5,             /* KANNADA LENGTH MARK */
     739              :             0x0CD6,             /* KANNADA AI LENGTH MARK */
     740              :             0x0D02,             /* MALAYALAM SIGN ANUSVARA */
     741              :             0x0D03,             /* MALAYALAM SIGN VISARGA */
     742              :             0x0D3E,             /* MALAYALAM VOWEL SIGN AA */
     743              :             0x0D3F,             /* MALAYALAM VOWEL SIGN I */
     744              :             0x0D40,             /* MALAYALAM VOWEL SIGN II */
     745              :             0x0D46,             /* MALAYALAM VOWEL SIGN E */
     746              :             0x0D47,             /* MALAYALAM VOWEL SIGN EE */
     747              :             0x0D48,             /* MALAYALAM VOWEL SIGN AI */
     748              :             0x0D4A,             /* MALAYALAM VOWEL SIGN O */
     749              :             0x0D4B,             /* MALAYALAM VOWEL SIGN OO */
     750              :             0x0D4C,             /* MALAYALAM VOWEL SIGN AU */
     751              :             0x0D57,             /* MALAYALAM AU LENGTH MARK */
     752              :             0x0D82,             /* SINHALA SIGN ANUSVARAYA */
     753              :             0x0D83,             /* SINHALA SIGN VISARGAYA */
     754              :             0x0DCF,             /* SINHALA VOWEL SIGN AELA-PILLA */
     755              :             0x0DD0,             /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
     756              :             0x0DD1,             /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
     757              :             0x0DD8,             /* SINHALA VOWEL SIGN GAETTA-PILLA */
     758              :             0x0DD9,             /* SINHALA VOWEL SIGN KOMBUVA */
     759              :             0x0DDA,             /* SINHALA VOWEL SIGN DIGA KOMBUVA */
     760              :             0x0DDB,             /* SINHALA VOWEL SIGN KOMBU DEKA */
     761              :             0x0DDC,             /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
     762              :             0x0DDD,             /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
     763              :                                  * AELA-PILLA */
     764              :             0x0DDE,             /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
     765              :             0x0DDF,             /* SINHALA VOWEL SIGN GAYANUKITTA */
     766              :             0x0DF2,             /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
     767              :             0x0DF3,             /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
     768              :             0x0F3E,             /* TIBETAN SIGN YAR TSHES */
     769              :             0x0F3F,             /* TIBETAN SIGN MAR TSHES */
     770              :             0x0F7F,             /* TIBETAN SIGN RNAM BCAD */
     771              :             0x102B,             /* MYANMAR VOWEL SIGN TALL AA */
     772              :             0x102C,             /* MYANMAR VOWEL SIGN AA */
     773              :             0x1031,             /* MYANMAR VOWEL SIGN E */
     774              :             0x1038,             /* MYANMAR SIGN VISARGA */
     775              :             0x103B,             /* MYANMAR CONSONANT SIGN MEDIAL YA */
     776              :             0x103C,             /* MYANMAR CONSONANT SIGN MEDIAL RA */
     777              :             0x1056,             /* MYANMAR VOWEL SIGN VOCALIC R */
     778              :             0x1057,             /* MYANMAR VOWEL SIGN VOCALIC RR */
     779              :             0x1062,             /* MYANMAR VOWEL SIGN SGAW KAREN EU */
     780              :             0x1063,             /* MYANMAR TONE MARK SGAW KAREN HATHI */
     781              :             0x1064,             /* MYANMAR TONE MARK SGAW KAREN KE PHO */
     782              :             0x1067,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
     783              :             0x1068,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
     784              :             0x1069,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
     785              :             0x106A,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
     786              :             0x106B,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
     787              :             0x106C,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
     788              :             0x106D,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
     789              :             0x1083,             /* MYANMAR VOWEL SIGN SHAN AA */
     790              :             0x1084,             /* MYANMAR VOWEL SIGN SHAN E */
     791              :             0x1087,             /* MYANMAR SIGN SHAN TONE-2 */
     792              :             0x1088,             /* MYANMAR SIGN SHAN TONE-3 */
     793              :             0x1089,             /* MYANMAR SIGN SHAN TONE-5 */
     794              :             0x108A,             /* MYANMAR SIGN SHAN TONE-6 */
     795              :             0x108B,             /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
     796              :             0x108C,             /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
     797              :             0x108F,             /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
     798              :             0x17B6,             /* KHMER VOWEL SIGN AA */
     799              :             0x17BE,             /* KHMER VOWEL SIGN OE */
     800              :             0x17BF,             /* KHMER VOWEL SIGN YA */
     801              :             0x17C0,             /* KHMER VOWEL SIGN IE */
     802              :             0x17C1,             /* KHMER VOWEL SIGN E */
     803              :             0x17C2,             /* KHMER VOWEL SIGN AE */
     804              :             0x17C3,             /* KHMER VOWEL SIGN AI */
     805              :             0x17C4,             /* KHMER VOWEL SIGN OO */
     806              :             0x17C5,             /* KHMER VOWEL SIGN AU */
     807              :             0x17C7,             /* KHMER SIGN REAHMUK */
     808              :             0x17C8,             /* KHMER SIGN YUUKALEAPINTU */
     809              :             0x1923,             /* LIMBU VOWEL SIGN EE */
     810              :             0x1924,             /* LIMBU VOWEL SIGN AI */
     811              :             0x1925,             /* LIMBU VOWEL SIGN OO */
     812              :             0x1926,             /* LIMBU VOWEL SIGN AU */
     813              :             0x1929,             /* LIMBU SUBJOINED LETTER YA */
     814              :             0x192A,             /* LIMBU SUBJOINED LETTER RA */
     815              :             0x192B,             /* LIMBU SUBJOINED LETTER WA */
     816              :             0x1930,             /* LIMBU SMALL LETTER KA */
     817              :             0x1931,             /* LIMBU SMALL LETTER NGA */
     818              :             0x1933,             /* LIMBU SMALL LETTER TA */
     819              :             0x1934,             /* LIMBU SMALL LETTER NA */
     820              :             0x1935,             /* LIMBU SMALL LETTER PA */
     821              :             0x1936,             /* LIMBU SMALL LETTER MA */
     822              :             0x1937,             /* LIMBU SMALL LETTER RA */
     823              :             0x1938,             /* LIMBU SMALL LETTER LA */
     824              :             0x19B0,             /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
     825              :             0x19B1,             /* NEW TAI LUE VOWEL SIGN AA */
     826              :             0x19B2,             /* NEW TAI LUE VOWEL SIGN II */
     827              :             0x19B3,             /* NEW TAI LUE VOWEL SIGN U */
     828              :             0x19B4,             /* NEW TAI LUE VOWEL SIGN UU */
     829              :             0x19B5,             /* NEW TAI LUE VOWEL SIGN E */
     830              :             0x19B6,             /* NEW TAI LUE VOWEL SIGN AE */
     831              :             0x19B7,             /* NEW TAI LUE VOWEL SIGN O */
     832              :             0x19B8,             /* NEW TAI LUE VOWEL SIGN OA */
     833              :             0x19B9,             /* NEW TAI LUE VOWEL SIGN UE */
     834              :             0x19BA,             /* NEW TAI LUE VOWEL SIGN AY */
     835              :             0x19BB,             /* NEW TAI LUE VOWEL SIGN AAY */
     836              :             0x19BC,             /* NEW TAI LUE VOWEL SIGN UY */
     837              :             0x19BD,             /* NEW TAI LUE VOWEL SIGN OY */
     838              :             0x19BE,             /* NEW TAI LUE VOWEL SIGN OAY */
     839              :             0x19BF,             /* NEW TAI LUE VOWEL SIGN UEY */
     840              :             0x19C0,             /* NEW TAI LUE VOWEL SIGN IY */
     841              :             0x19C8,             /* NEW TAI LUE TONE MARK-1 */
     842              :             0x19C9,             /* NEW TAI LUE TONE MARK-2 */
     843              :             0x1A19,             /* BUGINESE VOWEL SIGN E */
     844              :             0x1A1A,             /* BUGINESE VOWEL SIGN O */
     845              :             0x1A1B,             /* BUGINESE VOWEL SIGN AE */
     846              :             0x1B04,             /* BALINESE SIGN BISAH */
     847              :             0x1B35,             /* BALINESE VOWEL SIGN TEDUNG */
     848              :             0x1B3B,             /* BALINESE VOWEL SIGN RA REPA TEDUNG */
     849              :             0x1B3D,             /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
     850              :             0x1B3E,             /* BALINESE VOWEL SIGN TALING */
     851              :             0x1B3F,             /* BALINESE VOWEL SIGN TALING REPA */
     852              :             0x1B40,             /* BALINESE VOWEL SIGN TALING TEDUNG */
     853              :             0x1B41,             /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
     854              :             0x1B43,             /* BALINESE VOWEL SIGN PEPET TEDUNG */
     855              :             0x1B44,             /* BALINESE ADEG ADEG */
     856              :             0x1B82,             /* SUNDANESE SIGN PANGWISAD */
     857              :             0x1BA1,             /* SUNDANESE CONSONANT SIGN PAMINGKAL */
     858              :             0x1BA6,             /* SUNDANESE VOWEL SIGN PANAELAENG */
     859              :             0x1BA7,             /* SUNDANESE VOWEL SIGN PANOLONG */
     860              :             0x1BAA,             /* SUNDANESE SIGN PAMAAEH */
     861              :             0x1C24,             /* LEPCHA SUBJOINED LETTER YA */
     862              :             0x1C25,             /* LEPCHA SUBJOINED LETTER RA */
     863              :             0x1C26,             /* LEPCHA VOWEL SIGN AA */
     864              :             0x1C27,             /* LEPCHA VOWEL SIGN I */
     865              :             0x1C28,             /* LEPCHA VOWEL SIGN O */
     866              :             0x1C29,             /* LEPCHA VOWEL SIGN OO */
     867              :             0x1C2A,             /* LEPCHA VOWEL SIGN U */
     868              :             0x1C2B,             /* LEPCHA VOWEL SIGN UU */
     869              :             0x1C34,             /* LEPCHA CONSONANT SIGN NYIN-DO */
     870              :             0x1C35,             /* LEPCHA CONSONANT SIGN KANG */
     871              :             0xA823,             /* SYLOTI NAGRI VOWEL SIGN A */
     872              :             0xA824,             /* SYLOTI NAGRI VOWEL SIGN I */
     873              :             0xA827,             /* SYLOTI NAGRI VOWEL SIGN OO */
     874              :             0xA880,             /* SAURASHTRA SIGN ANUSVARA */
     875              :             0xA881,             /* SAURASHTRA SIGN VISARGA */
     876              :             0xA8B4,             /* SAURASHTRA CONSONANT SIGN HAARU */
     877              :             0xA8B5,             /* SAURASHTRA VOWEL SIGN AA */
     878              :             0xA8B6,             /* SAURASHTRA VOWEL SIGN I */
     879              :             0xA8B7,             /* SAURASHTRA VOWEL SIGN II */
     880              :             0xA8B8,             /* SAURASHTRA VOWEL SIGN U */
     881              :             0xA8B9,             /* SAURASHTRA VOWEL SIGN UU */
     882              :             0xA8BA,             /* SAURASHTRA VOWEL SIGN VOCALIC R */
     883              :             0xA8BB,             /* SAURASHTRA VOWEL SIGN VOCALIC RR */
     884              :             0xA8BC,             /* SAURASHTRA VOWEL SIGN VOCALIC L */
     885              :             0xA8BD,             /* SAURASHTRA VOWEL SIGN VOCALIC LL */
     886              :             0xA8BE,             /* SAURASHTRA VOWEL SIGN E */
     887              :             0xA8BF,             /* SAURASHTRA VOWEL SIGN EE */
     888              :             0xA8C0,             /* SAURASHTRA VOWEL SIGN AI */
     889              :             0xA8C1,             /* SAURASHTRA VOWEL SIGN O */
     890              :             0xA8C2,             /* SAURASHTRA VOWEL SIGN OO */
     891              :             0xA8C3,             /* SAURASHTRA VOWEL SIGN AU */
     892              :             0xA952,             /* REJANG CONSONANT SIGN H */
     893              :             0xA953,             /* REJANG VIRAMA */
     894              :             0xAA2F,             /* CHAM VOWEL SIGN O */
     895              :             0xAA30,             /* CHAM VOWEL SIGN AI */
     896              :             0xAA33,             /* CHAM CONSONANT SIGN YA */
     897              :             0xAA34,             /* CHAM CONSONANT SIGN RA */
     898              :             0xAA4D              /* CHAM CONSONANT SIGN FINAL H */
     899              :         };
     900         4362 :         const pg_wchar *StopLow = strange_letter,
     901         4362 :                    *StopHigh = strange_letter + lengthof(strange_letter),
     902              :                    *StopMiddle;
     903              :         pg_wchar    c;
     904              : 
     905         4362 :         c = *(prs->pgwstr + prs->state->poschar);
     906              : 
     907        39258 :         while (StopLow < StopHigh)
     908              :         {
     909        34896 :             StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
     910        34896 :             if (*StopMiddle == c)
     911            0 :                 return 1;
     912        34896 :             else if (*StopMiddle < c)
     913            0 :                 StopLow = StopMiddle + 1;
     914              :             else
     915        34896 :                 StopHigh = StopMiddle;
     916              :         }
     917              :     }
     918              : 
     919         4362 :     return 0;
     920              : }
     921              : 
     922              : /*
     923              :  * State/action tables of the parser
     924              :  */
     925              : 
     926              : static const TParserStateActionItem actionTPS_Base[] = {  /* base-state rules: no A_BINGO row, token-type field is 0 throughout */
     927              :     {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
     928              :     {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
     929              :     {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
     930              :     {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
     931              :     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
     932              :     {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
     933              :     {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
     934              :     {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
     935              :     {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
     936              :     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
     937              :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
     938              :     {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
     939              :     {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}  /* NULL test: presumably the catch-all row (TODO confirm in TParserGet) */
     940              : };
     941              : 
     942              : 
     943              : static const TParserStateActionItem actionTPS_InNumWord[] = {  /* EOF row and final NULL-test row both use A_BINGO with NUMWORD */
     944              :     {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
     945              :     {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     946              :     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},  /* special signs keep the state in TPS_InNumWord, like alphanumerics */
     947              :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
     948              :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
     949              :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
     950              :     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
     951              :     {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
     952              : };
     953              : 
     954              : static const TParserStateActionItem actionTPS_InAsciiWord[] = {  /* EOF row and final NULL-test row both use A_BINGO with ASCIIWORD */
     955              :     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
     956              :     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
     957              :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},  /* '.' has two A_PUSH rows; presumably tried in row order (TODO confirm) */
     958              :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
     959              :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},  /* '-' likewise has two A_PUSH rows */
     960              :     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
     961              :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
     962              :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
     963              :     {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
     964              :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
     965              :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},  /* digit: A_PUSH row to TPS_InHost precedes the A_NEXT row to TPS_InNumWord */
     966              :     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     967              :     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
     968              :     {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
     969              :     {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
     970              : };
     971              : 
     972              : static const TParserStateActionItem actionTPS_InWord[] = {  /* EOF row and final NULL-test row both use A_BINGO with WORD_T */
     973              :     {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
     974              :     {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
     975              :     {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
     976              :     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},  /* a digit switches to TPS_InNumWord */
     977              :     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
     978              :     {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
     979              : };
     980              : 
     981              : static const TParserStateActionItem actionTPS_InUnsignedInt[] = {  /* EOF row and final NULL-test row both use A_BINGO with UNSIGNEDINT */
     982              :     {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
     983              :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
     984              :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},  /* two '.' rows: TPS_InHostFirstDomain is listed before TPS_InUDecimalFirst */
     985              :     {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
     986              :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
     987              :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
     988              :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
     989              :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
     990              :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
     991              :     {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
     992              :     {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     993              :     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
     994              :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
     995              :     {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
     996              : };
     997              : 
     998              : static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {  /* target of the '-' and '+' A_PUSH rows in actionTPS_Base */
     999              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1000              :     {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},  /* only a digit continues, to TPS_InSignedInt */
    1001              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1002              : };
    1003              : 
    1004              : static const TParserStateActionItem actionTPS_InSignedInt[] = {  /* EOF row and final NULL-test row both use A_BINGO with SIGNEDINT */
    1005              :     {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
    1006              :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
    1007              :     {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
    1008              :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1009              :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1010              :     {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
    1011              : };
    1012              : 
    1013              : static const TParserStateActionItem actionTPS_InSpace[] = {  /* EOF, '<', '-', '+', '&', '/' and the final NULL-test row all use A_BINGO with SPACE */
    1014              :     {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
    1015              :     {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
    1016              :     {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
    1017              :     {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
    1018              :     {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
    1019              :     {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
    1020              :     {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
    1021              :     {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},  /* listed after the specific characters above, so those rows come first */
    1022              :     {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
    1023              : };
    1024              : 
    1025              : static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {  /* EOF and NULL-test rows use A_POP */
    1026              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1027              :     {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},  /* only a digit continues, to TPS_InUDecimal */
    1028              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1029              : };
    1030              : 
    1031              : static const TParserStateActionItem actionTPS_InUDecimal[] = {  /* EOF row and final NULL-test row both use A_BINGO with DECIMAL_T */
    1032              :     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
    1033              :     {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
    1034              :     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},  /* a further '.' pushes TPS_InVersionFirst */
    1035              :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1036              :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1037              :     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
    1038              : };
    1039              : 
    1040              : static const TParserStateActionItem actionTPS_InDecimalFirst[] = {  /* EOF and NULL-test rows use A_POP */
    1041              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1042              :     {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},  /* only a digit continues, to TPS_InDecimal */
    1043              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1044              : };
    1045              : 
    1046              : static const TParserStateActionItem actionTPS_InDecimal[] = {  /* EOF row and final NULL-test row both use A_BINGO with DECIMAL_T */
    1047              :     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
    1048              :     {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
    1049              :     {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},  /* a further '.' pushes TPS_InVerVersion (the unsigned variant pushes TPS_InVersionFirst) */
    1050              :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1051              :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    1052              :     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
    1053              : };
    1054              : 
    1055              : static const TParserStateActionItem actionTPS_InVerVersion[] = {  /* EOF and NULL-test rows use A_POP */
    1056              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1057              :     {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},  /* digit row uses A_RERUN and supplies SpecialVerVersion in the last field */
    1058              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1059              : };
    1060              : 
    1061              : static const TParserStateActionItem actionTPS_InSVerVersion[] = {  /* target of the A_RERUN digit row in actionTPS_InVerVersion */
    1062              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1063              :     {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},  /* token type SPACE; continues in TPS_InUnsignedInt rather than TPS_Base */
    1064              :     {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
    1065              : };
    1066              : 
    1067              : 
    1068              : static const TParserStateActionItem actionTPS_InVersionFirst[] = {  /* EOF and NULL-test rows use A_POP */
    1069              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1070              :     {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},  /* only a digit continues, to TPS_InVersion */
    1071              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1072              : };
    1073              : 
    1074              : static const TParserStateActionItem actionTPS_InVersion[] = {   /* inside a dotted version number: digits stay, '.' pushes TPS_InVersionFirst, EOF or anything else gives A_BINGO as VERSIONNUMBER */
    1075              :     {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
    1076              :     {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
    1077              :     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
    1078              :     {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
    1079              : };
    1080              : 
    1081              : static const TParserStateActionItem actionTPS_InMantissaFirst[] = { /* first char after 'e'/'E' (pushed from TPS_InDecimal): a digit, or a sign handled by TPS_InMantissaSign; otherwise A_POP */
    1082              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1083              :     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
    1084              :     {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
    1085              :     {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
    1086              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1087              : };
    1088              : 
    1089              : static const TParserStateActionItem actionTPS_InMantissaSign[] = {  /* after the '+'/'-' accepted by TPS_InMantissaFirst: a digit is required, else A_POP */
    1090              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1091              :     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
    1092              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1093              : };
    1094              : 
    1095              : static const TParserStateActionItem actionTPS_InMantissa[] = {  /* digits following 'e'/'E' and optional sign: digits stay; EOF or any other char gives A_BINGO as SCIENTIFIC */
    1096              :     {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
    1097              :     {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
    1098              :     {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
    1099              : };
    1100              : 
    1101              : static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {    /* first char of an XML entity body (presumably right after '&' -- confirm) */
    1102              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1103              :     {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},   /* '#': numeric entity */
    1104              :     {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},  /* p_isasclet, ':' or '_': named entity */
    1105              :     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1106              :     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1107              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1108              : };
    1109              : 
    1110              : static const TParserStateActionItem actionTPS_InXMLEntity[] = { /* rest of a named entity: p_isalnum, ':', '_', '.', '-' stay here; anything unlisted takes A_POP */
    1111              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1112              :     {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
    1113              :     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1114              :     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1115              :     {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1116              :     {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
    1117              :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},    /* terminating ';' */
    1118              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1119              : };
    1120              : 
    1121              : static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = { /* after the '#' accepted by TPS_InXMLEntityFirst: 'x'/'X' selects the hex form, a digit the decimal form */
    1122              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1123              :     {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
    1124              :     {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
    1125              :     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
    1126              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1127              : };
    1128              : 
    1129              : static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {  /* after "#x"/"#X": at least one p_isxdigit char is required, else A_POP */
    1130              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1131              :     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
    1132              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1133              : };
    1134              : 
    1135              : static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {  /* decimal digits of a numeric entity; ';' moves to TPS_InXMLEntityEnd, anything else takes A_POP */
    1136              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1137              :     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
    1138              :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
    1139              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1140              : };
    1141              : 
    1142              : static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {   /* hex digits of a numeric entity; ';' moves to TPS_InXMLEntityEnd, anything else takes A_POP */
    1143              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1144              :     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
    1145              :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
    1146              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1147              : };
    1148              : 
    1149              : static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {  /* reached after the closing ';' of an entity: single unconditional row */
    1150              :     {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL} /* A_BINGO as XMLENTITY, back to TPS_Base */
    1151              : };
    1152              : 
    1153              : static const TParserStateActionItem actionTPS_InTagFirst[] = {  /* first char of a tag (presumably right after '<' -- confirm); every accepting row uses A_PUSH */
    1154              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1155              :     {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},   /* closing tag */
    1156              :     {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},    /* comment or DOCTYPE-like */
    1157              :     {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},    /* <?x... */
    1158              :     {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},    /* start of a tag name */
    1159              :     {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
    1160              :     {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
    1161              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1162              : };
    1163              : 
    1164              : static const TParserStateActionItem actionTPS_InXMLBegin[] = {  /* after the '?' pushed from TPS_InTagFirst: only 'x' is accepted, then TPS_InTag */
    1165              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1166              :     /* <?xml ... */
    1167              :     /* XXX do we want states for the m and l ?  Right now this accepts <?xZ */
    1168              :     {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
    1169              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1170              : };
    1171              : 
    1172              : static const TParserStateActionItem actionTPS_InTagCloseFirst[] = { /* after the '/' pushed from TPS_InTagFirst: the name must start with a p_isasclet char, else A_POP */
    1173              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1174              :     {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
    1175              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1176              : };
    1177              : 
    1178              : static const TParserStateActionItem actionTPS_InTagName[] = {   /* inside a tag name; '>' and whitespace rows carry SpecialTags */
    1179              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1180              :     /* <br/> case */
    1181              :     {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
    1182              :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
    1183              :     {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},  /* name finished, attributes may follow */
    1184              :     {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},  /* name chars; NOTE(review): TPS_Null target presumably means "stay in this state" -- confirm */
    1185              :     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
    1186              :     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
    1187              :     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
    1188              :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
    1189              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1190              : };
    1191              : 
    1192              : static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {   /* after the '/' of a <br/>-style tag (from TPS_InTagName): only '>' is accepted, else A_POP */
    1193              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1194              :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
    1195              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1196              : };
    1197              : 
    1198              : static const TParserStateActionItem actionTPS_InTag[] = {   /* inside a tag, past its name: listed chars are consumed, anything unlisted takes A_POP */
    1199              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1200              :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},   /* end of tag */
    1201              :     {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL}, /* start of a single-quoted section */
    1202              :     {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL}, /* start of a double-quoted section */
    1203              :     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL}, /* NOTE(review): TPS_Null target presumably means "stay in this state" -- confirm */
    1204              :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
    1205              :     {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
    1206              :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
    1207              :     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
    1208              :     {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
    1209              :     {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
    1210              :     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
    1211              :     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
    1212              :     {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
    1213              :     {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
    1214              :     {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
    1215              :     {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
    1216              :     {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
    1217              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1218              : };
    1219              : 
    1220              : static const TParserStateActionItem actionTPS_InTagEscapeK[] = {    /* inside a single-quoted section of a tag (from TPS_InTag) */
    1221              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1222              :     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL}, /* backslash: handled by TPS_InTagBackSleshed */
    1223              :     {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},    /* closing quote: back to TPS_InTag */
    1224              :     {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}    /* any other char stays in the quoted section */
    1225              : };
    1226              : 
    1227              : static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {   /* inside a double-quoted section of a tag (from TPS_InTag) */
    1228              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1229              :     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL}, /* backslash: handled by TPS_InTagBackSleshed */
    1230              :     {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL}, /* closing quote: back to TPS_InTag */
    1231              :     {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}   /* any other char stays in the quoted section */
    1232              : };
    1233              : 
    1234              : static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {    /* after a backslash pushed from TPS_InTagEscapeK/KK: EOF takes A_POP, any char takes A_MERGE */
    1235              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1236              :     {NULL, 0, A_MERGE, TPS_Null, 0, NULL}   /* NOTE(review): A_MERGE presumably folds this pushed position back into the quoted-section state -- confirm */
    1237              : };
    1238              : 
    1239              : static const TParserStateActionItem actionTPS_InTagEnd[] = {    /* reached after the '>' that closes a tag: single unconditional row */
    1240              :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}    /* A_BINGO as TAG_T, back to TPS_Base */
    1241              : };
    1242              : 
    1243              : static const TParserStateActionItem actionTPS_InCommentFirst[] = {  /* after the '!' pushed from TPS_InTagFirst */
    1244              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1245              :     {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL}, /* first '-' of a comment opener */
    1246              :     /* <!DOCTYPE ...> */
    1247              :     {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL}, /* only the first letter is checked; the rest is scanned by TPS_InTag */
    1248              :     {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
    1249              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1250              : };
    1251              : 
    1252              : static const TParserStateActionItem actionTPS_InCommentLast[] = {   /* after the first '-' of a comment opener: a second '-' enters TPS_InComment, else A_POP */
    1253              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1254              :     {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
    1255              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1256              : };
    1257              : 
    1258              : static const TParserStateActionItem actionTPS_InComment[] = {   /* comment body: '-' may start the closing sequence; EOF takes A_POP */
    1259              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1260              :     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
    1261              :     {NULL, 0, A_NEXT, TPS_Null, 0, NULL}    /* any other char is consumed; NOTE(review): TPS_Null target presumably means "stay in this state" -- confirm */
    1262              : };
    1263              : 
    1264              : static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = { /* after one '-' inside a comment: a second '-' continues the close, any other char returns to TPS_InComment */
    1265              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1266              :     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
    1267              :     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
    1268              : };
    1269              : 
    1270              : static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {  /* after two '-' inside a comment */
    1271              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1272              :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},  /* further '-' are consumed; NOTE(review): TPS_Null target presumably means "stay in this state" -- confirm */
    1273              :     {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},  /* '>' completes the comment */
    1274              :     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}   /* not a close after all: back to the comment body */
    1275              : };
    1276              : 
    1277              : static const TParserStateActionItem actionTPS_InCommentEnd[] = {    /* reached after the '>' that closes a comment: single unconditional row */
    1278              :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}    /* comments are reported with the same type as tags (TAG_T) */
    1279              : };
    1280              : 
    1281              : static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {   /* first char after a '.' in a host name (pushed from TPS_InHost, TPS_InHostDomain, TPS_InHostDomainSecond) */
    1282              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1283              :     {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},   /* p_isasclet: go on to the second char of the label */
    1284              :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},    /* digit: plain host chars */
    1285              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1286              : };
    1287              : 
    1288              : static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {  /* second char of a label that began with a p_isasclet char; this table has no A_BINGO row */
    1289              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1290              :     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, /* second p_isasclet char: move on to TPS_InHostDomain */
    1291              :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
    1292              :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1293              :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1294              :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    1295              :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1296              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1297              : };
    1298              : 
    1299              : static const TParserStateActionItem actionTPS_InHostDomain[] = {    /* in a label of two or more p_isasclet chars: A_BINGO as HOST happens here; port, URL path and full URL branch off here */
    1300              :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
    1301              :     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
    1302              :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
    1303              :     {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},   /* possible port */
    1304              :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1305              :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1306              :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, /* next label */
    1307              :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1308              :     {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},   /* NOTE(review): repeats the p_isdigit test of the A_PUSH row above; presumably consulted only when matching resumes after that push is popped -- confirm in the parser loop */
    1309              :     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},  /* HOST token, then continue with a URL path */
    1310              :     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},    /* try host + path as one URL_T token */
    1311              :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
    1312              : };
    1313              : 
    1314              : static const TParserStateActionItem actionTPS_InPortFirst[] = { /* after the ':' pushed from TPS_InHostDomain: a digit is required, else A_POP */
    1315              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1316              :     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
    1317              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1318              : };
    1319              : 
    1320              : static const TParserStateActionItem actionTPS_InPort[] = {  /* port digits; the token is still reported as HOST */
    1321              :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
    1322              :     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
    1323              :     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},  /* HOST token, then continue with a URL path */
    1324              :     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},    /* try host + path as one URL_T token */
    1325              :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
    1326              : };
    1327              : 
    1328              : static const TParserStateActionItem actionTPS_InHostFirstAN[] = {   /* after a '-' or '_' in a host name (pushed from the host states): a digit or p_isasclet char must follow, else A_POP */
    1329              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1330              :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
    1331              :     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
    1332              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1333              : };
    1334              : 
    1335              : static const TParserStateActionItem actionTPS_InHost[] = {  /* generic host-name chars; this table has no A_BINGO row, only pushes into states that may complete a token */
    1336              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1337              :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
    1338              :     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
    1339              :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    1340              :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    1341              :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1342              :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    1343              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1344              : };
    1345              : 
    1346              : static const TParserStateActionItem actionTPS_InEmail[] = { /* after the '@' pushed from the host states; note there is no p_isEOF row here */
    1347              :     {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},    /* tested before p_ishost, so it wins when both would match */
    1348              :     {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},   /* NOTE(review): p_ishost presumably consumes a whole host name itself -- confirm */
    1349              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1350              : };
    1351              : 
    1352              : static const TParserStateActionItem actionTPS_InFileFirst[] = { /* first char of a path component (reached after '/' from the file/path states below) */
    1353              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1354              :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
    1355              :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1356              :     {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},   /* component starting with '.' */
    1357              :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1358              :     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL}, /* component starting with '~' */
    1359              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1360              : };
    1361              : 
    1362              : static const TParserStateActionItem actionTPS_InFileTwiddle[] = {   /* after the '~' pushed from TPS_InFileFirst: a name char or '/' must follow, else A_POP */
    1363              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1364              :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
    1365              :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1366              :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1367              :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
    1368              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1369              : };
    1370              : 
    1371              : static const TParserStateActionItem actionTPS_InPathFirst[] = { /* after a '.' that starts a path component (from TPS_InFileFirst) */
    1372              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1373              :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},   /* ".name" */
    1374              :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1375              :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1376              :     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},  /* ".." */
    1377              :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},   /* "./" */
    1378              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1379              : };
    1380              : 
    1381              : static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {    /* same '.' and '/' rows as TPS_InPathFirst but without its name-char rows: only '.' or '/' may follow */
    1382              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1383              :     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
    1384              :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
    1385              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1386              : };
    1387              : 
    1388              : static const TParserStateActionItem actionTPS_InPathSecond[] = {    /* after the second '.' of ".." (from TPS_InPathFirst / TPS_InPathFirstFirst) */
    1389              :     {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
    1390              :     {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},  /* try to continue the path past the '/' */
    1391              :     {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},    /* NOTE(review): same test as the A_PUSH row above; presumably the fallback once that push is popped -- confirm in the parser loop */
    1392              :     {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
    1393              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1394              : };
    1395              : 
    1396              : static const TParserStateActionItem actionTPS_InFile[] = {  /* inside a file-name component; EOF or an unlisted char gives A_BINGO as FILEPATH */
    1397              :     {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
    1398              :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
    1399              :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
    1400              :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},    /* '.' counts only if a name char follows (checked in TPS_InFileNext) */
    1401              :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    1402              :     {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
    1403              :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},   /* try the next path component */
    1404              :     {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
    1405              : };
    1406              : 
    1407              : static const TParserStateActionItem actionTPS_InFileNext[] = {  /* after the '.' pushed from TPS_InFile: a p_isasclet char, digit or '_' resumes TPS_InFile with A_CLEAR, else A_POP */
    1408              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1409              :     {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
    1410              :     {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
    1411              :     {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
    1412              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1413              : };
    1414              : 
    1415              : static const TParserStateActionItem actionTPS_InURLPathFirst[] = {  /* first char of a URL path: must satisfy p_isurlchar, else A_POP */
    1416              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1417              :     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
    1418              :     {NULL, 0, A_POP, TPS_Null, 0, NULL},
    1419              : };
    1420              : 
    1421              : static const TParserStateActionItem actionTPS_InURLPathStart[] = {  /* entered from the p_isstophost rows of TPS_InHostDomain and TPS_InPort */
    1422              :     {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}   /* unconditional: whatever the char is, continue in TPS_InURLPath */
    1423              : };
    1424              : 
    1425              : static const TParserStateActionItem actionTPS_InURLPath[] = {   /* URL path chars: p_isurlchar stays; EOF or any other char gives A_BINGO as URLPATH */
    1426              :     {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
    1427              :     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
    1428              :     {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
    1429              : };
    1430              : 
    1431              : static const TParserStateActionItem actionTPS_InFURL[] = {  /* after the '/' pushed from TPS_InHostDomain or TPS_InPort: host + path as a single token */
    1432              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1433              :     {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL}, /* NOTE(review): p_isURLPath presumably scans the whole path itself -- confirm */
    1434              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1435              : };
    1436              : 
    1437              : static const TParserStateActionItem actionTPS_InProtocolFirst[] = { /* expects the first '/' of "//" (presumably after a scheme name and ':' -- confirm), else A_POP */
    1438              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1439              :     {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
    1440              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1441              : };
    1442              : 
    1443              : static const TParserStateActionItem actionTPS_InProtocolSecond[] = {    /* after the first '/' (from TPS_InProtocolFirst): a second '/' is required, else A_POP */
    1444              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1445              :     {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
    1446              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1447              : };
    1448              : 
    1449              : static const TParserStateActionItem actionTPS_InProtocolEnd[] = {   /* reached after "//": single unconditional row */
    1450              :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL} /* A_BINGO as PROTOCOL, back to TPS_Base */
    1451              : };
    1452              : 
    1453              : static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {  /* first char after the '-' pushed from TPS_InHyphenAsciiWord; picks the hyphenated-word flavour */
    1454              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1455              :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},    /* tested before p_isalpha, so the ASCII flavour is kept when possible */
    1456              :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1457              :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
    1458              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1459              : };
    1460              : 
    1461              : static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {   /* hyphenated word whose chars so far all matched p_isasclet; ends with A_BINGO as ASCIIHWORD, then TPS_InParseHyphen with SpecialHyphen */
    1462              :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
    1463              :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
    1464              :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},  /* p_isalpha that is not p_isasclet: switch to the TPS_InHyphenWord flavour */
    1465              :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1466              :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},   /* digit: switch to the TPS_InHyphenNumWord flavour */
    1467              :     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
    1468              :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
    1469              : };
    1470              : 
    1471              : static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {   /* first char after the '-' pushed from TPS_InHyphenWord: p_isalpha or a digit must follow, else A_POP */
    1472              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1473              :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1474              :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
    1475              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1476              : };
    1477              : 
    1478              : static const TParserStateActionItem actionTPS_InHyphenWord[] = {    /* hyphenated word of p_isalpha/p_isspecial chars; ends with A_BINGO as HWORD, then TPS_InParseHyphen with SpecialHyphen */
    1479              :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
    1480              :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1481              :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
    1482              :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},   /* digit: switch to the TPS_InHyphenNumWord flavour */
    1483              :     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
    1484              :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
    1485              : };
    1486              : 
    1487              : static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {    /* first char after the '-' pushed from TPS_InHyphenNumWord: p_isalpha or a digit must follow, else A_POP */
    1488              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1489              :     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1490              :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
    1491              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1492              : };
    1493              : 
    1494              : static const TParserStateActionItem actionTPS_InHyphenNumWord[] = { /* hyphenated word containing digits; ends with A_BINGO as NUMHWORD, then TPS_InParseHyphen with SpecialHyphen */
    1495              :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
    1496              :     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1497              :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1498              :     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
    1499              :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
    1500              : };
    1501              : 
    1502              : static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {  /* digits right after a '-' (from the TPS_InHyphen*WordFirst states): a digits-only part takes A_POP */
    1503              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1504              :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
    1505              :     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},   /* a p_isalpha/p_isspecial char after the digits makes it a TPS_InHyphenNumWord */
    1506              :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
    1507              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1508              : };
    1509              : 
    1510              : static const TParserStateActionItem actionTPS_InParseHyphen[] = {   /* entered from the A_BINGO rows of the TPS_InHyphen*Word states (presumably to re-scan the word part by part -- confirm in SpecialHyphen) */
    1511              :     {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
    1512              :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
    1513              :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1514              :     {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
    1515              :     {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
    1516              :     {NULL, 0, A_RERUN, TPS_Base, 0, NULL}   /* nothing part-like here: A_RERUN in TPS_Base */
    1517              : };
    1518              : 
    1519              : static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = { /* after the '-' pushed from TPS_InParseHyphen: if p_isalnum or p_isspecial follows, A_BINGO with type SPACE and return to TPS_InParseHyphen; else A_POP */
    1520              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1521              :     {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
    1522              :     {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
    1523              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1524              : };
    1525              : 
    1526              : static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {    /* one part of a hyphenated word, p_isalpha/p_isspecial chars; ends with A_BINGO as PARTHWORD */
    1527              :     {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},   /* at EOF go back to TPS_Base ... */
    1528              :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1529              :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1530              :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1531              :     {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}  /* ... otherwise continue with the next part in TPS_InParseHyphen */
    1532              : };
    1533              : 
    1534              : static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {   /* one part of a hyphenated word, all p_isasclet so far; ends with A_BINGO as ASCIIPARTHWORD */
    1535              :     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},  /* at EOF go back to TPS_Base ... */
    1536              :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
    1537              :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1538              :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
    1539              :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1540              :     {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL} /* ... otherwise continue with the next part in TPS_InParseHyphen */
    1541              : };
    1542              : 
    1543              : static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = { /* one part of a hyphenated word that contains digits; ends with A_BINGO as NUMPARTHWORD */
    1544              :     {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},    /* at EOF go back to TPS_Base ... */
    1545              :     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1546              :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
    1547              :     {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}   /* ... otherwise continue with the next part in TPS_InParseHyphen */
    1548              : };
    1549              : 
    1550              : static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = { /* digits at the start of a part (pushed from TPS_InParseHyphen): a digits-only part takes A_POP */
    1551              :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    1552              :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},  /* NOTE(review): TPS_Null target presumably means "stay in this state" -- confirm */
    1553              :     {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},  /* p_isalpha/p_isspecial after the digits: treat as TPS_InHyphenNumWordPart */
    1554              :     {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
    1555              :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
    1556              : };
    1557              : 
    1558              : 
    1559              : /*
    1560              :  * element type of the main table of per-state parser actions (Actions[])
    1561              :  */
    1562              : typedef struct
    1563              : {
    1564              :     const TParserStateActionItem *action;   /* the actual state info: one of the actionTPS_* arrays */
    1565              :     TParserState state;         /* the state this entry is for; only for Assert crosscheck */
    1566              : #ifdef WPARSER_TRACE
    1567              :     const char *state_name;     /* state's name as a string; only for debug printout */
    1568              : #endif                          /* WPARSER_TRACE */
    1569              : } TParserStateAction;
    1570              : 
/*
 * TPARSERSTATEACTION(state) expands to one TParserStateAction initializer.
 *
 * The first member is formed by joining "action" with the state's name
 * (CppConcat presumably token-pastes, giving e.g. actionTPS_Base), so every
 * state listed in Actions[] needs an action array with exactly that name.
 * In WPARSER_TRACE builds the state's name is additionally stored as a
 * string for the trace printout.
 */
#ifdef WPARSER_TRACE
#define TPARSERSTATEACTION(state) \
    { CppConcat(action,state), state, CppAsString(state) }
#else
#define TPARSERSTATEACTION(state) \
    { CppConcat(action,state), state }
#endif
    1578              : 
/*
 * Table of all parser states, indexed by TParserState value.
 *
 * order must be the same as in typedef enum {} TParserState!!
 *
 * TParserGet() fetches Actions[prs->state->state].action and asserts that
 * the entry's .state equals the index used, so a misordered or missing entry
 * is caught only in assert-enabled builds.  TPS_Null has no entry here:
 * TParserGet() asserts that the current state is always below TPS_Null.
 */

static const TParserStateAction Actions[] = {
    TPARSERSTATEACTION(TPS_Base),
    TPARSERSTATEACTION(TPS_InNumWord),
    TPARSERSTATEACTION(TPS_InAsciiWord),
    TPARSERSTATEACTION(TPS_InWord),
    TPARSERSTATEACTION(TPS_InUnsignedInt),
    TPARSERSTATEACTION(TPS_InSignedIntFirst),
    TPARSERSTATEACTION(TPS_InSignedInt),
    TPARSERSTATEACTION(TPS_InSpace),
    TPARSERSTATEACTION(TPS_InUDecimalFirst),
    TPARSERSTATEACTION(TPS_InUDecimal),
    TPARSERSTATEACTION(TPS_InDecimalFirst),
    TPARSERSTATEACTION(TPS_InDecimal),
    TPARSERSTATEACTION(TPS_InVerVersion),
    TPARSERSTATEACTION(TPS_InSVerVersion),
    TPARSERSTATEACTION(TPS_InVersionFirst),
    TPARSERSTATEACTION(TPS_InVersion),
    TPARSERSTATEACTION(TPS_InMantissaFirst),
    TPARSERSTATEACTION(TPS_InMantissaSign),
    TPARSERSTATEACTION(TPS_InMantissa),
    TPARSERSTATEACTION(TPS_InXMLEntityFirst),
    TPARSERSTATEACTION(TPS_InXMLEntity),
    TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
    TPARSERSTATEACTION(TPS_InXMLEntityNum),
    TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
    TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
    TPARSERSTATEACTION(TPS_InXMLEntityEnd),
    TPARSERSTATEACTION(TPS_InTagFirst),
    TPARSERSTATEACTION(TPS_InXMLBegin),
    TPARSERSTATEACTION(TPS_InTagCloseFirst),
    TPARSERSTATEACTION(TPS_InTagName),
    TPARSERSTATEACTION(TPS_InTagBeginEnd),
    TPARSERSTATEACTION(TPS_InTag),
    TPARSERSTATEACTION(TPS_InTagEscapeK),
    TPARSERSTATEACTION(TPS_InTagEscapeKK),
    TPARSERSTATEACTION(TPS_InTagBackSleshed),
    TPARSERSTATEACTION(TPS_InTagEnd),
    TPARSERSTATEACTION(TPS_InCommentFirst),
    TPARSERSTATEACTION(TPS_InCommentLast),
    TPARSERSTATEACTION(TPS_InComment),
    TPARSERSTATEACTION(TPS_InCloseCommentFirst),
    TPARSERSTATEACTION(TPS_InCloseCommentLast),
    TPARSERSTATEACTION(TPS_InCommentEnd),
    TPARSERSTATEACTION(TPS_InHostFirstDomain),
    TPARSERSTATEACTION(TPS_InHostDomainSecond),
    TPARSERSTATEACTION(TPS_InHostDomain),
    TPARSERSTATEACTION(TPS_InPortFirst),
    TPARSERSTATEACTION(TPS_InPort),
    TPARSERSTATEACTION(TPS_InHostFirstAN),
    TPARSERSTATEACTION(TPS_InHost),
    TPARSERSTATEACTION(TPS_InEmail),
    TPARSERSTATEACTION(TPS_InFileFirst),
    TPARSERSTATEACTION(TPS_InFileTwiddle),
    TPARSERSTATEACTION(TPS_InPathFirst),
    TPARSERSTATEACTION(TPS_InPathFirstFirst),
    TPARSERSTATEACTION(TPS_InPathSecond),
    TPARSERSTATEACTION(TPS_InFile),
    TPARSERSTATEACTION(TPS_InFileNext),
    TPARSERSTATEACTION(TPS_InURLPathFirst),
    TPARSERSTATEACTION(TPS_InURLPathStart),
    TPARSERSTATEACTION(TPS_InURLPath),
    TPARSERSTATEACTION(TPS_InFURL),
    TPARSERSTATEACTION(TPS_InProtocolFirst),
    TPARSERSTATEACTION(TPS_InProtocolSecond),
    TPARSERSTATEACTION(TPS_InProtocolEnd),
    TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
    TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
    TPARSERSTATEACTION(TPS_InHyphenWordFirst),
    TPARSERSTATEACTION(TPS_InHyphenWord),
    TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
    TPARSERSTATEACTION(TPS_InHyphenNumWord),
    TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
    TPARSERSTATEACTION(TPS_InParseHyphen),
    TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
    TPARSERSTATEACTION(TPS_InHyphenWordPart),
    TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
    TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
    TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
};
    1662              : 
    1663              : 
/*
 * TParserGet: run the state machine until the next token is recognized
 *
 * Starting at the current byte position, repeatedly looks up the action rows
 * for the current state in Actions[], picks the first row whose class test
 * accepts the current character (or the default row with a NULL test), and
 * applies that row's flags and state transition.
 *
 * prs->token is set to the position at which this call started scanning.
 * When a row flagged A_BINGO fires, prs->lenbytetoken, prs->lenchartoken and
 * prs->type are filled in from the position state and the row, and the
 * function returns true.
 *
 * Returns false if the position is already at (or past) the end of the
 * string on entry, or if the loop ends without an A_BINGO row having fired.
 */
static bool
TParserGet(TParser *prs)
{
    const TParserStateActionItem *item = NULL;

    CHECK_FOR_INTERRUPTS();

    Assert(prs->state);

    /* nothing left to scan */
    if (prs->state->posbyte >= prs->lenstr)
        return false;

    prs->token = prs->str + prs->state->posbyte;
    prs->state->pushedAtAction = NULL;

    /*
     * look at string; note the loop deliberately runs once more with posbyte
     * == lenstr so that the action rows get a chance to react to end of input
     */
    while (prs->state->posbyte <= prs->lenstr)
    {
        /*
         * Determine the byte length of the current character.  At end of
         * input there is no character, so use 0; that also disables the
         * "move forward" step at the bottom of the loop.
         */
        if (prs->state->posbyte == prs->lenstr)
            prs->state->charlen = 0;
        else
            prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
                pg_mblen_range(prs->str + prs->state->posbyte,
                               prs->str + prs->lenstr);

        Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
        Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
        Assert(Actions[prs->state->state].state == prs->state->state);

        if (prs->state->pushedAtAction)
        {
            /* After a POP, pick up at the next test */
            item = prs->state->pushedAtAction + 1;
            prs->state->pushedAtAction = NULL;
        }
        else
        {
            /* normal case: start with the first row of the current state */
            item = Actions[prs->state->state].action;
            Assert(item != NULL);
        }

        /*
         * find action by character class; a row with a NULL test ends the
         * scan and serves as the default action
         */
        while (item->isclass)
        {
            /* the row's "c" value is passed to the test through prs->c */
            prs->c = item->c;
            if (item->isclass(prs) != 0)
                break;
            item++;
        }

#ifdef WPARSER_TRACE
        {
            TParserPosition *ptr;

            fprintf(stderr, "state ");
            /* indent according to stack depth */
            for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
                fprintf(stderr, "  ");
            fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
            /* only the first byte of the current character is printed */
            if (prs->state->posbyte < prs->lenstr)
                fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
            else
                fprintf(stderr, "at EOF");
            fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
                    (int) (item - Actions[prs->state->state].action),
                    (item->flags & A_BINGO) ? " BINGO" : "",
                    (item->flags & A_POP) ? " POP" : "",
                    (item->flags & A_PUSH) ? " PUSH" : "",
                    (item->flags & A_RERUN) ? " RERUN" : "",
                    (item->flags & A_CLEAR) ? " CLEAR" : "",
                    (item->flags & A_MERGE) ? " MERGE" : "",
                    (item->flags & A_CLRALL) ? " CLRALL" : "",
                    (item->tostate != TPS_Null) ? " tostate " : "",
                    (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
                    (item->type > 0) ? " type " : "",
                    tok_alias[item->type]);
        }
#endif

        /* call special handler if exists */
        if (item->special)
            item->special(prs);

        /*
         * BINGO, token is found: publish the accumulated token length and the
         * row's token type, and reset the per-position length counters
         */
        if (item->flags & A_BINGO)
        {
            Assert(item->type > 0);
            prs->lenbytetoken = prs->state->lenbytetoken;
            prs->lenchartoken = prs->state->lenchartoken;
            prs->state->lenbytetoken = prs->state->lenchartoken = 0;
            prs->type = item->type;
        }

        /*
         * do various actions by flags; these are tested as an else-if chain,
         * so at most one of POP, PUSH, CLEAR, CLRALL, MERGE takes effect
         */
        if (item->flags & A_POP)
        {                       /* pop stored state in stack */
            TParserPosition *ptr = prs->state->prev;

            pfree(prs->state);
            prs->state = ptr;
            Assert(prs->state);
        }
        else if (item->flags & A_PUSH)
        {                       /* push (store) state in stack */
            prs->state->pushedAtAction = item;    /* remember where we push */
            /* continue on a new position entry stacked on the current one */
            prs->state = newTParserPosition(prs->state);
        }
        else if (item->flags & A_CLEAR)
        {                       /* clear previous pushed state */
            TParserPosition *ptr;

            Assert(prs->state->prev);
            ptr = prs->state->prev->prev;
            pfree(prs->state->prev);
            prs->state->prev = ptr;
        }
        else if (item->flags & A_CLRALL)
        {                       /* clear all previous pushed state */
            TParserPosition *ptr;

            while (prs->state->prev)
            {
                ptr = prs->state->prev->prev;
                pfree(prs->state->prev);
                prs->state->prev = ptr;
            }
        }
        else if (item->flags & A_MERGE)
        {                       /* merge posinfo with current and pushed state */
            TParserPosition *ptr = prs->state;

            Assert(prs->state->prev);
            prs->state = prs->state->prev;

            /* the pushed entry takes over the current entry's progress */
            prs->state->posbyte = ptr->posbyte;
            prs->state->poschar = ptr->poschar;
            prs->state->charlen = ptr->charlen;
            prs->state->lenbytetoken = ptr->lenbytetoken;
            prs->state->lenchartoken = ptr->lenchartoken;
            pfree(ptr);
        }

        /*
         * set new state if pointed; TPS_Null means "no change".  Note this
         * applies to whichever position entry is current after the stack
         * manipulation above.
         */
        if (item->tostate != TPS_Null)
            prs->state->state = item->tostate;

        /*
         * check for go away: stop once a token was found, or when we are at
         * end of input and the row did not ask to be rerun
         */
        if ((item->flags & A_BINGO) ||
            (prs->state->posbyte >= prs->lenstr &&
             (item->flags & A_RERUN) == 0))
            break;

        /* go to beginning of loop if we should rerun or we just restore state */
        if (item->flags & (A_RERUN | A_POP))
            continue;

        /* move forward */
        if (prs->state->charlen)
        {
            prs->state->posbyte += prs->state->charlen;
            prs->state->lenbytetoken += prs->state->charlen;
            prs->state->poschar++;
            prs->state->lenchartoken++;
        }
    }

    /* success only if the last applied row reported a token */
    return (item && (item->flags & A_BINGO));
}
    1832              : 
    1833              : Datum
    1834         4932 : prsd_lextype(PG_FUNCTION_ARGS)
    1835              : {
    1836         4932 :     LexDescr   *descr = palloc_array(LexDescr, LASTNUM + 1);
    1837              :     int         i;
    1838              : 
    1839       118368 :     for (i = 1; i <= LASTNUM; i++)
    1840              :     {
    1841       113436 :         descr[i - 1].lexid = i;
    1842       113436 :         descr[i - 1].alias = pstrdup(tok_alias[i]);
    1843       113436 :         descr[i - 1].descr = pstrdup(lex_descr[i]);
    1844              :     }
    1845              : 
    1846         4932 :     descr[LASTNUM].lexid = 0;
    1847              : 
    1848         4932 :     PG_RETURN_POINTER(descr);
    1849              : }
    1850              : 
    1851              : Datum
    1852         2377 : prsd_start(PG_FUNCTION_ARGS)
    1853              : {
    1854         2377 :     PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
    1855              : }
    1856              : 
    1857              : Datum
    1858        14342 : prsd_nexttoken(PG_FUNCTION_ARGS)
    1859              : {
    1860        14342 :     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
    1861        14342 :     char      **t = (char **) PG_GETARG_POINTER(1);
    1862        14342 :     int        *tlen = (int *) PG_GETARG_POINTER(2);
    1863              : 
    1864        14342 :     if (!TParserGet(p))
    1865         2377 :         PG_RETURN_INT32(0);
    1866              : 
    1867        11965 :     *t = p->token;
    1868        11965 :     *tlen = p->lenbytetoken;
    1869              : 
    1870        11965 :     PG_RETURN_INT32(p->type);
    1871              : }
    1872              : 
    1873              : Datum
    1874         2377 : prsd_end(PG_FUNCTION_ARGS)
    1875              : {
    1876         2377 :     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
    1877              : 
    1878         2377 :     TParserClose(p);
    1879         2377 :     PG_RETURN_VOID();
    1880              : }
    1881              : 
    1882              : 
    1883              : /*
    1884              :  * ts_headline support begins here
    1885              :  */
    1886              : 
/*
 * token type classification macros
 *
 * Each takes a token type number and yields nonzero if the type belongs to
 * the named group.  The argument is evaluated several times, so it must not
 * have side effects.
 *
 * HLIDREPLACE / HLIDSKIP: in mark_fragment(), when not highlighting
 *		everything, words of these types get their "replace" resp. "skip"
 *		flag set.
 * XMLHLIDSKIP: the counterpart of HLIDSKIP used by mark_fragment() when
 *		highlighting everything; it currently lists the same types.
 * NONWORDTOKEN: whitespace plus everything in the two groups above.
 * NOENDTOKEN: token types that BADENDPOINT() refuses as a headline endpoint.
 */
#define TS_IDIGNORE(x)  ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
#define HLIDREPLACE(x)  ( (x)==TAG_T )
#define HLIDSKIP(x)     ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define XMLHLIDSKIP(x)  ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
#define NOENDTOKEN(x)   ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
    1894              : 
/*
 * Macros useful in headline selection.  These rely on availability of
 * "HeadlineParsedText *prs" describing some text, and "int shortword"
 * describing the "short word" length parameter.
 *
 * In both macros j is an index into prs->words[]; it is evaluated more than
 * once, so pass an expression without side effects.
 */

/*
 * Interesting words are non-repeated search terms: the word is linked to a
 * query item and is not flagged as repeated
 */
#define INTERESTINGWORD(j) \
    (prs->words[j].item && !prs->words[j].repeated)

/*
 * Don't want to end at a non-word or a short word, unless interesting.
 * "Short" means a length of at most shortword.
 */
#define BADENDPOINT(j) \
    ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
     !INTERESTINGWORD(j))
    1909              : 
/*
 * CoverPos: bookkeeping for one candidate headline fragment.
 *
 * The position fields are indexes into the headline's word array, not byte
 * offsets or lexeme positions.
 */
typedef struct
{
    /* one cover (well, really one fragment) for mark_hl_fragments */
    int32       startpos;       /* fragment's starting word index */
    int32       endpos;         /* ending word index (inclusive) */
    int32       poslen;         /* number of interesting words */
    int32       curlen;         /* total number of words */
    bool        chosen;         /* chosen? (presumably: already selected for
                                 * output -- set by code outside this view) */
    bool        excluded;       /* excluded? (presumably: ruled out as a
                                 * candidate -- set by code outside this
                                 * view) */
} CoverPos;
    1920              : 
/*
 * hlCheck: the range of headline words a query is to be tested against.
 *
 * hlCover() points this at a sub-range of prs->words[] and passes it to
 * TS_execute() as the opaque argument for checkcondition_HL().
 */
typedef struct
{
    /* callback data for checkcondition_HL */
    HeadlineWordEntry *words;   /* first word of the range */
    int         len;            /* number of words in the range */
} hlCheck;
    1927              : 
    1928              : 
    1929              : /*
    1930              :  * TS_execute callback for matching a tsquery operand to headline words
    1931              :  *
    1932              :  * Note: it's tempting to report words[] indexes as pos values to save
    1933              :  * searching in hlCover; but that would screw up phrase matching, which
    1934              :  * expects to measure distances in lexemes not tokens.
    1935              :  */
    1936              : static TSTernaryValue
    1937          500 : checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
    1938              : {
    1939          500 :     hlCheck    *checkval = (hlCheck *) opaque;
    1940              :     int         i;
    1941              : 
    1942              :     /* scan words array for matching items */
    1943        12725 :     for (i = 0; i < checkval->len; i++)
    1944              :     {
    1945        12325 :         if (checkval->words[i].item == val)
    1946              :         {
    1947              :             /* if data == NULL, don't need to report positions */
    1948          437 :             if (!data)
    1949          100 :                 return TS_YES;
    1950              : 
    1951          337 :             if (!data->pos)
    1952              :             {
    1953          238 :                 data->pos = palloc_array(WordEntryPos, checkval->len);
    1954          238 :                 data->allocated = true;
    1955          238 :                 data->npos = 1;
    1956          238 :                 data->pos[0] = checkval->words[i].pos;
    1957              :             }
    1958           99 :             else if (data->pos[data->npos - 1] < checkval->words[i].pos)
    1959              :             {
    1960           99 :                 data->pos[data->npos++] = checkval->words[i].pos;
    1961              :             }
    1962              :         }
    1963              :     }
    1964              : 
    1965          400 :     if (data && data->npos > 0)
    1966          238 :         return TS_YES;
    1967              : 
    1968          162 :     return TS_NO;
    1969              : }
    1970              : 
    1971              : /*
    1972              :  * hlCover: try to find a substring of prs' word list that satisfies query
    1973              :  *
    1974              :  * locations is the result of TS_execute_locations() for the query.
    1975              :  * We use this to identify plausible subranges of the query.
    1976              :  *
    1977              :  * *nextpos is the lexeme position (NOT word index) to start the search
    1978              :  * at.  Caller should initialize this to zero.  If successful, we'll
    1979              :  * advance it to the next place to search at.
    1980              :  *
    1981              :  * On success, sets *p to first word index and *q to last word index of the
    1982              :  * cover substring, and returns true.
    1983              :  *
    1984              :  * The result is a minimal cover, in the sense that both *p and *q will be
    1985              :  * words used in the query.
    1986              :  */
    1987              : static bool
    1988          281 : hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
    1989              :         int *nextpos, int *p, int *q)
    1990              : {
    1991          281 :     int         pos = *nextpos;
    1992              : 
    1993              :     /* This loop repeats when our selected word-range fails the query */
    1994              :     for (;;)
    1995           30 :     {
    1996              :         int         posb,
    1997              :                     pose;
    1998              :         ListCell   *lc;
    1999              : 
    2000              :         /*
    2001              :          * For each AND'ed query term or phrase, find its first occurrence at
    2002              :          * or after pos; set pose to the maximum of those positions.
    2003              :          *
    2004              :          * We need not consider ORs or NOTs here; see the comments for
    2005              :          * TS_execute_locations().  Rechecking the match with TS_execute(),
    2006              :          * below, will deal with any ensuing imprecision.
    2007              :          */
    2008          311 :         pose = -1;
    2009          483 :         foreach(lc, locations)
    2010              :         {
    2011          233 :             ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
    2012          233 :             int         first = -1;
    2013              : 
    2014          396 :             for (int i = 0; i < pdata->npos; i++)
    2015              :             {
    2016              :                 /* For phrase matches, use the ending lexeme */
    2017          335 :                 int         endp = pdata->pos[i];
    2018              : 
    2019          335 :                 if (endp >= pos)
    2020              :                 {
    2021          172 :                     first = endp;
    2022          172 :                     break;
    2023              :                 }
    2024              :             }
    2025          233 :             if (first < 0)
    2026           61 :                 return false;   /* no more matches for this term */
    2027          172 :             if (first > pose)
    2028          163 :                 pose = first;
    2029              :         }
    2030              : 
    2031          250 :         if (pose < 0)
    2032          123 :             return false;       /* we only get here if empty list */
    2033              : 
    2034              :         /*
    2035              :          * Now, for each AND'ed query term or phrase, find its last occurrence
    2036              :          * at or before pose; set posb to the minimum of those positions.
    2037              :          *
    2038              :          * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
    2039              :          * posb + 1 below.
    2040              :          */
    2041          127 :         posb = INT_MAX - 1;
    2042          293 :         foreach(lc, locations)
    2043              :         {
    2044          166 :             ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
    2045          166 :             int         last = -1;
    2046              : 
    2047          247 :             for (int i = pdata->npos - 1; i >= 0; i--)
    2048              :             {
    2049              :                 /* For phrase matches, use the starting lexeme */
    2050          247 :                 int         startp = pdata->pos[i] - pdata->width;
    2051              : 
    2052          247 :                 if (startp <= pose)
    2053              :                 {
    2054          166 :                     last = startp;
    2055          166 :                     break;
    2056              :                 }
    2057              :             }
    2058          166 :             if (last < posb)
    2059          136 :                 posb = last;
    2060              :         }
    2061              : 
    2062              :         /*
    2063              :          * We could end up with posb to the left of pos, in case some phrase
    2064              :          * match crosses pos.  Try the match starting at pos anyway, since the
    2065              :          * result of TS_execute_locations is imprecise for phrase matches OR'd
    2066              :          * with plain matches; that is, if the query is "(A <-> B) | C" then C
    2067              :          * could match at pos even though the phrase match would have to
    2068              :          * extend to the left of pos.
    2069              :          */
    2070          127 :         posb = Max(posb, pos);
    2071              : 
    2072              :         /* This test probably always succeeds, but be paranoid */
    2073          127 :         if (posb <= pose)
    2074              :         {
    2075              :             /*
    2076              :              * posb .. pose is now the shortest, earliest-after-pos range of
    2077              :              * lexeme positions containing all the query terms.  It will
    2078              :              * contain all phrase matches, too, except in the corner case
    2079              :              * described just above.
    2080              :              *
    2081              :              * Now convert these lexeme positions to indexes in prs->words[].
    2082              :              */
    2083          127 :             int         idxb = -1;
    2084          127 :             int         idxe = -1;
    2085              : 
    2086         5812 :             for (int i = 0; i < prs->curwords; i++)
    2087              :             {
    2088         5748 :                 if (prs->words[i].item == NULL)
    2089         5306 :                     continue;
    2090          442 :                 if (idxb < 0 && prs->words[i].pos >= posb)
    2091          127 :                     idxb = i;
    2092          442 :                 if (prs->words[i].pos <= pose)
    2093          379 :                     idxe = i;
    2094              :                 else
    2095           63 :                     break;
    2096              :             }
    2097              : 
    2098              :             /* This test probably always succeeds, but be paranoid */
    2099          127 :             if (idxb >= 0 && idxe >= idxb)
    2100              :             {
    2101              :                 /*
    2102              :                  * Finally, check that the selected range satisfies the query.
    2103              :                  * This should succeed in all simple cases; but odd cases
    2104              :                  * involving non-top-level NOT conditions or phrase matches
    2105              :                  * OR'd with other things could fail, since the result of
    2106              :                  * TS_execute_locations doesn't fully represent such things.
    2107              :                  */
    2108              :                 hlCheck     ch;
    2109              : 
    2110          127 :                 ch.words = &(prs->words[idxb]);
    2111          127 :                 ch.len = idxe - idxb + 1;
    2112          127 :                 if (TS_execute(GETQUERY(query), &ch,
    2113              :                                TS_EXEC_EMPTY, checkcondition_HL))
    2114              :                 {
    2115              :                     /* Match!  Advance *nextpos and return the word range. */
    2116           97 :                     *nextpos = posb + 1;
    2117           97 :                     *p = idxb;
    2118           97 :                     *q = idxe;
    2119           97 :                     return true;
    2120              :                 }
    2121              :             }
    2122              :         }
    2123              : 
    2124              :         /*
    2125              :          * Advance pos and try again.  Any later workable match must start
    2126              :          * beyond posb.
    2127              :          */
    2128           30 :         pos = posb + 1;
    2129              :     }
    2130              :     /* Can't get here, but stupider compilers complain if we leave it off */
    2131              :     return false;
    2132              : }
    2133              : 
    2134              : /*
    2135              :  * Apply suitable highlight marking to words selected by headline selector:
    2136              :  * a word with a nonzero item is set "selected", and "in" is set to 1 (or to 0 for a word flagged repeated).
    2137              :  * The words from startpos to endpos inclusive are marked per highlightall (false: replace/skip per HLIDREPLACE/HLIDSKIP; true: skip per XMLHLIDSKIP); an empty range marks nothing.
    2138              :  */
    2139              : static void
    2140          193 : mark_fragment(HeadlineParsedText *prs, bool highlightall,
    2141              :               int startpos, int endpos)
    2142              : {
    2143              :     int         i;
    2144              : 
    2145         2827 :     for (i = startpos; i <= endpos; i++)
    2146              :     {
    2147         2634 :         if (prs->words[i].item)
    2148          250 :             prs->words[i].selected = 1;
    2149         2634 :         if (!highlightall)
    2150              :         {
    2151         2511 :             if (HLIDREPLACE(prs->words[i].type))
    2152            0 :                 prs->words[i].replace = 1;
    2153         2511 :             else if (HLIDSKIP(prs->words[i].type))
    2154            0 :                 prs->words[i].skip = 1;
    2155              :         }
    2156              :         else
    2157              :         {
    2158          123 :             if (XMLHLIDSKIP(prs->words[i].type))
    2159            3 :                 prs->words[i].skip = 1;
    2160              :         }
    2161              : 
    2162         2634 :         prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
    2163              :     }
    2164          193 : }
    2165              : 
    2166              : /*
    2167              :  * Split a cover substring into fragments not longer than max_words.
    2168              :  *
    2169              :  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
    2170              :  * substring, as indexes into prs->words.  They are updated to hold the bounds of the next fragment.
    2171              :  *
    2172              :  * *curlen and *poslen are set to the fragment's length, in words (tokens for which
    2173              :  * NONWORDTOKEN is false) and interesting words (per INTERESTINGWORD) respectively.
    2174              :  */
    2175              : static void
    2176           18 : get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
    2177              :                   int *curlen, int *poslen, int max_words)
    2178              : {
    2179              :     int         i;
    2180              : 
    2181              :     /*
    2182              :      * Objective: select a fragment of words between startpos and endpos such
    2183              :      * that it has at most max_words and both ends have query words.  If the
    2184              :      * startpos and endpos are the endpoints of the cover and the cover has
    2185              :      * fewer words than max_words, then this function should just return the
    2186              :      * cover.
    2187              :      */
    2188              :     /* first move startpos forward to an interesting word (it ends up at endpos if there is none) */
    2189          444 :     for (i = *startpos; i <= *endpos; i++)
    2190              :     {
    2191          444 :         *startpos = i;
    2192          444 :         if (INTERESTINGWORD(i))
    2193           18 :             break;
    2194              :     }
    2195              :     /* count words and interesting words from startpos, stopping once max_words words are counted */
    2196           18 :     *curlen = 0;
    2197           18 :     *poslen = 0;
    2198          480 :     for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
    2199              :     {
    2200          462 :         if (!NONWORDTOKEN(prs->words[i].type))
    2201          240 :             *curlen += 1;
    2202          462 :         if (INTERESTINGWORD(i))
    2203           27 :             *poslen += 1;
    2204              :     }
    2205              :     /* if the cover was cut then move back endpos to a query item; NOTE(review): the scan starts at i, the first index the loop above did not count, so *curlen and *poslen may be off by one -- confirm this is intended */
    2206           18 :     if (*endpos > i)
    2207              :     {
    2208            6 :         *endpos = i;
    2209          420 :         for (i = *endpos; i >= *startpos; i--)
    2210              :         {
    2211          420 :             *endpos = i;
    2212          420 :             if (INTERESTINGWORD(i))
    2213            6 :                 break;
    2214          414 :             if (!NONWORDTOKEN(prs->words[i].type))
    2215          204 :                 *curlen -= 1;
    2216              :         }
    2217              :     }
    2218           18 : }
    2219              : 
    2220              : /*
    2221              :  * Headline selector used when MaxFragments > 0: marks up to max_fragments fragments, each a piece of a query cover cut per max_words and then stretched; if none is chosen, the first min_words words are marked instead.
    2222              :  *
    2223              :  * Note: in this mode, highlightall is disregarded for phrase selection;
    2224              :  * it only controls presentation details.  (shortword is not referenced by name in this body; presumably the BADENDPOINT macro reads it -- confirm.)
    2225              :  */
    2226              : static void
    2227           15 : mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations,
    2228              :                   bool highlightall,
    2229              :                   int shortword, int min_words,
    2230              :                   int max_words, int max_fragments)
    2231              : {
    2232              :     int32       poslen,
    2233              :                 curlen,
    2234              :                 i,
    2235              :                 f,
    2236           15 :                 num_f = 0;
    2237              :     int32       stretch,
    2238              :                 maxstretch,
    2239              :                 posmarker;
    2240              : 
    2241           15 :     int32       startpos = 0,
    2242           15 :                 endpos = 0,
    2243           15 :                 nextpos = 0,
    2244           15 :                 p = 0,
    2245           15 :                 q = 0;
    2246              : 
    2247           15 :     int32       numcovers = 0,
    2248           15 :                 maxcovers = 32;
    2249              : 
    2250              :     int32       minI,
    2251              :                 minwords,
    2252              :                 maxitems;
    2253              :     CoverPos   *covers;
    2254              : 
    2255           15 :     covers = palloc(maxcovers * sizeof(CoverPos));
    2256              : 
    2257              :     /* get all covers: each hlCover success yields a word range p..q */
    2258           27 :     while (hlCover(prs, query, locations, &nextpos, &p, &q))
    2259              :     {
    2260           12 :         startpos = p;
    2261           12 :         endpos = q;
    2262              : 
    2263              :         /*
    2264              :          * Break the cover into smaller fragments such that each fragment has
    2265              :          * at most max_words.  Also ensure that each end of each fragment is a
    2266              :          * query word.  This will allow us to stretch the fragment in either
    2267              :          * direction.  Each fragment is recorded as its own entry in covers[], which is doubled in size when full.
    2268              :          */
    2269              : 
    2270           30 :         while (startpos <= endpos)
    2271              :         {
    2272           18 :             get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
    2273           18 :             if (numcovers >= maxcovers)
    2274              :             {
    2275            0 :                 maxcovers *= 2;
    2276            0 :                 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
    2277              :             }
    2278           18 :             covers[numcovers].startpos = startpos;
    2279           18 :             covers[numcovers].endpos = endpos;
    2280           18 :             covers[numcovers].curlen = curlen;
    2281           18 :             covers[numcovers].poslen = poslen;
    2282           18 :             covers[numcovers].chosen = false;
    2283           18 :             covers[numcovers].excluded = false;
    2284           18 :             numcovers++;
    2285           18 :             startpos = endpos + 1;
    2286           18 :             endpos = q;
    2287              :         }
    2288              :     }
    2289              : 
    2290              :     /* choose best covers, at most one per iteration and max_fragments in total */
    2291           33 :     for (f = 0; f < max_fragments; f++)
    2292              :     {
    2293           24 :         maxitems = 0;
    2294           24 :         minwords = PG_INT32_MAX;
    2295           24 :         minI = -1;
    2296              : 
    2297              :         /*
    2298              :          * Among covers neither chosen nor excluded so far, choose the one that contains max items (poslen).
    2299              :          * In case of tie choose the one with smaller number of words (curlen).
    2300              :          */
    2301           57 :         for (i = 0; i < numcovers; i++)
    2302              :         {
    2303           33 :             if (!covers[i].chosen && !covers[i].excluded &&
    2304           24 :                 (maxitems < covers[i].poslen ||
    2305            6 :                  (maxitems == covers[i].poslen &&
    2306            6 :                   minwords > covers[i].curlen)))
    2307              :             {
    2308           18 :                 maxitems = covers[i].poslen;
    2309           18 :                 minwords = covers[i].curlen;
    2310           18 :                 minI = i;
    2311              :             }
    2312              :         }
    2313              :         /* if a cover was found mark it */
    2314           24 :         if (minI >= 0)
    2315              :         {
    2316           18 :             covers[minI].chosen = true;
    2317              :             /* adjust the size of cover, working on local copies of its bounds and length */
    2318           18 :             startpos = covers[minI].startpos;
    2319           18 :             endpos = covers[minI].endpos;
    2320           18 :             curlen = covers[minI].curlen;
    2321              :             /* stretch the cover if cover size is lower than max_words */
    2322           18 :             if (curlen < max_words)
    2323              :             {
    2324              :                 /* allow at most half of the spare room to be used on the left side of the cover */
    2325           18 :                 maxstretch = (max_words - curlen) / 2;
    2326              : 
    2327              :                 /*
    2328              :                  * First stretch the startpos leftwards.  Stop stretching if (1) we
    2329              :                  * hit the beginning of document, (2) we reach maxstretch, or (3) we
    2330              :                  * hit an already marked fragment (a word whose "in" flag is set).
    2331              :                  */
    2332           18 :                 stretch = 0;
    2333           18 :                 posmarker = startpos;
    2334          300 :                 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
    2335              :                 {
    2336          282 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2337              :                     {
    2338          135 :                         curlen++;
    2339          135 :                         stretch++;
    2340              :                     }
    2341          282 :                     posmarker = i;
    2342              :                 }
    2343              :                 /* cut back startpos till we find a good endpoint (BADENDPOINT false) or return to the original startpos */
    2344           66 :                 for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
    2345              :                 {
    2346           48 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2347           18 :                         curlen--;
    2348              :                 }
    2349           18 :                 startpos = i;
    2350              :                 /* now stretch the endpos as much as possible: until max_words, end of document, or an already marked word */
    2351           18 :                 posmarker = endpos;
    2352          483 :                 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
    2353              :                 {
    2354          465 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2355          231 :                         curlen++;
    2356          465 :                     posmarker = i;
    2357              :                 }
    2358              :                 /* cut back endpos till we find a good endpoint (BADENDPOINT false) or return to the original endpos */
    2359           45 :                 for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
    2360              :                 {
    2361           27 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2362           12 :                         curlen--;
    2363              :                 }
    2364           18 :                 endpos = i;
    2365              :             }
    2366           18 :             covers[minI].startpos = startpos;
    2367           18 :             covers[minI].endpos = endpos;
    2368           18 :             covers[minI].curlen = curlen;
    2369              :             /* Mark the chosen fragment (cover) */
    2370           18 :             mark_fragment(prs, highlightall, startpos, endpos);
    2371           18 :             num_f++;
    2372              :             /* Exclude covers overlapping this one (as stretched) from future consideration */
    2373           48 :             for (i = 0; i < numcovers; i++)
    2374              :             {
    2375           30 :                 if (i != minI &&
    2376           12 :                     ((covers[i].startpos >= startpos &&
    2377            6 :                       covers[i].startpos <= endpos) ||
    2378           12 :                      (covers[i].endpos >= startpos &&
    2379            6 :                       covers[i].endpos <= endpos) ||
    2380           12 :                      (covers[i].startpos < startpos &&
    2381            6 :                       covers[i].endpos > endpos)))
    2382            0 :                     covers[i].excluded = true;
    2383              :             }
    2384              :         }
    2385              :         else
    2386            6 :             break;              /* no selectable covers remain */
    2387              :     }
    2388              : 
    2389              :     /* show the first min_words words if we have not marked anything (endpos stays -1, marking nothing, if there are no words) */
    2390           15 :     if (num_f <= 0)
    2391              :     {
    2392            3 :         startpos = curlen = 0;
    2393            3 :         endpos = -1;
    2394           93 :         for (i = 0; i < prs->curwords && curlen < min_words; i++)
    2395              :         {
    2396           90 :             if (!NONWORDTOKEN(prs->words[i].type))
    2397           45 :                 curlen++;
    2398           90 :             endpos = i;
    2399              :         }
    2400            3 :         mark_fragment(prs, highlightall, startpos, endpos);
    2401              :     }
    2402              : 
    2403           15 :     pfree(covers);
    2404           15 : }
    2405              : 
    2406              : /*
    2407              :  * Headline selector used when MaxFragments == 0: marks one range of words via mark_fragment, namely the best headline built from the covers hlCover reports, or the whole document when highlightall is true
    2408              :  */
    2409              : static void
    2410          172 : mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations,
    2411              :               bool highlightall,
    2412              :               int shortword, int min_words, int max_words)
    2413              : {
    2414          172 :     int         nextpos = 0,
    2415          172 :                 p = 0,
    2416          172 :                 q = 0;
    2417          172 :     int         bestb = -1,
    2418          172 :                 beste = -1;
    2419          172 :     int         bestlen = -1;
    2420          172 :     bool        bestcover = false;
    2421              :     int         pose,
    2422              :                 posb,
    2423              :                 poslen,
    2424              :                 curlen;
    2425              :     bool        poscover;
    2426              :     int         i;
    2427              : 
    2428          172 :     if (!highlightall)
    2429              :     {
    2430              :         /* examine all covers (word ranges p..q), select a headline using the best one */
    2431          254 :         while (hlCover(prs, query, locations, &nextpos, &p, &q))
    2432              :         {
    2433              :             /*
    2434              :              * Count words (curlen) and interesting words (poslen) within
    2435              :              * cover, but stop once we reach max_words.  This step doesn't
    2436              :              * consider whether that's a good stopping point.  posb and pose
    2437              :              * are set to the start and end indexes of the possible headline.
    2438              :              */
    2439           85 :             curlen = 0;
    2440           85 :             poslen = 0;
    2441           85 :             posb = pose = p;
    2442          728 :             for (i = p; i <= q && curlen < max_words; i++)
    2443              :             {
    2444          643 :                 if (!NONWORDTOKEN(prs->words[i].type))
    2445          364 :                     curlen++;
    2446          643 :                 if (INTERESTINGWORD(i))
    2447          145 :                     poslen++;
    2448          643 :                 pose = i;
    2449              :             }
    2450              : 
    2451           85 :             if (curlen < max_words)
    2452              :             {
    2453              :                 /*
    2454              :                  * We have room to lengthen the headline, so search forward
    2455              :                  * until it's full or we find a good stopping point.  We'll
    2456              :                  * reconsider the word at "q" (already counted, hence the i > q test), then move forward.
    2457              :                  */
    2458         1469 :                 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
    2459              :                 {
    2460         1456 :                     if (i > q)
    2461              :                     {
    2462         1377 :                         if (!NONWORDTOKEN(prs->words[i].type))
    2463          687 :                             curlen++;
    2464         1377 :                         if (INTERESTINGWORD(i))
    2465           60 :                             poslen++;
    2466              :                     }
    2467         1456 :                     pose = i;
    2468         1456 :                     if (BADENDPOINT(i))
    2469          972 :                         continue;
    2470          484 :                     if (curlen >= min_words)
    2471           66 :                         break;
    2472              :                 }
    2473           79 :                 if (curlen < min_words)
    2474              :                 {
    2475              :                     /*
    2476              :                      * Reached end of text and our headline is still shorter
    2477              :                      * than min_words, so try to extend it to the left; posb is clamped to 0 if we run off the start.
    2478              :                      */
    2479          183 :                     for (i = p - 1; i >= 0; i--)
    2480              :                     {
    2481          182 :                         if (!NONWORDTOKEN(prs->words[i].type))
    2482           91 :                             curlen++;
    2483          182 :                         if (INTERESTINGWORD(i))
    2484            3 :                             poslen++;
    2485          182 :                         if (curlen >= max_words)
    2486            0 :                             break;
    2487          182 :                         if (BADENDPOINT(i))
    2488          118 :                             continue;
    2489           64 :                         if (curlen >= min_words)
    2490           12 :                             break;
    2491              :                     }
    2492           13 :                     posb = (i >= 0) ? i : 0;
    2493              :                 }
    2494              :             }
    2495              :             else
    2496              :             {
    2497              :                 /*
    2498              :                  * Can't make headline longer, so consider making it shorter
    2499              :                  * if needed to avoid a bad endpoint, but not below min_words.
    2500              :                  */
    2501            6 :                 if (i > q)
    2502            3 :                     i = q;
    2503           15 :                 for (; curlen > min_words; i--)
    2504              :                 {
    2505           15 :                     if (!BADENDPOINT(i))
    2506              :                         break;
    2507            9 :                     if (!NONWORDTOKEN(prs->words[i].type))
    2508            3 :                         curlen--;
    2509            9 :                     if (INTERESTINGWORD(i))
    2510            0 :                         poslen--;
    2511            9 :                     pose = i - 1;
    2512              :                 }
    2513              :             }
    2514              : 
    2515              :             /*
    2516              :              * Check whether the proposed headline includes the original
    2517              :              * cover; it might not if we trimmed it due to max_words.
    2518              :              */
    2519           85 :             poscover = (posb <= p && pose >= q);
    2520              : 
    2521              :             /*
    2522              :              * Adopt this headline if it's better than the last one, giving
    2523              :              * highest priority to headlines including the cover, then to
    2524              :              * headlines with more interesting words, then to headlines with
    2525              :              * good stopping points.  (Since bestlen is initially -1, we will
    2526              :              * certainly adopt the first headline.)
    2527              :              */
    2528           85 :             if (poscover > bestcover ||
    2529           39 :                 (poscover == bestcover && poslen > bestlen) ||
    2530           36 :                 (poscover == bestcover && poslen == bestlen &&
    2531            6 :                  !BADENDPOINT(pose) && BADENDPOINT(beste)))
    2532              :             {
    2533           49 :                 bestb = posb;
    2534           49 :                 beste = pose;
    2535           49 :                 bestlen = poslen;
    2536           49 :                 bestcover = poscover;
    2537              :             }
    2538              :         }
    2539              : 
    2540              :         /*
    2541              :          * If we found nothing acceptable, select min_words words starting at
    2542              :          * the beginning (pose stays -1, selecting nothing, when there are no words).
    2543              :          */
    2544          169 :         if (bestlen < 0)
    2545              :         {
    2546          120 :             curlen = 0;
    2547          120 :             pose = -1;
    2548          519 :             for (i = 0; i < prs->curwords && curlen < min_words; i++)
    2549              :             {
    2550          399 :                 if (!NONWORDTOKEN(prs->words[i].type))
    2551          258 :                     curlen++;
    2552          399 :                 pose = i;
    2553              :             }
    2554          120 :             bestb = 0;
    2555          120 :             beste = pose;
    2556              :         }
    2557              :     }
    2558              :     else
    2559              :     {
    2560              :         /* highlightall mode: headline is whole document; covers are not examined */
    2561            3 :         bestb = 0;
    2562            3 :         beste = prs->curwords - 1;
    2563              :     }
    2564              : 
    2565          172 :     mark_fragment(prs, highlightall, bestb, beste);
    2566          172 : }
    2567              : 
    2568              : /*
    2569              :  * Default parser's prsheadline function.  Arguments: (0) the HeadlineParsedText to mark up, which is also the return value; (1) a List of DefElem options; (2) the TSQuery.
    2570              :  */
    2571              : Datum
    2572          187 : prsd_headline(PG_FUNCTION_ARGS)
    2573              : {
    2574          187 :     HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
    2575          187 :     List       *prsoptions = (List *) PG_GETARG_POINTER(1);
    2576          187 :     TSQuery     query = PG_GETARG_TSQUERY(2);
    2577              :     List       *locations;
    2578              : 
    2579              :     /* default option values (used for any option not present in prsoptions): */
    2580          187 :     int         min_words = 15;
    2581          187 :     int         max_words = 35;
    2582          187 :     int         shortword = 3;
    2583          187 :     int         max_fragments = 0;
    2584          187 :     bool        highlightall = false;
    2585              :     ListCell   *l;
    2586              : 
    2587              :     /* Extract configuration option values; names are compared case-insensitively, HighlightAll is true only for 1, on, true, t, y or yes, and an unknown name is an error */
    2588          187 :     prs->startsel = NULL;
    2589          187 :     prs->stopsel = NULL;
    2590          187 :     prs->fragdelim = NULL;
    2591          364 :     foreach(l, prsoptions)
    2592              :     {
    2593          177 :         DefElem    *defel = (DefElem *) lfirst(l);
    2594          177 :         char       *val = defGetString(defel);
    2595              : 
    2596          177 :         if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
    2597           18 :             max_words = pg_strtoint32(val);
    2598          159 :         else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
    2599           18 :             min_words = pg_strtoint32(val);
    2600          141 :         else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
    2601            0 :             shortword = pg_strtoint32(val);
    2602          141 :         else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
    2603           15 :             max_fragments = pg_strtoint32(val);
    2604          126 :         else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
    2605           60 :             prs->startsel = pstrdup(val);
    2606           66 :         else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
    2607           60 :             prs->stopsel = pstrdup(val);
    2608            6 :         else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
    2609            3 :             prs->fragdelim = pstrdup(val);
    2610            3 :         else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
    2611            9 :             highlightall = (pg_strcasecmp(val, "1") == 0 ||
    2612            6 :                             pg_strcasecmp(val, "on") == 0 ||
    2613            3 :                             pg_strcasecmp(val, "true") == 0 ||
    2614            0 :                             pg_strcasecmp(val, "t") == 0 ||
    2615            6 :                             pg_strcasecmp(val, "y") == 0 ||
    2616            0 :                             pg_strcasecmp(val, "yes") == 0);
    2617              :         else
    2618            0 :             ereport(ERROR,
    2619              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2620              :                      errmsg("unrecognized headline parameter: \"%s\"",
    2621              :                             defel->defname)));
    2622              :     }
    2623              : 
    2624              :     /* in HighlightAll mode these parameters are ignored (NOTE(review): mark_hl_fragments below still uses them when MaxFragments is nonzero -- confirm) */
    2625          187 :     if (!highlightall)
    2626              :     {
    2627          184 :         if (min_words >= max_words)
    2628            0 :             ereport(ERROR,
    2629              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2630              :                      errmsg("%s must be less than %s", "MinWords", "MaxWords")));
    2631          184 :         if (min_words <= 0)
    2632            0 :             ereport(ERROR,
    2633              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2634              :                      errmsg("%s must be positive", "MinWords")));
    2635          184 :         if (shortword < 0)
    2636            0 :             ereport(ERROR,
    2637              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2638              :                      errmsg("%s must be >= 0", "ShortWord")));
    2639          184 :         if (max_fragments < 0)
    2640            0 :             ereport(ERROR,
    2641              :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    2642              :                      errmsg("%s must be >= 0", "MaxFragments")));
    2643              :     }
    2644              : 
    2645              :     /* Locate words and phrases matching the query, evaluated over all of prs->words */
    2646          187 :     if (query->size > 0)
    2647              :     {
    2648              :         hlCheck     ch;
    2649              : 
    2650          181 :         ch.words = prs->words;
    2651          181 :         ch.len = prs->curwords;
    2652          181 :         locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
    2653              :                                          checkcondition_HL);
    2654              :     }
    2655              :     else
    2656            6 :         locations = NIL;        /* empty query matches nothing */
    2657              : 
    2658              :     /* Apply appropriate headline selector: single-range when MaxFragments is 0, fragment-based otherwise */
    2659          187 :     if (max_fragments == 0)
    2660          172 :         mark_hl_words(prs, query, locations, highlightall, shortword,
    2661              :                       min_words, max_words);
    2662              :     else
    2663           15 :         mark_hl_fragments(prs, query, locations, highlightall, shortword,
    2664              :                           min_words, max_words, max_fragments);
    2665              : 
    2666              :     /* Fill in default values for string options that were not supplied */
    2667          187 :     if (!prs->startsel)
    2668          127 :         prs->startsel = pstrdup("<b>");
    2669          187 :     if (!prs->stopsel)
    2670          127 :         prs->stopsel = pstrdup("</b>");
    2671          187 :     if (!prs->fragdelim)
    2672          184 :         prs->fragdelim = pstrdup(" ... ");
    2673              : 
    2674              :     /* Caller will need these lengths, too */
    2675          187 :     prs->startsellen = strlen(prs->startsel);
    2676          187 :     prs->stopsellen = strlen(prs->stopsel);
    2677          187 :     prs->fragdelimlen = strlen(prs->fragdelim);
    2678              : 
    2679          187 :     PG_RETURN_POINTER(prs);
    2680              : }
        

Generated by: LCOV version 2.0-1