LCOV - code coverage report
Current view: top level - src/backend/utils/adt - tsvector_parser.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 121 142 85.2 %
Date: 2019-09-22 07:07:17 Functions: 4 5 80.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tsvector_parser.c
       4             :  *    Parser for tsvector
       5             :  *
       6             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/utils/adt/tsvector_parser.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
      15             : #include "postgres.h"
      16             : 
      17             : #include "tsearch/ts_locale.h"
      18             : #include "tsearch/ts_utils.h"
      19             : 
      20             : 
      21             : /*
      22             :  * Private state of tsvector parser.  Note that tsquery also uses this code to
      23             :  * parse its input, hence the boolean flags.  oprisdelim and is_tsquery are both
      24             :  * true or both false in current usage, but we keep them separate for clarity.
      25             :  * is_tsquery affects *only* the content of error messages.
      26             :  */
      27             : struct TSVectorParseStateData
      28             : {
      29             :     char       *prsbuf;         /* next input character */
      30             :     char       *bufstart;       /* whole string (used only for errors) */
      31             :     char       *word;           /* buffer to hold the current word */
      32             :     int         len;            /* size in bytes allocated for 'word' */
      33             :     int         eml;            /* max bytes per character */
      34             :     bool        oprisdelim;     /* treat ISOPERATOR() chars as delimiters? */
      35             :     bool        is_tsquery;     /* say "tsquery" not "tsvector" in errors? */
      36             :     bool        is_web;         /* websearch_to_tsquery() mode: no ' or backslash quoting, '"' ends a word */
      37             : };
      38             : 
      39             : 
      40             : /*
      41             :  * Creates a parser for 'input'.  'flags' is a bitmask of P_TSV_* bits; if
      42             :  * P_TSV_OPR_IS_DELIM is set, the following characters are treated as
      43             :  * delimiters in addition to whitespace: ! | & ( )
      44             :  */
      45             : TSVectorParseState
      46        4408 : init_tsvector_parser(char *input, int flags)
      47             : {
      48             :     TSVectorParseState state;
      49             : 
      50        4408 :     state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
      51        4408 :     state->prsbuf = input;      /* input is not copied; we scan it in place */
      52        4408 :     state->bufstart = input;    /* remembered for error messages */
      53        4408 :     state->len = 32;            /* initial 'word' size; RESIZEPRSBUF doubles it */
      54        4408 :     state->word = (char *) palloc(state->len);
      55        4408 :     state->eml = pg_database_encoding_max_length();
      56        4408 :     state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
      57        4408 :     state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
      58        4408 :     state->is_web = (flags & P_TSV_IS_WEB) != 0;
      59             : 
      60        4408 :     return state;
      61             : }
      62             : 
      63             : /*
      64             :  * Repoints the parser at 'input'; all other state, including bufstart, is kept.
      65             :  */
      66             : void
      67        4580 : reset_tsvector_parser(TSVectorParseState state, char *input)
      68             : {
      69        4580 :     state->prsbuf = input;      /* error messages still quote the old bufstart */
      70        4580 : }
      71             : 
      72             : /*
      73             :  * Shuts down a tsvector parser, freeing its word buffer and the state itself.
      74             :  */
      75             : void
      76        4408 : close_tsvector_parser(TSVectorParseState state)
      77             : {
      78        4408 :     pfree(state->word);         /* token pointers handed out via *strval die here */
      79        4408 :     pfree(state);
      80        4408 : }
      81             : 
      82             : /* double 'word' if fewer than eml + 1 bytes remain past curpos; uses locals 'state' and 'curpos' */
      83             : #define RESIZEPRSBUF \
      84             : do { \
      85             :     int clen = curpos - state->word; /* bytes already stored */ \
      86             :     if ( clen + state->eml >= state->len ) \
      87             :     { \
      88             :         state->len *= 2; \
      89             :         state->word = (char *) repalloc(state->word, state->len); \
      90             :         curpos = state->word + clen; /* re-base: the buffer may have moved */ \
      91             :     } \
      92             : } while (0)
      93             : 
      94             : /* Fills gettoken_tsvector's output parameters (freeing pos if unwanted), and returns true */
      95             : #define RETURN_TOKEN \
      96             : do { \
      97             :     if (pos_ptr != NULL) \
      98             :     { \
      99             :         *pos_ptr = pos; /* may be NULL if no positions were parsed */ \
     100             :         *poslen = npos; /* poslen is not NULL-checked when pos_ptr is given */ \
     101             :     } \
     102             :     else if (pos != NULL) \
     103             :         pfree(pos); \
     104             :     \
     105             :     if (strval != NULL) \
     106             :         *strval = state->word; /* parser-owned buffer, not a copy */ \
     107             :     if (lenval != NULL) \
     108             :         *lenval = curpos - state->word; \
     109             :     if (endptr != NULL) \
     110             :         *endptr = state->prsbuf; \
     111             :     return true; \
     112             : } while(0)
     113             : 
     114             : 
     115             : /* State codes used in gettoken_tsvector */
     116             : #define WAITWORD        1       /* skipping whitespace, looking for start of a word */
     117             : #define WAITENDWORD     2       /* inside an unquoted word */
     118             : #define WAITNEXTCHAR    3       /* after a backslash: copy next char, then go back to oldstate */
     119             : #define WAITENDCMPLX    4       /* inside a single-quoted word */
     120             : #define WAITPOSINFO     5       /* after a quoted word: ':' starts position info, else token ends */
     121             : #define INPOSINFO       6       /* expecting the first digit of a position */
     122             : #define WAITPOSDELIM    7       /* after a position: ',', weight letter, more digits, or end */
     123             : #define WAITCHARCMPLX   8       /* saw ' in a quoted word: '' is a literal quote, else word ends */
     124             : 
     125             : #define PRSSYNTAXERROR prssyntaxerror(state)
     126             : /* Reports a syntax error that quotes the whole input string (bufstart). */
     127             : static void
     128           0 : prssyntaxerror(TSVectorParseState state)
     129             : {
     130           0 :     ereport(ERROR,
     131             :             (errcode(ERRCODE_SYNTAX_ERROR),
     132             :              state->is_tsquery ?
     133             :              errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
     134             :              errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
     135             : }
     136             : 
     137             : 
     138             : /*
     139             :  * Get next token from string being parsed. Returns true if successful,
     140             :  * false if end of input string is reached.  On success, these output
     141             :  * parameters are filled in:
     142             :  *
     143             :  * *strval      pointer to token (parser-owned buffer, rewritten by next call)
     144             :  * *lenval      length of *strval in bytes, not counting the terminating NUL
     145             :  * *pos_ptr     pointer to a palloc'd array of positions and weights
     146             :  *              associated with the token. If the caller is not interested
     147             :  *              in the information, NULL can be supplied. Otherwise
     148             :  *              the caller is responsible for pfreeing the array.
     149             :  * *poslen      number of elements in *pos_ptr (set only if pos_ptr != NULL)
     150             :  * *endptr      scan resumption point
     151             :  *
     152             :  * Pass NULL for unwanted output parameters.
     153             :  */
     154             : bool
     155      127588 : gettoken_tsvector(TSVectorParseState state,
     156             :                   char **strval, int *lenval,
     157             :                   WordEntryPos **pos_ptr, int *poslen,
     158             :                   char **endptr)
     159             : {
     160      127588 :     int         oldstate = 0;   /* state to resume after WAITNEXTCHAR */
     161      127588 :     char       *curpos = state->word;   /* write position in word buffer */
     162      127588 :     int         statecode = WAITWORD;
     163             : 
     164             :     /*
     165             :      * pos collects the comma-delimited list of positions (and weights) that
     166             :      * follows the actual token.
     167             :      */
     168      127588 :     WordEntryPos *pos = NULL;
     169      127588 :     int         npos = 0;       /* elements of pos used */
     170      127588 :     int         posalen = 0;    /* allocated size of pos */
     171             : 
     172             :     while (1)
     173             :     {
     174      881716 :         if (statecode == WAITWORD)
     175             :         {
     176      245884 :             if (*(state->prsbuf) == '\0')
     177        2482 :                 return false;   /* end of input, no more tokens */
     178      243402 :             else if (!state->is_web && t_iseq(state->prsbuf, '\''))
     179         100 :                 statecode = WAITENDCMPLX;   /* opening quote */
     180      243302 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     181             :             {
     182           4 :                 statecode = WAITNEXTCHAR;
     183           4 :                 oldstate = WAITENDWORD;
     184             :             }
     185      486596 :             else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     186      244126 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     187           0 :                 PRSSYNTAXERROR; /* delimiter where a word must start */
     188      243298 :             else if (!t_isspace(state->prsbuf))
     189             :             {
     190      125002 :                 COPYCHAR(curpos, state->prsbuf);    /* first char of the word */
     191      125002 :                 curpos += pg_mblen(state->prsbuf);
     192      125002 :                 statecode = WAITENDWORD;
     193             :             }
     194             :         }
     195      258768 :         else if (statecode == WAITNEXTCHAR)
     196             :         {
     197         108 :             if (*(state->prsbuf) == '\0')
     198           0 :                 ereport(ERROR,
     199             :                         (errcode(ERRCODE_SYNTAX_ERROR),
     200             :                          errmsg("there is no escaped character: \"%s\"",
     201             :                                 state->bufstart)));
     202             :             else
     203             :             {
     204         108 :                 RESIZEPRSBUF;
     205         108 :                 COPYCHAR(curpos, state->prsbuf);    /* escaped char is taken literally */
     206         108 :                 curpos += pg_mblen(state->prsbuf);
     207             :                 Assert(oldstate != 0);
     208         108 :                 statecode = oldstate;
     209             :             }
     210             :         }
     211      258660 :         else if (statecode == WAITENDWORD)
     212             :         {
     213      253936 :             if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     214             :             {
     215          48 :                 statecode = WAITNEXTCHAR;
     216          48 :                 oldstate = WAITENDWORD;
     217             :             }
     218      385034 :             else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
     219      269402 :                      (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     220      132246 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     221             :             {
     222      123782 :                 RESIZEPRSBUF;
     223      123782 :                 if (curpos == state->word)
     224           0 :                     PRSSYNTAXERROR; /* empty word */
     225      123782 :                 *(curpos) = '\0';
     226      123782 :                 RETURN_TOKEN;   /* prsbuf is left on the delimiter */
     227             :             }
     228      130106 :             else if (t_iseq(state->prsbuf, ':'))
     229             :             {
     230        1224 :                 if (curpos == state->word)
     231           0 :                     PRSSYNTAXERROR;
     232        1224 :                 *(curpos) = '\0';
     233        1224 :                 if (state->oprisdelim)
     234         296 :                     RETURN_TOKEN;   /* ':' is left for the caller to handle */
     235             :                 else
     236         928 :                     statecode = INPOSINFO;
     237             :             }
     238             :             else
     239             :             {
     240      128882 :                 RESIZEPRSBUF;
     241      128882 :                 COPYCHAR(curpos, state->prsbuf);
     242      128882 :                 curpos += pg_mblen(state->prsbuf);
     243             :             }
     244             :         }
     245        4724 :         else if (statecode == WAITENDCMPLX)
     246             :         {
     247         648 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     248             :             {
     249         100 :                 statecode = WAITCHARCMPLX;  /* closing quote or first half of '' */
     250             :             }
     251         548 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     252             :             {
     253          56 :                 statecode = WAITNEXTCHAR;
     254          56 :                 oldstate = WAITENDCMPLX;
     255             :             }
     256         492 :             else if (*(state->prsbuf) == '\0')
     257           0 :                 PRSSYNTAXERROR; /* unterminated quoted word */
     258             :             else
     259             :             {
     260         492 :                 RESIZEPRSBUF;
     261         492 :                 COPYCHAR(curpos, state->prsbuf);
     262         492 :                 curpos += pg_mblen(state->prsbuf);
     263             :             }
     264             :         }
     265        4076 :         else if (statecode == WAITCHARCMPLX)
     266             :         {
     267         100 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     268             :             {
     269           0 :                 RESIZEPRSBUF;
     270           0 :                 COPYCHAR(curpos, state->prsbuf);    /* '' yields one literal quote */
     271           0 :                 curpos += pg_mblen(state->prsbuf);
     272           0 :                 statecode = WAITENDCMPLX;
     273             :             }
     274             :             else
     275             :             {
     276         100 :                 RESIZEPRSBUF;
     277         100 :                 *(curpos) = '\0';
     278         100 :                 if (curpos == state->word)
     279           0 :                     PRSSYNTAXERROR; /* empty quoted word */
     280         100 :                 if (state->oprisdelim)
     281             :                 {
     282             :                     /* state->prsbuf+=pg_mblen(state->prsbuf); */
     283          48 :                     RETURN_TOKEN;
     284             :                 }
     285             :                 else
     286          52 :                     statecode = WAITPOSINFO;
     287          52 :                 continue;       /* recheck current character */
     288             :             }
     289             :         }
     290        3976 :         else if (statecode == WAITPOSINFO)
     291             :         {
     292          52 :             if (t_iseq(state->prsbuf, ':'))
     293           0 :                 statecode = INPOSINFO;
     294             :             else
     295          52 :                 RETURN_TOKEN;   /* quoted word without position info */
     296             :         }
     297        3924 :         else if (statecode == INPOSINFO)
     298             :         {
     299        1344 :             if (t_isdigit(state->prsbuf))
     300             :             {
     301        1344 :                 if (posalen == 0)
     302             :                 {
     303         928 :                     posalen = 4;
     304         928 :                     pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
     305         928 :                     npos = 0;
     306             :                 }
     307         416 :                 else if (npos + 1 >= posalen)
     308             :                 {
     309          80 :                     posalen *= 2;
     310          80 :                     pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
     311             :                 }
     312        1344 :                 npos++;
     313        1344 :                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));   /* atoi reads the whole digit run */
     314             :                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
     315        1344 :                 if (WEP_GETPOS(pos[npos - 1]) == 0)
     316           0 :                     ereport(ERROR,
     317             :                             (errcode(ERRCODE_SYNTAX_ERROR),
     318             :                              errmsg("wrong position info in tsvector: \"%s\"",
     319             :                                     state->bufstart)));
     320        1344 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     321        1344 :                 statecode = WAITPOSDELIM;
     322             :             }
     323             :             else
     324           0 :                 PRSSYNTAXERROR;
     325             :         }
     326        2580 :         else if (statecode == WAITPOSDELIM)
     327             :         {
     328        2580 :             if (t_iseq(state->prsbuf, ','))
     329         416 :                 statecode = INPOSINFO;  /* another position follows */
     330        2164 :             else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
     331             :             {
     332         240 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     333           0 :                     PRSSYNTAXERROR; /* a nonzero weight was already set */
     334         240 :                 WEP_SETWEIGHT(pos[npos - 1], 3);
     335             :             }
     336        1924 :             else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
     337             :             {
     338         148 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     339           0 :                     PRSSYNTAXERROR;
     340         148 :                 WEP_SETWEIGHT(pos[npos - 1], 2);
     341             :             }
     342        1776 :             else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
     343             :             {
     344         184 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     345           0 :                     PRSSYNTAXERROR;
     346         184 :                 WEP_SETWEIGHT(pos[npos - 1], 1);
     347             :             }
     348        1592 :             else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
     349             :             {
     350          76 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     351           0 :                     PRSSYNTAXERROR;
     352          76 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     353             :             }
     354        2268 :             else if (t_isspace(state->prsbuf) ||
     355         752 :                      *(state->prsbuf) == '\0')
     356         928 :                 RETURN_TOKEN;   /* position list is complete */
     357         588 :             else if (!t_isdigit(state->prsbuf))
     358           0 :                 PRSSYNTAXERROR; /* remaining digits of a position are just skipped */
     359             :         }
     360             :         else                    /* internal error */
     361           0 :             elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
     362             :                  statecode);
     363             : 
     364             :         /* get next char */
     365      377012 :         state->prsbuf += pg_mblen(state->prsbuf);
     366             :     }
     367             : }

Generated by: LCOV version 1.13