LCOV - code coverage report
Current view: top level - src/backend/utils/adt - tsvector_parser.c (source / functions) Hit Total Coverage
Test: PostgreSQL 15beta1 Lines: 124 142 87.3 %
Date: 2022-05-18 02:09:37 Functions: 5 5 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tsvector_parser.c
       4             :  *    Parser for tsvector
       5             :  *
       6             :  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/utils/adt/tsvector_parser.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
      15             : #include "postgres.h"
      16             : 
      17             : #include "tsearch/ts_locale.h"
      18             : #include "tsearch/ts_utils.h"
      19             : 
      20             : 
      21             : /*
      22             :  * Private state of tsvector parser.  Note that tsquery also uses this code to
      23             :  * parse its input, hence the boolean flags.  The two flags are both true or
      24             :  * both false in current usage, but we keep them separate for clarity.
      25             :  * is_tsquery affects *only* the content of error messages.
      26             :  */
      27             : struct TSVectorParseStateData
      28             : {
      29             :     char       *prsbuf;         /* next input character to scan */
      30             :     char       *bufstart;       /* whole string (used only for errors) */
      31             :     char       *word;           /* buffer to hold the current word; grown on demand */
      32             :     int         len;            /* size in bytes allocated for 'word' */
      33             :     int         eml;            /* max bytes per character (from pg_database_encoding_max_length()) */
      34             :     bool        oprisdelim;     /* treat characters matched by ISOPERATOR() as delimiters? */
      35             :     bool        is_tsquery;     /* say "tsquery" not "tsvector" in errors? */
      36             :     bool        is_web;         /* we're in websearch_to_tsquery(): ' and \ are not special, '"' delimits words */
      37             : };
      38             : 
      39             : 
      40             : /*
      41             :  * Allocates and initializes a parser for 'input'; the pointer is stored, not
      42             :  * copied.  'flags' is a bitmask: P_TSV_OPR_IS_DELIM makes characters matched by ISOPERATOR() act as delimiters in
      43             :  * addition to whitespace; P_TSV_IS_TSQUERY and P_TSV_IS_WEB set is_tsquery and is_web.  Release with close_tsvector_parser().
      44             :  */
      45             : TSVectorParseState
      46        7536 : init_tsvector_parser(char *input, int flags)
      47             : {
      48             :     TSVectorParseState state;
      49             : 
      50        7536 :     state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
      51        7536 :     state->prsbuf = input;
      52        7536 :     state->bufstart = input;	/* kept only for quoting the input in error messages */
      53        7536 :     state->len = 32;			/* initial word buffer size; RESIZEPRSBUF doubles it as needed */
      54        7536 :     state->word = (char *) palloc(state->len);
      55        7536 :     state->eml = pg_database_encoding_max_length();
      56        7536 :     state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
      57        7536 :     state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
      58        7536 :     state->is_web = (flags & P_TSV_IS_WEB) != 0;
      59             : 
      60        7536 :     return state;
      61             : }
      62             : 
      63             : /*
      64             :  * Points the parser at 'input' instead of the previous scan position; the word buffer and flags are kept.
      65             :  */
      66             : void
      67        8004 : reset_tsvector_parser(TSVectorParseState state, char *input)
      68             : {
      69        8004 :     state->prsbuf = input;	/* NOTE(review): bufstart is not updated, so error messages still quote the string given to init_tsvector_parser() */
      70        8004 : }
      71             : 
      72             : /*
      73             :  * Shuts down a tsvector parser, freeing its word buffer and the state itself.
      74             :  */
      75             : void
      76        7530 : close_tsvector_parser(TSVectorParseState state)
      77             : {
      78        7530 :     pfree(state->word);		/* also invalidates any token pointer previously returned via *strval */
      79        7530 :     pfree(state);			/* the input string is not freed here */
      80        7530 : }
      81             : 
      82             : /* Double the size of 'word' if it might not hold one more character (up to eml bytes) plus a terminator; uses 'state' and 'curpos' from the enclosing scope */
      83             : #define RESIZEPRSBUF \
      84             : do { \
      85             :     int clen = curpos - state->word; /* bytes already stored in 'word' */ \
      86             :     if ( clen + state->eml >= state->len ) \
      87             :     { \
      88             :         state->len *= 2; \
      89             :         state->word = (char *) repalloc(state->word, state->len); \
      90             :         curpos = state->word + clen; /* re-point curpos into the reallocated buffer */ \
      91             :     } \
      92             : } while (0)
      93             : 
      94             : /* Fills gettoken_tsvector's output parameters, and returns true; uses that function's locals (state, curpos, pos, npos) and parameters */
      95             : #define RETURN_TOKEN \
      96             : do { \
      97             :     if (pos_ptr != NULL) \
      98             :     { \
      99             :         *pos_ptr = pos; /* hands the position array (possibly NULL) to the caller */ \
     100             :         *poslen = npos; /* poslen is dereferenced whenever pos_ptr is non-NULL */ \
     101             :     } \
     102             :     else if (pos != NULL) \
     103             :         pfree(pos); /* caller did not ask for positions: discard them */ \
     104             :     \
     105             :     if (strval != NULL) \
     106             :         *strval = state->word; /* points into the parser's own buffer */ \
     107             :     if (lenval != NULL) \
     108             :         *lenval = curpos - state->word; /* byte length of the token */ \
     109             :     if (endptr != NULL) \
     110             :         *endptr = state->prsbuf; /* current scan position */ \
     111             :     return true; \
     112             : } while(0)
     113             : 
     114             : 
     115             : /* State codes used in gettoken_tsvector */
     116             : #define WAITWORD        1	/* skipping whitespace, waiting for the first character of a word */
     117             : #define WAITENDWORD     2	/* inside an unquoted word */
     118             : #define WAITNEXTCHAR    3	/* after a backslash: copy the next character literally, then return to 'oldstate' */
     119             : #define WAITENDCMPLX    4	/* inside a single-quoted word */
     120             : #define WAITPOSINFO     5	/* quoted word finished: a ':' starts position info, anything else ends the token */
     121             : #define INPOSINFO       6	/* expecting a position number */
     122             : #define WAITPOSDELIM    7	/* after a position number: ',', a weight letter, more digits, or end of token */
     123             : #define WAITCHARCMPLX   8	/* saw a quote inside a quoted word: a second quote is a literal quote, otherwise the word ended */
     124             : 
     125             : #define PRSSYNTAXERROR prssyntaxerror(state)	/* requires a variable named 'state' in scope */
     126             : /* Raise a syntax ERROR that quotes the whole input (state->bufstart), naming tsquery or tsvector according to state->is_tsquery. */
     127             : static void
     128           6 : prssyntaxerror(TSVectorParseState state)
     129             : {
     130           6 :     ereport(ERROR,
     131             :             (errcode(ERRCODE_SYNTAX_ERROR),
     132             :              state->is_tsquery ?
     133             :              errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
     134             :              errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
     135             : }
     136             : 
     137             : 
     138             : /*
     139             :  * Get next token from string being parsed. Returns true if successful,
     140             :  * false if end of input string is reached.  On success, these output
     141             :  * parameters are filled in:
     142             :  *
     143             :  * *strval      pointer to token (NUL-terminated, in the parser's own buffer, which the next call reuses)
     144             :  * *lenval      length of *strval in bytes, not counting the terminator
     145             :  * *pos_ptr     pointer to a palloc'd array of positions and weights
     146             :  *              associated with the token. If the caller is not interested
     147             :  *              in the information, NULL can be supplied. Otherwise
     148             :  *              the caller is responsible for pfreeing the array.
     149             :  * *poslen      number of elements in *pos_ptr (must be non-NULL whenever pos_ptr is)
     150             :  * *endptr      scan resumption point
     151             :  *
     152             :  * Pass NULL for unwanted output parameters.
     153             :  */
     154             : bool
     155      192580 : gettoken_tsvector(TSVectorParseState state,
     156             :                   char **strval, int *lenval,
     157             :                   WordEntryPos **pos_ptr, int *poslen,
     158             :                   char **endptr)
     159             : {
     160      192580 :     int         oldstate = 0;	/* state to resume after an escaped character; 0 = not set */
     161      192580 :     char       *curpos = state->word;	/* write position in the word buffer */
     162      192580 :     int         statecode = WAITWORD;
     163             : 
     164             :     /*
     165             :      * pos is for collecting the comma delimited list of positions followed by
     166             :      * the actual token.
     167             :      */
     168      192580 :     WordEntryPos *pos = NULL;
     169      192580 :     int         npos = 0;       /* elements of pos used */
     170      192580 :     int         posalen = 0;    /* allocated size of pos */
     171             : 
     172             :     while (1)
     173             :     {
     174      784464 :         if (statecode == WAITWORD)
     175             :         {
     176      369960 :             if (*(state->prsbuf) == '\0')
     177        3778 :                 return false;	/* end of input before any token started */
     178      366182 :             else if (!state->is_web && t_iseq(state->prsbuf, '\''))
     179         150 :                 statecode = WAITENDCMPLX;	/* opening quote: starts a quoted word, the quote itself is not copied */
     180      366032 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     181             :             {
     182           6 :                 statecode = WAITNEXTCHAR;
     183           6 :                 oldstate = WAITENDWORD;
     184             :             }
     185      366026 :             else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     186      366026 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     187           0 :                 PRSSYNTAXERROR;	/* delimiter where a word was expected */
     188      366026 :             else if (!t_isspace(state->prsbuf))
     189             :             {
     190      188646 :                 COPYCHAR(curpos, state->prsbuf);	/* first character of the word; NOTE(review): no RESIZEPRSBUF here, relies on the buffer (>= 32 bytes) exceeding eml - confirm */
     191      188646 :                 curpos += pg_mblen(state->prsbuf);
     192      188646 :                 statecode = WAITENDWORD;
     193             :             }
     194             :         }	/* whitespace falls through and is skipped by the advance at the bottom of the loop */
     195      414504 :         else if (statecode == WAITNEXTCHAR)
     196             :         {
     197         162 :             if (*(state->prsbuf) == '\0')
     198           0 :                 ereport(ERROR,	/* backslash was the last character of the input */
     199             :                         (errcode(ERRCODE_SYNTAX_ERROR),
     200             :                          errmsg("there is no escaped character: \"%s\"",
     201             :                                 state->bufstart)));
     202             :             else
     203             :             {
     204         162 :                 RESIZEPRSBUF;
     205         162 :                 COPYCHAR(curpos, state->prsbuf);	/* escaped character is taken literally */
     206         162 :                 curpos += pg_mblen(state->prsbuf);
     207             :                 Assert(oldstate != 0);
     208         162 :                 statecode = oldstate;
     209             :             }
     210             :         }
     211      414342 :         else if (statecode == WAITENDWORD)
     212             :         {
     213      382740 :             if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     214             :             {
     215          72 :                 statecode = WAITNEXTCHAR;
     216          72 :                 oldstate = WAITENDWORD;
     217             :             }
     218      382668 :             else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
     219      206406 :                      (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     220      204642 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     221             :             {
     222      178038 :                 RESIZEPRSBUF;
     223      178038 :                 if (curpos == state->word)
     224           0 :                     PRSSYNTAXERROR;	/* empty word */
     225      178038 :                 *(curpos) = '\0';
     226      178038 :                 RETURN_TOKEN;	/* the delimiter is not consumed: prsbuf still points at it */
     227             :             }
     228      204630 :             else if (t_iseq(state->prsbuf, ':'))
     229             :             {
     230       10614 :                 if (curpos == state->word)
     231           0 :                     PRSSYNTAXERROR;
     232       10614 :                 *(curpos) = '\0';
     233       10614 :                 if (state->oprisdelim)
     234         696 :                     RETURN_TOKEN;	/* position info is not parsed here; the ':' is left unconsumed for the caller */
     235             :                 else
     236        9918 :                     statecode = INPOSINFO;
     237             :             }
     238             :             else
     239             :             {
     240      194016 :                 RESIZEPRSBUF;
     241      194016 :                 COPYCHAR(curpos, state->prsbuf);
     242      194016 :                 curpos += pg_mblen(state->prsbuf);
     243             :             }
     244             :         }
     245       31602 :         else if (statecode == WAITENDCMPLX)
     246             :         {
     247         912 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     248             :             {
     249         150 :                 statecode = WAITCHARCMPLX;	/* closing quote or first half of a doubled quote; decided on the next character */
     250             :             }
     251         762 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     252             :             {
     253          84 :                 statecode = WAITNEXTCHAR;
     254          84 :                 oldstate = WAITENDCMPLX;
     255             :             }
     256         678 :             else if (*(state->prsbuf) == '\0')
     257           0 :                 PRSSYNTAXERROR;	/* input ended inside a quoted word */
     258             :             else
     259             :             {
     260         678 :                 RESIZEPRSBUF;
     261         678 :                 COPYCHAR(curpos, state->prsbuf);
     262         678 :                 curpos += pg_mblen(state->prsbuf);
     263             :             }
     264             :         }
     265       30690 :         else if (statecode == WAITCHARCMPLX)
     266             :         {
     267         150 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     268             :             {
     269           0 :                 RESIZEPRSBUF;
     270           0 :                 COPYCHAR(curpos, state->prsbuf);	/* doubled quote: store one literal quote and stay in the quoted word */
     271           0 :                 curpos += pg_mblen(state->prsbuf);
     272           0 :                 statecode = WAITENDCMPLX;
     273             :             }
     274             :             else
     275             :             {
     276         150 :                 RESIZEPRSBUF;
     277         150 :                 *(curpos) = '\0';
     278         150 :                 if (curpos == state->word)
     279           6 :                     PRSSYNTAXERROR;	/* empty quoted word */
     280         144 :                 if (state->oprisdelim)
     281             :                 {
     282             :                     /* state->prsbuf+=pg_mblen(state->prsbuf); */
     283          66 :                     RETURN_TOKEN;
     284             :                 }
     285             :                 else
     286          78 :                     statecode = WAITPOSINFO;
     287          78 :                 continue;       /* recheck current character */
     288             :             }
     289             :         }
     290       30540 :         else if (statecode == WAITPOSINFO)
     291             :         {
     292          78 :             if (t_iseq(state->prsbuf, ':'))
     293           0 :                 statecode = INPOSINFO;
     294             :             else
     295          78 :                 RETURN_TOKEN;	/* quoted word without position info; current character is left unconsumed */
     296             :         }
     297       30462 :         else if (statecode == INPOSINFO)
     298             :         {
     299       10524 :             if (t_isdigit(state->prsbuf))
     300             :             {
     301       10524 :                 if (posalen == 0)
     302             :                 {
     303        9918 :                     posalen = 4;	/* first position: allocate a small array */
     304        9918 :                     pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
     305        9918 :                     npos = 0;
     306             :                 }
     307         606 :                 else if (npos + 1 >= posalen)
     308             :                 {
     309         114 :                     posalen *= 2;	/* grow by doubling */
     310         114 :                     pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
     311             :                 }
     312       10524 :                 npos++;
     313       10524 :                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));	/* whole number is read at its first digit; NOTE(review): atoi() reports no errors, LIMITPOS presumably clamps - confirm */
     314             :                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
     315       10524 :                 if (WEP_GETPOS(pos[npos - 1]) == 0)
     316           0 :                     ereport(ERROR,
     317             :                             (errcode(ERRCODE_SYNTAX_ERROR),
     318             :                              errmsg("wrong position info in tsvector: \"%s\"",
     319             :                                     state->bufstart)));
     320       10524 :                 WEP_SETWEIGHT(pos[npos - 1], 0);	/* weight 0 until a weight letter follows */
     321       10524 :                 statecode = WAITPOSDELIM;
     322             :             }
     323             :             else
     324           0 :                 PRSSYNTAXERROR;
     325             :         }
     326       19938 :         else if (statecode == WAITPOSDELIM)
     327             :         {
     328       19938 :             if (t_iseq(state->prsbuf, ','))
     329         606 :                 statecode = INPOSINFO;	/* another position follows */
     330       19332 :             else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
     331             :             {
     332         420 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     333           0 :                     PRSSYNTAXERROR;	/* a nonzero weight was already given for this position */
     334         420 :                 WEP_SETWEIGHT(pos[npos - 1], 3);
     335             :             }
     336       18912 :             else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
     337             :             {
     338         216 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     339           0 :                     PRSSYNTAXERROR;
     340         216 :                 WEP_SETWEIGHT(pos[npos - 1], 2);
     341             :             }
     342       18696 :             else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
     343             :             {
     344         276 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     345           0 :                     PRSSYNTAXERROR;
     346         276 :                 WEP_SETWEIGHT(pos[npos - 1], 1);
     347             :             }
     348       18420 :             else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
     349             :             {
     350         132 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     351           0 :                     PRSSYNTAXERROR;
     352         132 :                 WEP_SETWEIGHT(pos[npos - 1], 0);	/* NOTE(review): D stores 0, so the nonzero check above cannot detect a weight letter following a D */
     353             :             }
     354       18288 :             else if (t_isspace(state->prsbuf) ||
     355        8802 :                      *(state->prsbuf) == '\0')
     356        9918 :                 RETURN_TOKEN;	/* end of the position list */
     357        8370 :             else if (!t_isdigit(state->prsbuf))
     358           0 :                 PRSSYNTAXERROR;	/* remaining digits of the number already parsed in INPOSINFO are skipped silently */
     359             :         }
     360             :         else                    /* internal error */
     361           0 :             elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
     362             :                  statecode);
     363             : 
     364             :         /* get next char */
     365      591806 :         state->prsbuf += pg_mblen(state->prsbuf);	/* advance by one (possibly multibyte) character */
     366             :     }
     367             : }

Generated by: LCOV version 1.14