LCOV - code coverage report
Current view: top level - src/backend/utils/adt - tsvector_parser.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 126 144 87.5 %
Date: 2024-07-18 16:11:39 Functions: 5 5 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tsvector_parser.c
       4             :  *    Parser for tsvector
       5             :  *
       6             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/utils/adt/tsvector_parser.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
      15             : #include "postgres.h"
      16             : 
      17             : #include "tsearch/ts_locale.h"
      18             : #include "tsearch/ts_utils.h"
      19             : 
      20             : 
      21             : /*
      22             :  * Private state of tsvector parser.  Note that tsquery also uses this code to
      23             :  * parse its input, hence the boolean flags.  The oprisdelim and is_tsquery
      24             :  * flags are both true or both false in current usage, but we keep them
      25             :  * separate for clarity.
      26             :  *
      27             :  * If oprisdelim is set, the following characters are treated as delimiters
      28             :  * (in addition to whitespace): ! | & ( )
      29             :  *
      30             :  * is_tsquery affects *only* the content of error messages.
      31             :  *
      32             :  * is_web can be true to further modify tsquery parsing.
      33             :  *
      34             :  * If escontext is an ErrorSaveContext node, then soft errors can be
      35             :  * captured there rather than being thrown.
      36             :  */
      37             : struct TSVectorParseStateData
      38             : {
      39             :     char       *prsbuf;         /* next input character to be scanned */
      40             :     char       *bufstart;       /* whole string (used only for errors) */
      41             :     char       *word;           /* buffer to hold the current word */
      42             :     int         len;            /* size in bytes allocated for 'word' */
      43             :     int         eml;            /* max bytes per character in the database encoding */
      44             :     bool        oprisdelim;     /* treat ! | & ( ) as delimiters? */
      45             :     bool        is_tsquery;     /* say "tsquery" not "tsvector" in errors? */
      46             :     bool        is_web;         /* we're in websearch_to_tsquery() */
      47             :     Node       *escontext;      /* for soft error reporting */
      48             : };
      49             : 
      50             : 
      51             : /*
      52             :  * Allocates and initializes a parser state object for the given input string.
      53             :  * 'flags' is a bitmask of P_TSV_* flags (see ts_utils.h); 'escontext' is the
      54             :  * error context object used for soft error reporting.  'input' is not copied.
      55             :  */
      56             : TSVectorParseState
      57        7638 : init_tsvector_parser(char *input, int flags, Node *escontext)
      58             : {
      59             :     TSVectorParseState state;
      60             : 
      61        7638 :     state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
      62        7638 :     state->prsbuf = input;
      63        7638 :     state->bufstart = input;
      64        7638 :     state->len = 32;            /* initial size of 'word'; RESIZEPRSBUF doubles it */
      65        7638 :     state->word = (char *) palloc(state->len);
      66        7638 :     state->eml = pg_database_encoding_max_length();
      67        7638 :     state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
      68        7638 :     state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
      69        7638 :     state->is_web = (flags & P_TSV_IS_WEB) != 0;
      70        7638 :     state->escontext = escontext;
      71             : 
      72        7638 :     return state;
      73             : }
      74             : 
      75             : /*
      76             :  * Repoints the parser at 'input', which is scanned instead of the previous input.
      77             :  *
      78             :  * Note that bufstart (the string reported in errors) is not changed.
      79             :  */
      80             : void
      81        8136 : reset_tsvector_parser(TSVectorParseState state, char *input)
      82             : {
      83        8136 :     state->prsbuf = input;      /* only the scan pointer moves; buffer and flags are kept */
      84        8136 : }
      85             : 
      86             : /*
      87             :  * Shuts down a tsvector parser, freeing its word buffer and the state itself.
      88             :  */
      89             : void
      90        7632 : close_tsvector_parser(TSVectorParseState state)
      91             : {
      92        7632 :     pfree(state->word);         /* tokens returned via *strval pointed into this buffer */
      93        7632 :     pfree(state);
      94        7632 : }
      95             : 
      96             : /* double the size of 'word' if needed to hold one more character; fixes up the caller's 'curpos' */
      97             : #define RESIZEPRSBUF \
      98             : do { \
      99             :     int clen = curpos - state->word; \
     100             :     if ( clen + state->eml >= state->len ) \
     101             :     { \
     102             :         state->len *= 2; \
     103             :         state->word = (char *) repalloc(state->word, state->len); \
     104             :         curpos = state->word + clen; \
     105             :     } \
     106             : } while (0)
     107             : 
     108             : /* Fills gettoken_tsvector's output parameters (freeing 'pos' if no pos_ptr was given), and returns true */
     109             : #define RETURN_TOKEN \
     110             : do { \
     111             :     if (pos_ptr != NULL) \
     112             :     { \
     113             :         *pos_ptr = pos; \
     114             :         *poslen = npos; \
     115             :     } \
     116             :     else if (pos != NULL) \
     117             :         pfree(pos); \
     118             :     \
     119             :     if (strval != NULL) \
     120             :         *strval = state->word; \
     121             :     if (lenval != NULL) \
     122             :         *lenval = curpos - state->word; \
     123             :     if (endptr != NULL) \
     124             :         *endptr = state->prsbuf; \
     125             :     return true; \
     126             : } while(0)
     127             : 
     128             : 
     129             : /* State codes used in gettoken_tsvector (the scanner starts in WAITWORD) */
     130             : #define WAITWORD        1       /* skipping whitespace, looking for start of a word */
     131             : #define WAITENDWORD     2       /* inside an unquoted word */
     132             : #define WAITNEXTCHAR    3       /* after a backslash: next character is copied as-is */
     133             : #define WAITENDCMPLX    4       /* inside a single-quoted word */
     134             : #define WAITPOSINFO     5       /* after a quoted word: ':' starts position info */
     135             : #define INPOSINFO       6       /* expecting a position number */
     136             : #define WAITPOSDELIM    7       /* after a position number: ',', weight letter, or end */
     137             : #define WAITCHARCMPLX   8       /* saw a quote inside a quoted word: doubled quote or end */
     138             : 
     139             : #define PRSSYNTAXERROR return prssyntaxerror(state)     /* returns from the calling function */
     140             : 
     141             : static bool
     142          18 : prssyntaxerror(TSVectorParseState state)
     143             : {
     144          18 :     errsave(state->escontext,
     145             :             (errcode(ERRCODE_SYNTAX_ERROR),
     146             :              state->is_tsquery ?       /* only selects the wording; whole input is quoted */
     147             :              errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
     148             :              errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
     149             :     /* If the error was captured softly we get here: return false as convenience for caller */
     150          12 :     return false;
     151             : }
     152             : 
     153             : 
     154             : /*
     155             :  * Get next token from string being parsed. Returns true if successful,
     156             :  * false if end of input string is reached or soft error.
     157             :  *
     158             :  * On success, these output parameters are filled in:
     159             :  *
     160             :  * *strval      pointer to token
     161             :  * *lenval      length of *strval
     162             :  * *pos_ptr     pointer to a palloc'd array of positions and weights
     163             :  *              associated with the token. If the caller is not interested
     164             :  *              in the information, NULL can be supplied. Otherwise
     165             :  *              the caller is responsible for pfreeing the array.
     166             :  * *poslen      number of elements in *pos_ptr
     167             :  * *endptr      scan resumption point
     168             :  *
     169             :  * Pass NULL for any unwanted output parameters.
     170             :  *
     171             :  * If state->escontext is an ErrorSaveContext, then caller must check
     172             :  * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
     173             :  * error or normal end-of-string.
     174             :  */
     175             : bool
     176      192736 : gettoken_tsvector(TSVectorParseState state,
     177             :                   char **strval, int *lenval,
     178             :                   WordEntryPos **pos_ptr, int *poslen,
     179             :                   char **endptr)
     180             : {
     181      192736 :     int         oldstate = 0;   /* state to resume after WAITNEXTCHAR */
     182      192736 :     char       *curpos = state->word;   /* write position in 'word' */
     183      192736 :     int         statecode = WAITWORD;
     184             : 
     185             :     /*
     186             :      * pos collects the comma-delimited list of positions (with weights)
     187             :      * that follows the actual token.
     188             :      */
     189      192736 :     WordEntryPos *pos = NULL;
     190      192736 :     int         npos = 0;       /* elements of pos used */
     191      192736 :     int         posalen = 0;    /* allocated size of pos */
     192             : 
     193             :     while (1)
     194             :     {
     195      785238 :         if (statecode == WAITWORD)
     196             :         {
     197      370116 :             if (*(state->prsbuf) == '\0')
     198        3778 :                 return false;   /* end of input: no more tokens */
     199      366338 :             else if (!state->is_web && t_iseq(state->prsbuf, '\''))
     200         162 :                 statecode = WAITENDCMPLX;       /* opening quote */
     201      366176 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     202             :             {
     203           6 :                 statecode = WAITNEXTCHAR;
     204           6 :                 oldstate = WAITENDWORD;
     205             :             }
     206      366170 :             else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     207      366170 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     208           0 :                 PRSSYNTAXERROR; /* delimiter where a word must start */
     209      366170 :             else if (!t_isspace(state->prsbuf))
     210             :             {
     211      188790 :                 COPYCHAR(curpos, state->prsbuf);        /* first character of the word */
     212      188790 :                 curpos += pg_mblen(state->prsbuf);
     213      188790 :                 statecode = WAITENDWORD;
     214             :             }
     215             :         }
     216      415122 :         else if (statecode == WAITNEXTCHAR)
     217             :         {
     218         162 :             if (*(state->prsbuf) == '\0')
     219           0 :                 ereturn(state->escontext, false,
     220             :                         (errcode(ERRCODE_SYNTAX_ERROR),
     221             :                          errmsg("there is no escaped character: \"%s\"",
     222             :                                 state->bufstart)));
     223             :             else
     224             :             {
     225         162 :                 RESIZEPRSBUF;
     226         162 :                 COPYCHAR(curpos, state->prsbuf);
     227         162 :                 curpos += pg_mblen(state->prsbuf);
     228             :                 Assert(oldstate != 0);
     229         162 :                 statecode = oldstate;
     230             :             }
     231             :         }
     232      414960 :         else if (statecode == WAITENDWORD)
     233             :         {
     234      383334 :             if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     235             :             {
     236          72 :                 statecode = WAITNEXTCHAR;
     237          72 :                 oldstate = WAITENDWORD;
     238             :             }
     239      383262 :             else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
     240      206880 :                      (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     241      205092 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     242             :             {
     243      178182 :                 RESIZEPRSBUF;
     244      178182 :                 if (curpos == state->word)
     245           0 :                     PRSSYNTAXERROR;
     246      178182 :                 *(curpos) = '\0';
     247      178182 :                 RETURN_TOKEN;   /* the delimiter itself is not consumed */
     248             :             }
     249      205080 :             else if (t_iseq(state->prsbuf, ':'))
     250             :             {
     251       10614 :                 if (curpos == state->word)
     252           0 :                     PRSSYNTAXERROR;
     253       10614 :                 *(curpos) = '\0';
     254       10614 :                 if (state->oprisdelim)
     255         696 :                     RETURN_TOKEN;       /* ':' is left for the caller to process */
     256             :                 else
     257        9918 :                     statecode = INPOSINFO;
     258             :             }
     259             :             else
     260             :             {
     261      194466 :                 RESIZEPRSBUF;
     262      194466 :                 COPYCHAR(curpos, state->prsbuf);
     263      194466 :                 curpos += pg_mblen(state->prsbuf);
     264             :             }
     265             :         }
     266       31626 :         else if (statecode == WAITENDCMPLX)
     267             :         {
     268         924 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     269             :             {
     270         162 :                 statecode = WAITCHARCMPLX;      /* closing quote, or first of a doubled quote */
     271             :             }
     272         762 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     273             :             {
     274          84 :                 statecode = WAITNEXTCHAR;
     275          84 :                 oldstate = WAITENDCMPLX;
     276             :             }
     277         678 :             else if (*(state->prsbuf) == '\0')
     278           0 :                 PRSSYNTAXERROR; /* input ended inside a quoted word */
     279             :             else
     280             :             {
     281         678 :                 RESIZEPRSBUF;
     282         678 :                 COPYCHAR(curpos, state->prsbuf);
     283         678 :                 curpos += pg_mblen(state->prsbuf);
     284             :             }
     285             :         }
     286       30702 :         else if (statecode == WAITCHARCMPLX)
     287             :         {
     288         162 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     289             :             {
     290           0 :                 RESIZEPRSBUF;   /* doubled quote: store one literal quote */
     291           0 :                 COPYCHAR(curpos, state->prsbuf);
     292           0 :                 curpos += pg_mblen(state->prsbuf);
     293           0 :                 statecode = WAITENDCMPLX;
     294             :             }
     295             :             else
     296             :             {
     297         162 :                 RESIZEPRSBUF;
     298         162 :                 *(curpos) = '\0';
     299         162 :                 if (curpos == state->word)
     300          18 :                     PRSSYNTAXERROR;     /* empty quoted word */
     301         144 :                 if (state->oprisdelim)
     302             :                 {
     303             :                     /* state->prsbuf+=pg_mblen(state->prsbuf); */
     304          66 :                     RETURN_TOKEN;
     305             :                 }
     306             :                 else
     307          78 :                     statecode = WAITPOSINFO;
     308          78 :                 continue;       /* recheck current character */
     309             :             }
     310             :         }
     311       30540 :         else if (statecode == WAITPOSINFO)
     312             :         {
     313          78 :             if (t_iseq(state->prsbuf, ':'))
     314           0 :                 statecode = INPOSINFO;
     315             :             else
     316          78 :                 RETURN_TOKEN;
     317             :         }
     318       30462 :         else if (statecode == INPOSINFO)
     319             :         {
     320       10524 :             if (t_isdigit(state->prsbuf))
     321             :             {
     322       10524 :                 if (posalen == 0)
     323             :                 {
     324        9918 :                     posalen = 4;
     325        9918 :                     pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
     326        9918 :                     npos = 0;
     327             :                 }
     328         606 :                 else if (npos + 1 >= posalen)
     329             :                 {
     330         114 :                     posalen *= 2;
     331         114 :                     pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
     332             :                 }
     333       10524 :                 npos++;
     334       10524 :                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));       /* NOTE(review): atoi() does no overflow checking */
     335             :                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
     336       10524 :                 if (WEP_GETPOS(pos[npos - 1]) == 0)
     337           0 :                     ereturn(state->escontext, false,
     338             :                             (errcode(ERRCODE_SYNTAX_ERROR),
     339             :                              errmsg("wrong position info in tsvector: \"%s\"",
     340             :                                     state->bufstart)));
     341       10524 :                 WEP_SETWEIGHT(pos[npos - 1], 0);        /* weight 0 until a weight letter is seen */
     342       10524 :                 statecode = WAITPOSDELIM;
     343             :             }
     344             :             else
     345           0 :                 PRSSYNTAXERROR;
     346             :         }
     347       19938 :         else if (statecode == WAITPOSDELIM)
     348             :         {
     349       19938 :             if (t_iseq(state->prsbuf, ','))
     350         606 :                 statecode = INPOSINFO;
     351       19332 :             else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
     352             :             {
     353         420 :                 if (WEP_GETWEIGHT(pos[npos - 1]))       /* a nonzero weight was already given */
     354           0 :                     PRSSYNTAXERROR;
     355         420 :                 WEP_SETWEIGHT(pos[npos - 1], 3);
     356             :             }
     357       18912 :             else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
     358             :             {
     359         216 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     360           0 :                     PRSSYNTAXERROR;
     361         216 :                 WEP_SETWEIGHT(pos[npos - 1], 2);
     362             :             }
     363       18696 :             else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
     364             :             {
     365         276 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     366           0 :                     PRSSYNTAXERROR;
     367         276 :                 WEP_SETWEIGHT(pos[npos - 1], 1);
     368             :             }
     369       18420 :             else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
     370             :             {
     371         132 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     372           0 :                     PRSSYNTAXERROR;
     373         132 :                 WEP_SETWEIGHT(pos[npos - 1], 0);        /* NOTE(review): same as "no weight yet", so a later weight letter is not rejected */
     374             :             }
     375       18288 :             else if (t_isspace(state->prsbuf) ||
     376        8802 :                      *(state->prsbuf) == '\0')
     377        9918 :                 RETURN_TOKEN;
     378        8370 :             else if (!t_isdigit(state->prsbuf)) /* digits are skipped: atoi() above already read the whole number */
     379           0 :                 PRSSYNTAXERROR;
     380             :         }
     381             :         else                    /* internal error */
     382           0 :             elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
     383             :                  statecode);
     384             : 
     385             :         /* get next char */
     386      592424 :         state->prsbuf += pg_mblen(state->prsbuf);
     387             :     }
     388             : }

Generated by: LCOV version 1.14