LCOV - code coverage report
Current view: top level - src/backend/utils/adt - tsvector_parser.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 122 139 87.8 %
Date: 2026-02-09 07:17:33 Functions: 5 5 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tsvector_parser.c
       4             :  *    Parser for tsvector
       5             :  *
       6             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/utils/adt/tsvector_parser.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
      15             : #include "postgres.h"
      16             : 
      17             : #include "tsearch/ts_locale.h"
      18             : #include "tsearch/ts_utils.h"
      19             : 
      20             : 
      21             : /*
      22             :  * Private state of tsvector parser.  Note that tsquery also uses this code to
      23             :  * parse its input, hence the boolean flags.  The oprisdelim and is_tsquery
      24             :  * flags are both true or both false in current usage, but we keep them
      25             :  * separate for clarity.
      26             :  *
      27             :  * If oprisdelim is set, the following characters are treated as delimiters
      28             :  * (in addition to whitespace): ! | & ( )
      29             :  *
      30             :  * is_tsquery affects *only* the content of error messages.
      31             :  *
      32             :  * is_web can be true to further modify tsquery parsing.
      33             :  *
      34             :  * If escontext is an ErrorSaveContext node, then soft errors can be
      35             :  * captured there rather than being thrown.
      36             :  */
      37             : struct TSVectorParseStateData
      38             : {
      39             :     char       *prsbuf;         /* next input character */
      40             :     char       *bufstart;       /* whole string (used only for errors) */
      41             :     char       *word;           /* buffer to hold the current word */
      42             :     int         len;            /* size in bytes allocated for 'word' */
      43             :     int         eml;            /* max bytes per character */
      44             :     bool        oprisdelim;     /* treat ! | & ( ) as delimiters? */
      45             :     bool        is_tsquery;     /* say "tsquery" not "tsvector" in errors? */
      46             :     bool        is_web;         /* we're in websearch_to_tsquery() */
      47             :     Node       *escontext;      /* for soft error reporting */
      48             : };
      49             : 
      50             : 
      51             : /*
      52             :  * Initializes a parser state object for the given input string.
      53             :  * A bitmask of flags (see ts_utils.h) and an error context object
      54             :  * can be provided as well.
      55             :  */
      56             : TSVectorParseState
      57        7638 : init_tsvector_parser(char *input, int flags, Node *escontext)
      58             : {
      59             :     TSVectorParseState state;
      60             : 
      61        7638 :     state = palloc_object(struct TSVectorParseStateData);
      62        7638 :     state->prsbuf = input;
      63        7638 :     state->bufstart = input;
      64        7638 :     state->len = 32;
      65        7638 :     state->word = (char *) palloc(state->len);
      66        7638 :     state->eml = pg_database_encoding_max_length();
      67        7638 :     state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
      68        7638 :     state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
      69        7638 :     state->is_web = (flags & P_TSV_IS_WEB) != 0;
      70        7638 :     state->escontext = escontext;
      71             : 
      72        7638 :     return state;
      73             : }
      74             : 
      75             : /*
      76             :  * Reinitializes parser to parse 'input', instead of previous input.
      77             :  *
      78             :  * Note that bufstart (the string reported in errors) is not changed.
      79             :  */
      80             : void
      81        8136 : reset_tsvector_parser(TSVectorParseState state, char *input)
      82             : {
      83        8136 :     state->prsbuf = input;
      84        8136 : }
      85             : 
      86             : /*
      87             :  * Shuts down a tsvector parser.
      88             :  */
      89             : void
      90        7632 : close_tsvector_parser(TSVectorParseState state)
      91             : {
      92        7632 :     pfree(state->word);
      93        7632 :     pfree(state);
      94        7632 : }
      95             : 
      96             : /* increase the size of 'word' if needed to hold one more character */
      97             : #define RESIZEPRSBUF \
      98             : do { \
      99             :     int clen = curpos - state->word; \
     100             :     if ( clen + state->eml >= state->len ) \
     101             :     { \
     102             :         state->len *= 2; \
     103             :         state->word = (char *) repalloc(state->word, state->len); \
     104             :         curpos = state->word + clen; \
     105             :     } \
     106             : } while (0)
     107             : 
     108             : /* Fills gettoken_tsvector's output parameters, and returns true */
     109             : #define RETURN_TOKEN \
     110             : do { \
     111             :     if (pos_ptr != NULL) \
     112             :     { \
     113             :         *pos_ptr = pos; \
     114             :         *poslen = npos; \
     115             :     } \
     116             :     else if (pos != NULL) \
     117             :         pfree(pos); \
     118             :     \
     119             :     if (strval != NULL) \
     120             :         *strval = state->word; \
     121             :     if (lenval != NULL) \
     122             :         *lenval = curpos - state->word; \
     123             :     if (endptr != NULL) \
     124             :         *endptr = state->prsbuf; \
     125             :     return true; \
     126             : } while(0)
     127             : 
     128             : 
     129             : /* State codes used in gettoken_tsvector */
     130             : #define WAITWORD        1
     131             : #define WAITENDWORD     2
     132             : #define WAITNEXTCHAR    3
     133             : #define WAITENDCMPLX    4
     134             : #define WAITPOSINFO     5
     135             : #define INPOSINFO       6
     136             : #define WAITPOSDELIM    7
     137             : #define WAITCHARCMPLX   8
     138             : 
     139             : #define PRSSYNTAXERROR return prssyntaxerror(state)
     140             : 
     141             : static bool
     142          18 : prssyntaxerror(TSVectorParseState state)
     143             : {
     144          18 :     errsave(state->escontext,
     145             :             (errcode(ERRCODE_SYNTAX_ERROR),
     146             :              state->is_tsquery ?
     147             :              errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
     148             :              errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
     149             :     /* In soft error situation, return false as convenience for caller */
     150          12 :     return false;
     151             : }
     152             : 
     153             : 
     154             : /*
     155             :  * Get next token from string being parsed. Returns true if successful,
     156             :  * false if the end of the input string is reached or a soft error occurs.
     157             :  *
     158             :  * On success, these output parameters are filled in:
     159             :  *
     160             :  * *strval      pointer to token
     161             :  * *lenval      length of *strval
     162             :  * *pos_ptr     pointer to a palloc'd array of positions and weights
     163             :  *              associated with the token. If the caller is not interested
     164             :  *              in the information, NULL can be supplied. Otherwise
     165             :  *              the caller is responsible for pfreeing the array.
     166             :  * *poslen      number of elements in *pos_ptr
     167             :  * *endptr      scan resumption point
     168             :  *
     169             :  * Pass NULL for any unwanted output parameters.
     170             :  *
     171             :  * If state->escontext is an ErrorSaveContext, then caller must check
     172             :  * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
     173             :  * error or normal end-of-string.
     174             :  */
     175             : bool
     176      192736 : gettoken_tsvector(TSVectorParseState state,
     177             :                   char **strval, int *lenval,
     178             :                   WordEntryPos **pos_ptr, int *poslen,
     179             :                   char **endptr)
     180             : {
     181      192736 :     int         oldstate = 0;
     182      192736 :     char       *curpos = state->word;
     183      192736 :     int         statecode = WAITWORD;
     184             : 
     185             :     /*
     186             :      * pos collects the comma-delimited list of positions (with optional
     187             :      * weights) that follows the actual token.
     188             :      */
     189      192736 :     WordEntryPos *pos = NULL;
     190      192736 :     int         npos = 0;       /* elements of pos used */
     191      192736 :     int         posalen = 0;    /* allocated size of pos */
     192             : 
     193             :     while (1)
     194             :     {
     195      785238 :         if (statecode == WAITWORD)
     196             :         {
     197      370116 :             if (*(state->prsbuf) == '\0')
     198        3778 :                 return false;
     199      366338 :             else if (!state->is_web && t_iseq(state->prsbuf, '\''))
     200         162 :                 statecode = WAITENDCMPLX;
     201      366176 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     202             :             {
     203           6 :                 statecode = WAITNEXTCHAR;
     204           6 :                 oldstate = WAITENDWORD;
     205             :             }
     206      366170 :             else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     207      366170 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     208           0 :                 PRSSYNTAXERROR;
     209      366170 :             else if (!isspace((unsigned char) *state->prsbuf))
     210             :             {
     211      188790 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     212      188790 :                 statecode = WAITENDWORD;
     213             :             }
     214             :         }
     215      415122 :         else if (statecode == WAITNEXTCHAR)
     216             :         {
     217         162 :             if (*(state->prsbuf) == '\0')
     218           0 :                 ereturn(state->escontext, false,
     219             :                         (errcode(ERRCODE_SYNTAX_ERROR),
     220             :                          errmsg("there is no escaped character: \"%s\"",
     221             :                                 state->bufstart)));
     222             :             else
     223             :             {
     224         162 :                 RESIZEPRSBUF;
     225         162 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     226             :                 Assert(oldstate != 0);
     227         162 :                 statecode = oldstate;
     228             :             }
     229             :         }
     230      414960 :         else if (statecode == WAITENDWORD)
     231             :         {
     232      383334 :             if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     233             :             {
     234          72 :                 statecode = WAITNEXTCHAR;
     235          72 :                 oldstate = WAITENDWORD;
     236             :             }
     237      383262 :             else if (isspace((unsigned char) *state->prsbuf) || *(state->prsbuf) == '\0' ||
     238      206880 :                      (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     239      205092 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     240             :             {
     241      178182 :                 RESIZEPRSBUF;
     242      178182 :                 if (curpos == state->word)
     243           0 :                     PRSSYNTAXERROR;
     244      178182 :                 *(curpos) = '\0';
     245      178182 :                 RETURN_TOKEN;
     246             :             }
     247      205080 :             else if (t_iseq(state->prsbuf, ':'))
     248             :             {
     249       10614 :                 if (curpos == state->word)
     250           0 :                     PRSSYNTAXERROR;
     251       10614 :                 *(curpos) = '\0';
     252       10614 :                 if (state->oprisdelim)
     253         696 :                     RETURN_TOKEN;
     254             :                 else
     255        9918 :                     statecode = INPOSINFO;
     256             :             }
     257             :             else
     258             :             {
     259      194466 :                 RESIZEPRSBUF;
     260      194466 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     261             :             }
     262             :         }
     263       31626 :         else if (statecode == WAITENDCMPLX)
     264             :         {
     265         924 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     266             :             {
     267         162 :                 statecode = WAITCHARCMPLX;
     268             :             }
     269         762 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     270             :             {
     271          84 :                 statecode = WAITNEXTCHAR;
     272          84 :                 oldstate = WAITENDCMPLX;
     273             :             }
     274         678 :             else if (*(state->prsbuf) == '\0')
     275           0 :                 PRSSYNTAXERROR;
     276             :             else
     277             :             {
     278         678 :                 RESIZEPRSBUF;
     279         678 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     280             :             }
     281             :         }
     282       30702 :         else if (statecode == WAITCHARCMPLX)
     283             :         {
     284         162 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     285             :             {
     286           0 :                 RESIZEPRSBUF;
     287           0 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     288           0 :                 statecode = WAITENDCMPLX;
     289             :             }
     290             :             else
     291             :             {
     292         162 :                 RESIZEPRSBUF;
     293         162 :                 *(curpos) = '\0';
     294         162 :                 if (curpos == state->word)
     295          18 :                     PRSSYNTAXERROR;
     296         144 :                 if (state->oprisdelim)
     297             :                 {
     298             :                     /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
     299          66 :                     RETURN_TOKEN;
     300             :                 }
     301             :                 else
     302          78 :                     statecode = WAITPOSINFO;
     303          78 :                 continue;       /* recheck current character */
     304             :             }
     305             :         }
     306       30540 :         else if (statecode == WAITPOSINFO)
     307             :         {
     308          78 :             if (t_iseq(state->prsbuf, ':'))
     309           0 :                 statecode = INPOSINFO;
     310             :             else
     311          78 :                 RETURN_TOKEN;
     312             :         }
     313       30462 :         else if (statecode == INPOSINFO)
     314             :         {
     315       10524 :             if (isdigit((unsigned char) *state->prsbuf))
     316             :             {
     317       10524 :                 if (posalen == 0)
     318             :                 {
     319        9918 :                     posalen = 4;
     320        9918 :                     pos = palloc_array(WordEntryPos, posalen);
     321        9918 :                     npos = 0;
     322             :                 }
     323         606 :                 else if (npos + 1 >= posalen)
     324             :                 {
     325         114 :                     posalen *= 2;
     326         114 :                     pos = repalloc_array(pos, WordEntryPos, posalen);
     327             :                 }
     328       10524 :                 npos++;
     329       10524 :                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
     330             :                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
     331       10524 :                 if (WEP_GETPOS(pos[npos - 1]) == 0)
     332           0 :                     ereturn(state->escontext, false,
     333             :                             (errcode(ERRCODE_SYNTAX_ERROR),
     334             :                              errmsg("wrong position info in tsvector: \"%s\"",
     335             :                                     state->bufstart)));
     336       10524 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     337       10524 :                 statecode = WAITPOSDELIM;
     338             :             }
     339             :             else
     340           0 :                 PRSSYNTAXERROR;
     341             :         }
     342       19938 :         else if (statecode == WAITPOSDELIM)
     343             :         {
     344       19938 :             if (t_iseq(state->prsbuf, ','))
     345         606 :                 statecode = INPOSINFO;
     346       19332 :             else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
     347             :             {
     348         420 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     349           0 :                     PRSSYNTAXERROR;
     350         420 :                 WEP_SETWEIGHT(pos[npos - 1], 3);
     351             :             }
     352       18912 :             else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
     353             :             {
     354         216 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     355           0 :                     PRSSYNTAXERROR;
     356         216 :                 WEP_SETWEIGHT(pos[npos - 1], 2);
     357             :             }
     358       18696 :             else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
     359             :             {
     360         276 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     361           0 :                     PRSSYNTAXERROR;
     362         276 :                 WEP_SETWEIGHT(pos[npos - 1], 1);
     363             :             }
     364       18420 :             else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
     365             :             {
     366         132 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     367           0 :                     PRSSYNTAXERROR;
     368         132 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     369             :             }
     370       18288 :             else if (isspace((unsigned char) *state->prsbuf) ||
     371        8802 :                      *(state->prsbuf) == '\0')
     372        9918 :                 RETURN_TOKEN;
     373        8370 :             else if (!isdigit((unsigned char) *state->prsbuf))
     374           0 :                 PRSSYNTAXERROR;
     375             :         }
     376             :         else                    /* internal error */
     377           0 :             elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
     378             :                  statecode);
     379             : 
     380             :         /* get next char */
     381      592424 :         state->prsbuf += pg_mblen_cstr(state->prsbuf);
     382             :     }
     383             : }

Generated by: LCOV version 1.16