LCOV - code coverage report
Current view: top level - src/backend/utils/adt - tsvector_parser.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 87.8 % 139 122
Test Date: 2026-03-01 15:14:58 Functions: 100.0 % 5 5
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * tsvector_parser.c
       4              :  *    Parser for tsvector
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  *
       9              :  * IDENTIFICATION
      10              :  *    src/backend/utils/adt/tsvector_parser.c
      11              :  *
      12              :  *-------------------------------------------------------------------------
      13              :  */
      14              : 
      15              : #include "postgres.h"
      16              : 
      17              : #include "tsearch/ts_locale.h"
      18              : #include "tsearch/ts_utils.h"
      19              : 
      20              : 
      21              : /*
      22              :  * Private state of tsvector parser.  Note that tsquery also uses this code to
      23              :  * parse its input, hence the boolean flags.  The oprisdelim and is_tsquery
      24              :  * flags are both true or both false in current usage, but we keep them
      25              :  * separate for clarity.
      26              :  *
      27              :  * If oprisdelim is set, the following characters are treated as delimiters
      28              :  * (in addition to whitespace): ! | & ( )
      29              :  *
      30              :  * is_tsquery affects *only* the content of error messages.
      31              :  *
      32              :  * is_web can be true to further modify tsquery parsing.
      33              :  *
      34              :  * If escontext is an ErrorSaveContext node, then soft errors can be
      35              :  * captured there rather than being thrown.
      36              :  */
      37              : struct TSVectorParseStateData
      38              : {
      39              :     char       *prsbuf;         /* next input character */
      40              :     char       *bufstart;       /* whole string (used only for errors) */
      41              :     char       *word;           /* buffer to hold the current word */
      42              :     int         len;            /* size in bytes allocated for 'word' */
      43              :     int         eml;            /* max bytes per character */
      44              :     bool        oprisdelim;     /* treat ! | & ( ) as delimiters? */
      45              :     bool        is_tsquery;     /* say "tsquery" not "tsvector" in errors? */
      46              :     bool        is_web;         /* we're in websearch_to_tsquery() */
      47              :     Node       *escontext;      /* for soft error reporting */
      48              : };
      49              : 
      50              : 
      51              : /*
      52              :  * Initializes a parser state object for the given input string.
      53              :  * A bitmask of flags (see ts_utils.h) and an error context object
      54              :  * can be provided as well.
      55              :  */
      56              : TSVectorParseState
      57         3819 : init_tsvector_parser(char *input, int flags, Node *escontext)
      58              : {
      59              :     TSVectorParseState state;
      60              : 
      61         3819 :     state = palloc_object(struct TSVectorParseStateData);
      62         3819 :     state->prsbuf = input;
      63         3819 :     state->bufstart = input;
      64         3819 :     state->len = 32;
      65         3819 :     state->word = (char *) palloc(state->len);
      66         3819 :     state->eml = pg_database_encoding_max_length();
      67         3819 :     state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
      68         3819 :     state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
      69         3819 :     state->is_web = (flags & P_TSV_IS_WEB) != 0;
      70         3819 :     state->escontext = escontext;
      71              : 
      72         3819 :     return state;
      73              : }
      74              : 
      75              : /*
      76              :  * Reinitializes parser to parse 'input', instead of previous input.
      77              :  *
      78              :  * Note that bufstart (the string reported in errors) is not changed.
      79              :  */
      80              : void
      81         4068 : reset_tsvector_parser(TSVectorParseState state, char *input)
      82              : {
      83         4068 :     state->prsbuf = input;
      84         4068 : }
      85              : 
      86              : /*
      87              :  * Shuts down a tsvector parser.
      88              :  */
      89              : void
      90         3816 : close_tsvector_parser(TSVectorParseState state)
      91              : {
      92         3816 :     pfree(state->word);
      93         3816 :     pfree(state);
      94         3816 : }
      95              : 
      96              : /* increase the size of 'word' if needed to hold one more character */
      97              : #define RESIZEPRSBUF \
      98              : do { \
      99              :     int clen = curpos - state->word; \
     100              :     if ( clen + state->eml >= state->len ) \
     101              :     { \
     102              :         state->len *= 2; \
     103              :         state->word = (char *) repalloc(state->word, state->len); \
     104              :         curpos = state->word + clen; \
     105              :     } \
     106              : } while (0)
     107              : 
     108              : /* Fills gettoken_tsvector's output parameters, and returns true */
     109              : #define RETURN_TOKEN \
     110              : do { \
     111              :     if (pos_ptr != NULL) \
     112              :     { \
     113              :         *pos_ptr = pos; \
     114              :         *poslen = npos; \
     115              :     } \
     116              :     else if (pos != NULL) \
     117              :         pfree(pos); \
     118              :     \
     119              :     if (strval != NULL) \
     120              :         *strval = state->word; \
     121              :     if (lenval != NULL) \
     122              :         *lenval = curpos - state->word; \
     123              :     if (endptr != NULL) \
     124              :         *endptr = state->prsbuf; \
     125              :     return true; \
     126              : } while(0)
     127              : 
     128              : 
     129              : /* State codes used in gettoken_tsvector */
     130              : #define WAITWORD        1
     131              : #define WAITENDWORD     2
     132              : #define WAITNEXTCHAR    3
     133              : #define WAITENDCMPLX    4
     134              : #define WAITPOSINFO     5
     135              : #define INPOSINFO       6
     136              : #define WAITPOSDELIM    7
     137              : #define WAITCHARCMPLX   8
     138              : 
     139              : #define PRSSYNTAXERROR return prssyntaxerror(state)
     140              : 
     141              : static bool
     142            9 : prssyntaxerror(TSVectorParseState state)
     143              : {
     144            9 :     errsave(state->escontext,
     145              :             (errcode(ERRCODE_SYNTAX_ERROR),
     146              :              state->is_tsquery ?
     147              :              errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
     148              :              errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
     149              :     /* In soft error situation, return false as convenience for caller */
     150            6 :     return false;
     151              : }
     152              : 
     153              : 
     154              : /*
     155              :  * Get next token from string being parsed. Returns true if successful,
     156              :  * false if the end of the input string is reached or a soft error occurs.
     157              :  *
     158              :  * On success, these output parameters are filled in:
     159              :  *
     160              :  * *strval      pointer to token
     161              :  * *lenval      length of *strval
     162              :  * *pos_ptr     pointer to a palloc'd array of positions and weights
     163              :  *              associated with the token. If the caller is not interested
     164              :  *              in the information, NULL can be supplied. Otherwise
     165              :  *              the caller is responsible for pfreeing the array.
     166              :  * *poslen      number of elements in *pos_ptr
     167              :  * *endptr      scan resumption point
     168              :  *
     169              :  * Pass NULL for any unwanted output parameters.
     170              :  *
     171              :  * If state->escontext is an ErrorSaveContext, then caller must check
     172              :  * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
     173              :  * error or normal end-of-string.
     174              :  */
     175              : bool
     176        96368 : gettoken_tsvector(TSVectorParseState state,
     177              :                   char **strval, int *lenval,
     178              :                   WordEntryPos **pos_ptr, int *poslen,
     179              :                   char **endptr)
     180              : {
     181        96368 :     int         oldstate = 0;
     182        96368 :     char       *curpos = state->word;
     183        96368 :     int         statecode = WAITWORD;
     184              : 
     185              :     /*
     186              :      * pos collects the comma-delimited list of positions (with weights) that
     187              :      * follows the actual token in the input.
     188              :      */
     189        96368 :     WordEntryPos *pos = NULL;
     190        96368 :     int         npos = 0;       /* elements of pos used */
     191        96368 :     int         posalen = 0;    /* allocated size of pos */
     192              : 
     193              :     while (1)
     194              :     {
     195       392619 :         if (statecode == WAITWORD)
     196              :         {
     197       185058 :             if (*(state->prsbuf) == '\0')
     198         1889 :                 return false;
     199       183169 :             else if (!state->is_web && t_iseq(state->prsbuf, '\''))
     200           81 :                 statecode = WAITENDCMPLX;
     201       183088 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     202              :             {
     203            3 :                 statecode = WAITNEXTCHAR;
     204            3 :                 oldstate = WAITENDWORD;
     205              :             }
     206       183085 :             else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     207       183085 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     208            0 :                 PRSSYNTAXERROR;
     209       183085 :             else if (!isspace((unsigned char) *state->prsbuf))
     210              :             {
     211        94395 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     212        94395 :                 statecode = WAITENDWORD;
     213              :             }
     214              :         }
     215       207561 :         else if (statecode == WAITNEXTCHAR)
     216              :         {
     217           81 :             if (*(state->prsbuf) == '\0')
     218            0 :                 ereturn(state->escontext, false,
     219              :                         (errcode(ERRCODE_SYNTAX_ERROR),
     220              :                          errmsg("there is no escaped character: \"%s\"",
     221              :                                 state->bufstart)));
     222              :             else
     223              :             {
     224           81 :                 RESIZEPRSBUF;
     225           81 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     226              :                 Assert(oldstate != 0);
     227           81 :                 statecode = oldstate;
     228              :             }
     229              :         }
     230       207480 :         else if (statecode == WAITENDWORD)
     231              :         {
     232       191667 :             if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     233              :             {
     234           36 :                 statecode = WAITNEXTCHAR;
     235           36 :                 oldstate = WAITENDWORD;
     236              :             }
     237       191631 :             else if (isspace((unsigned char) *state->prsbuf) || *(state->prsbuf) == '\0' ||
     238       103440 :                      (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     239       102546 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     240              :             {
     241        89091 :                 RESIZEPRSBUF;
     242        89091 :                 if (curpos == state->word)
     243            0 :                     PRSSYNTAXERROR;
     244        89091 :                 *(curpos) = '\0';
     245        89091 :                 RETURN_TOKEN;
     246              :             }
     247       102540 :             else if (t_iseq(state->prsbuf, ':'))
     248              :             {
     249         5307 :                 if (curpos == state->word)
     250            0 :                     PRSSYNTAXERROR;
     251         5307 :                 *(curpos) = '\0';
     252         5307 :                 if (state->oprisdelim)
     253          348 :                     RETURN_TOKEN;
     254              :                 else
     255         4959 :                     statecode = INPOSINFO;
     256              :             }
     257              :             else
     258              :             {
     259        97233 :                 RESIZEPRSBUF;
     260        97233 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     261              :             }
     262              :         }
     263        15813 :         else if (statecode == WAITENDCMPLX)
     264              :         {
     265          462 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     266              :             {
     267           81 :                 statecode = WAITCHARCMPLX;
     268              :             }
     269          381 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     270              :             {
     271           42 :                 statecode = WAITNEXTCHAR;
     272           42 :                 oldstate = WAITENDCMPLX;
     273              :             }
     274          339 :             else if (*(state->prsbuf) == '\0')
     275            0 :                 PRSSYNTAXERROR;
     276              :             else
     277              :             {
     278          339 :                 RESIZEPRSBUF;
     279          339 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     280              :             }
     281              :         }
     282        15351 :         else if (statecode == WAITCHARCMPLX)
     283              :         {
     284           81 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     285              :             {
     286            0 :                 RESIZEPRSBUF;
     287            0 :                 curpos += ts_copychar_cstr(curpos, state->prsbuf);
     288            0 :                 statecode = WAITENDCMPLX;
     289              :             }
     290              :             else
     291              :             {
     292           81 :                 RESIZEPRSBUF;
     293           81 :                 *(curpos) = '\0';
     294           81 :                 if (curpos == state->word)
     295            9 :                     PRSSYNTAXERROR;
     296           72 :                 if (state->oprisdelim)
     297              :                 {
     298              :                     /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */
     299           33 :                     RETURN_TOKEN;
     300              :                 }
     301              :                 else
     302           39 :                     statecode = WAITPOSINFO;
     303           39 :                 continue;       /* recheck current character */
     304              :             }
     305              :         }
     306        15270 :         else if (statecode == WAITPOSINFO)
     307              :         {
     308           39 :             if (t_iseq(state->prsbuf, ':'))
     309            0 :                 statecode = INPOSINFO;
     310              :             else
     311           39 :                 RETURN_TOKEN;
     312              :         }
     313        15231 :         else if (statecode == INPOSINFO)
     314              :         {
     315         5262 :             if (isdigit((unsigned char) *state->prsbuf))
     316              :             {
     317         5262 :                 if (posalen == 0)
     318              :                 {
     319         4959 :                     posalen = 4;
     320         4959 :                     pos = palloc_array(WordEntryPos, posalen);
     321         4959 :                     npos = 0;
     322              :                 }
     323          303 :                 else if (npos + 1 >= posalen)
     324              :                 {
     325           57 :                     posalen *= 2;
     326           57 :                     pos = repalloc_array(pos, WordEntryPos, posalen);
     327              :                 }
     328         5262 :                 npos++;
     329         5262 :                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
     330              :                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
     331         5262 :                 if (WEP_GETPOS(pos[npos - 1]) == 0)
     332            0 :                     ereturn(state->escontext, false,
     333              :                             (errcode(ERRCODE_SYNTAX_ERROR),
     334              :                              errmsg("wrong position info in tsvector: \"%s\"",
     335              :                                     state->bufstart)));
     336         5262 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     337         5262 :                 statecode = WAITPOSDELIM;
     338              :             }
     339              :             else
     340            0 :                 PRSSYNTAXERROR;
     341              :         }
     342         9969 :         else if (statecode == WAITPOSDELIM)
     343              :         {
     344         9969 :             if (t_iseq(state->prsbuf, ','))
     345          303 :                 statecode = INPOSINFO;
     346         9666 :             else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
     347              :             {
     348          210 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     349            0 :                     PRSSYNTAXERROR;
     350          210 :                 WEP_SETWEIGHT(pos[npos - 1], 3);
     351              :             }
     352         9456 :             else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
     353              :             {
     354          108 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     355            0 :                     PRSSYNTAXERROR;
     356          108 :                 WEP_SETWEIGHT(pos[npos - 1], 2);
     357              :             }
     358         9348 :             else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
     359              :             {
     360          138 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     361            0 :                     PRSSYNTAXERROR;
     362          138 :                 WEP_SETWEIGHT(pos[npos - 1], 1);
     363              :             }
     364         9210 :             else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
     365              :             {
     366           66 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     367            0 :                     PRSSYNTAXERROR;
     368           66 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     369              :             }
     370         9144 :             else if (isspace((unsigned char) *state->prsbuf) ||
     371         4401 :                      *(state->prsbuf) == '\0')
     372         4959 :                 RETURN_TOKEN;
     373         4185 :             else if (!isdigit((unsigned char) *state->prsbuf))
     374            0 :                 PRSSYNTAXERROR;
     375              :         }
     376              :         else                    /* internal error */
     377            0 :             elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
     378              :                  statecode);
     379              : 
     380              :         /* get next char */
     381       296212 :         state->prsbuf += pg_mblen_cstr(state->prsbuf);
     382              :     }
     383              : }
        

Generated by: LCOV version 2.0-1