LCOV - code coverage report
Current view: top level - src/backend/utils/adt - tsquery_cleanup.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 122 159 76.7 %
Date: 2025-01-18 04:15:08 Functions: 7 9 77.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tsquery_cleanup.c
       4             :  *   Cleanup query from NOT values and/or stopword
       5             :  *   Utility functions to correct work.
       6             :  *
       7             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/adt/tsquery_cleanup.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include "miscadmin.h"
      19             : #include "tsearch/ts_utils.h"
      20             : #include "varatt.h"
      21             : 
      22             : typedef struct NODE
      23             : {
      24             :     struct NODE *left;
      25             :     struct NODE *right;
      26             :     QueryItem  *valnode;
      27             : } NODE;
      28             : 
      29             : /*
      30             :  * make query tree from plain view of query
      31             :  */
      32             : static NODE *
      33        2928 : maketree(QueryItem *in)
      34             : {
      35        2928 :     NODE       *node = (NODE *) palloc(sizeof(NODE));
      36             : 
      37             :     /* since this function recurses, it could be driven to stack overflow. */
      38        2928 :     check_stack_depth();
      39             : 
      40        2928 :     node->valnode = in;
      41        2928 :     node->right = node->left = NULL;
      42        2928 :     if (in->type == QI_OPR)
      43             :     {
      44        1290 :         node->right = maketree(in + 1);
      45        1290 :         if (in->qoperator.oper != OP_NOT)
      46        1206 :             node->left = maketree(in + in->qoperator.left);
      47             :     }
      48        2928 :     return node;
      49             : }
      50             : 
      51             : /*
      52             :  * Internal state for plaintree and plainnode
      53             :  */
      54             : typedef struct
      55             : {
      56             :     QueryItem  *ptr;
      57             :     int         len;            /* allocated size of ptr */
      58             :     int         cur;            /* number of elements in ptr */
      59             : } PLAINTREE;
      60             : 
      61             : static void
      62        1602 : plainnode(PLAINTREE *state, NODE *node)
      63             : {
      64             :     /* since this function recurses, it could be driven to stack overflow. */
      65        1602 :     check_stack_depth();
      66             : 
      67        1602 :     if (state->cur == state->len)
      68             :     {
      69           0 :         state->len *= 2;
      70           0 :         state->ptr = (QueryItem *) repalloc(state->ptr, state->len * sizeof(QueryItem));
      71             :     }
      72        1602 :     memcpy(&(state->ptr[state->cur]), node->valnode, sizeof(QueryItem));
      73        1602 :     if (node->valnode->type == QI_VAL)
      74         972 :         state->cur++;
      75         630 :     else if (node->valnode->qoperator.oper == OP_NOT)
      76             :     {
      77          66 :         state->ptr[state->cur].qoperator.left = 1;
      78          66 :         state->cur++;
      79          66 :         plainnode(state, node->right);
      80             :     }
      81             :     else
      82             :     {
      83         564 :         int         cur = state->cur;
      84             : 
      85         564 :         state->cur++;
      86         564 :         plainnode(state, node->right);
      87         564 :         state->ptr[cur].qoperator.left = state->cur - cur;
      88         564 :         plainnode(state, node->left);
      89             :     }
      90        1602 :     pfree(node);
      91        1602 : }
      92             : 
      93             : /*
      94             :  * make plain view of tree from a NODE-tree representation
      95             :  */
      96             : static QueryItem *
      97         408 : plaintree(NODE *root, int *len)
      98             : {
      99             :     PLAINTREE   pl;
     100             : 
     101         408 :     pl.cur = 0;
     102         408 :     pl.len = 16;
     103         408 :     if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
     104             :     {
     105         408 :         pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
     106         408 :         plainnode(&pl, root);
     107             :     }
     108             :     else
     109           0 :         pl.ptr = NULL;
     110         408 :     *len = pl.cur;
     111         408 :     return pl.ptr;
     112             : }
     113             : 
     114             : static void
     115          42 : freetree(NODE *node)
     116             : {
     117             :     /* since this function recurses, it could be driven to stack overflow. */
     118          42 :     check_stack_depth();
     119             : 
     120          42 :     if (!node)
     121           0 :         return;
     122          42 :     if (node->left)
     123           0 :         freetree(node->left);
     124          42 :     if (node->right)
     125           0 :         freetree(node->right);
     126          42 :     pfree(node);
     127             : }
     128             : 
     129             : /*
     130             :  * clean tree for ! operator.
     131             :  * It's useful for debug, but in
     132             :  * other case, such view is used with search in index.
     133             :  * Operator ! always return TRUE
     134             :  */
     135             : static NODE *
     136           0 : clean_NOT_intree(NODE *node)
     137             : {
     138             :     /* since this function recurses, it could be driven to stack overflow. */
     139           0 :     check_stack_depth();
     140             : 
     141           0 :     if (node->valnode->type == QI_VAL)
     142           0 :         return node;
     143             : 
     144           0 :     if (node->valnode->qoperator.oper == OP_NOT)
     145             :     {
     146           0 :         freetree(node);
     147           0 :         return NULL;
     148             :     }
     149             : 
     150             :     /* operator & or | */
     151           0 :     if (node->valnode->qoperator.oper == OP_OR)
     152             :     {
     153           0 :         if ((node->left = clean_NOT_intree(node->left)) == NULL ||
     154           0 :             (node->right = clean_NOT_intree(node->right)) == NULL)
     155             :         {
     156           0 :             freetree(node);
     157           0 :             return NULL;
     158             :         }
     159             :     }
     160             :     else
     161             :     {
     162           0 :         NODE       *res = node;
     163             : 
     164             :         Assert(node->valnode->qoperator.oper == OP_AND ||
     165             :                node->valnode->qoperator.oper == OP_PHRASE);
     166             : 
     167           0 :         node->left = clean_NOT_intree(node->left);
     168           0 :         node->right = clean_NOT_intree(node->right);
     169           0 :         if (node->left == NULL && node->right == NULL)
     170             :         {
     171           0 :             pfree(node);
     172           0 :             res = NULL;
     173             :         }
     174           0 :         else if (node->left == NULL)
     175             :         {
     176           0 :             res = node->right;
     177           0 :             pfree(node);
     178             :         }
     179           0 :         else if (node->right == NULL)
     180             :         {
     181           0 :             res = node->left;
     182           0 :             pfree(node);
     183             :         }
     184           0 :         return res;
     185             :     }
     186           0 :     return node;
     187             : }
     188             : 
     189             : QueryItem *
     190           0 : clean_NOT(QueryItem *ptr, int *len)
     191             : {
     192           0 :     NODE       *root = maketree(ptr);
     193             : 
     194           0 :     return plaintree(clean_NOT_intree(root), len);
     195             : }
     196             : 
     197             : 
     198             : /*
     199             :  * Remove QI_VALSTOP (stopword) nodes from query tree.
     200             :  *
     201             :  * Returns NULL if the query degenerates to nothing.  Input must not be NULL.
     202             :  *
     203             :  * When we remove a phrase operator due to removing one or both of its
     204             :  * arguments, we might need to adjust the distance of a parent phrase
     205             :  * operator.  For example, 'a' is a stopword, so:
     206             :  *      (b <-> a) <-> c  should become  b <2> c
     207             :  *      b <-> (a <-> c)  should become  b <2> c
     208             :  *      (b <-> (a <-> a)) <-> c  should become    b <3> c
     209             :  *      b <-> ((a <-> a) <-> c)  should become    b <3> c
     210             :  * To handle that, we define two output parameters:
     211             :  *      ladd: amount to add to a phrase distance to the left of this node
     212             :  *      radd: amount to add to a phrase distance to the right of this node
     213             :  * We need two outputs because we could need to bubble up adjustments to two
     214             :  * different parent phrase operators.  Consider
     215             :  *      w <-> (((a <-> x) <2> (y <3> a)) <-> z)
     216             :  * After we've removed the two a's and are considering the <2> node (which is
     217             :  * now just x <2> y), we have an ladd distance of 1 that needs to propagate
     218             :  * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
     219             :  * propagate to the rightmost <->, so that we'll end up with
     220             :  *      w <2> ((x <2> y) <4> z)
     221             :  * Near the bottom of the tree, we may have subtrees consisting only of
     222             :  * stopwords.  The distances of any phrase operators within such a subtree are
     223             :  * summed and propagated to both ladd and radd, since we don't know which side
     224             :  * of the lowest surviving phrase operator we are in.  The rule is that any
     225             :  * subtree that degenerates to NULL must return equal values of ladd and radd,
     226             :  * and the parent node dealing with it should incorporate only one of those.
     227             :  *
     228             :  * Currently, we only implement this adjustment for adjacent phrase operators.
     229             :  * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
     230             :  * isn't ideal, but there is no way to represent the really desired semantics
     231             :  * without some redesign of the tsquery structure.  Certainly it would not be
     232             :  * any better to convert that to 'x <2> (y | z)'.  Since this is such a weird
     233             :  * corner case, let it go for now.  But we can fix it in cases where the
     234             :  * intervening non-phrase operator also gets removed, for example
     235             :  * '((x <-> a) | a) <-> y' will become 'x <2> y'.
     236             :  */
     237             : static NODE *
     238        2928 : clean_stopword_intree(NODE *node, int *ladd, int *radd)
     239             : {
     240             :     /* since this function recurses, it could be driven to stack overflow. */
     241        2928 :     check_stack_depth();
     242             : 
     243             :     /* default output parameters indicate no change in parent distance */
     244        2928 :     *ladd = *radd = 0;
     245             : 
     246        2928 :     if (node->valnode->type == QI_VAL)
     247         972 :         return node;
     248        1956 :     else if (node->valnode->type == QI_VALSTOP)
     249             :     {
     250         666 :         pfree(node);
     251         666 :         return NULL;
     252             :     }
     253             : 
     254             :     Assert(node->valnode->type == QI_OPR);
     255             : 
     256        1290 :     if (node->valnode->qoperator.oper == OP_NOT)
     257             :     {
     258             :         /* NOT doesn't change pattern width, so just report child distances */
     259          84 :         node->right = clean_stopword_intree(node->right, ladd, radd);
     260          84 :         if (!node->right)
     261             :         {
     262          18 :             freetree(node);
     263          18 :             return NULL;
     264             :         }
     265             :     }
     266             :     else
     267             :     {
     268        1206 :         NODE       *res = node;
     269             :         bool        isphrase;
     270             :         int         ndistance,
     271             :                     lladd,
     272             :                     lradd,
     273             :                     rladd,
     274             :                     rradd;
     275             : 
     276             :         /* First, recurse */
     277        1206 :         node->left = clean_stopword_intree(node->left, &lladd, &lradd);
     278        1206 :         node->right = clean_stopword_intree(node->right, &rladd, &rradd);
     279             : 
     280             :         /* Check if current node is OP_PHRASE, get its distance */
     281        1206 :         isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
     282        1206 :         ndistance = isphrase ? node->valnode->qoperator.distance : 0;
     283             : 
     284        1206 :         if (node->left == NULL && node->right == NULL)
     285             :         {
     286             :             /*
     287             :              * When we collapse out a phrase node entirely, propagate its own
     288             :              * distance into both *ladd and *radd; it is the responsibility of
     289             :              * the parent node to count it only once.  Also, for a phrase
     290             :              * node, distances coming from children are summed and propagated
     291             :              * up to parent (we assume lladd == lradd and rladd == rradd, else
     292             :              * rule was broken at a lower level).  But if this isn't a phrase
     293             :              * node, take the larger of the two child distances; that
     294             :              * corresponds to what TS_execute will do in non-stopword cases.
     295             :              */
     296          24 :             if (isphrase)
     297           0 :                 *ladd = *radd = lladd + ndistance + rladd;
     298             :             else
     299          24 :                 *ladd = *radd = Max(lladd, rladd);
     300          24 :             freetree(node);
     301          24 :             return NULL;
     302             :         }
     303        1182 :         else if (node->left == NULL)
     304             :         {
     305             :             /* Removing this operator and left subnode */
     306             :             /* lladd and lradd are equal/redundant, don't count both */
     307         252 :             if (isphrase)
     308             :             {
     309             :                 /* operator's own distance must propagate to left */
     310         162 :                 *ladd = lladd + ndistance + rladd;
     311         162 :                 *radd = rradd;
     312             :             }
     313             :             else
     314             :             {
     315             :                 /* at non-phrase op, just forget the left subnode entirely */
     316          90 :                 *ladd = rladd;
     317          90 :                 *radd = rradd;
     318             :             }
     319         252 :             res = node->right;
     320         252 :             pfree(node);
     321             :         }
     322         930 :         else if (node->right == NULL)
     323             :         {
     324             :             /* Removing this operator and right subnode */
     325             :             /* rladd and rradd are equal/redundant, don't count both */
     326         366 :             if (isphrase)
     327             :             {
     328             :                 /* operator's own distance must propagate to right */
     329         216 :                 *ladd = lladd;
     330         216 :                 *radd = lradd + ndistance + rradd;
     331             :             }
     332             :             else
     333             :             {
     334             :                 /* at non-phrase op, just forget the right subnode entirely */
     335         150 :                 *ladd = lladd;
     336         150 :                 *radd = lradd;
     337             :             }
     338         366 :             res = node->left;
     339         366 :             pfree(node);
     340             :         }
     341         564 :         else if (isphrase)
     342             :         {
     343             :             /* Absorb appropriate corrections at this level */
     344         342 :             node->valnode->qoperator.distance += lradd + rladd;
     345             :             /* Propagate up any unaccounted-for corrections */
     346         342 :             *ladd = lladd;
     347         342 :             *radd = rradd;
     348             :         }
     349             :         else
     350             :         {
     351             :             /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
     352             :         }
     353             : 
     354        1182 :         return res;
     355             :     }
     356          66 :     return node;
     357             : }
     358             : 
     359             : /*
     360             :  * Number of elements in query tree
     361             :  */
     362             : static int32
     363        1602 : calcstrlen(NODE *node)
     364             : {
     365        1602 :     int32       size = 0;
     366             : 
     367        1602 :     if (node->valnode->type == QI_VAL)
     368             :     {
     369         972 :         size = node->valnode->qoperand.length + 1;
     370             :     }
     371             :     else
     372             :     {
     373             :         Assert(node->valnode->type == QI_OPR);
     374             : 
     375         630 :         size = calcstrlen(node->right);
     376         630 :         if (node->valnode->qoperator.oper != OP_NOT)
     377         564 :             size += calcstrlen(node->left);
     378             :     }
     379             : 
     380        1602 :     return size;
     381             : }
     382             : 
     383             : /*
     384             :  * Remove QI_VALSTOP (stopword) nodes from TSQuery.
     385             :  */
     386             : TSQuery
     387         432 : cleanup_tsquery_stopwords(TSQuery in, bool noisy)
     388             : {
     389             :     int32       len,
     390             :                 lenstr,
     391             :                 commonlen,
     392             :                 i;
     393             :     NODE       *root;
     394             :     int         ladd,
     395             :                 radd;
     396             :     TSQuery     out;
     397             :     QueryItem  *items;
     398             :     char       *operands;
     399             : 
     400         432 :     if (in->size == 0)
     401           0 :         return in;
     402             : 
     403             :     /* eliminate stop words */
     404         432 :     root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
     405         432 :     if (root == NULL)
     406             :     {
     407          24 :         if (noisy)
     408          24 :             ereport(NOTICE,
     409             :                     (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
     410          24 :         out = palloc(HDRSIZETQ);
     411          24 :         out->size = 0;
     412          24 :         SET_VARSIZE(out, HDRSIZETQ);
     413          24 :         return out;
     414             :     }
     415             : 
     416             :     /*
     417             :      * Build TSQuery from plain view
     418             :      */
     419             : 
     420         408 :     lenstr = calcstrlen(root);
     421         408 :     items = plaintree(root, &len);
     422         408 :     commonlen = COMPUTESIZE(len, lenstr);
     423             : 
     424         408 :     out = palloc(commonlen);
     425         408 :     SET_VARSIZE(out, commonlen);
     426         408 :     out->size = len;
     427             : 
     428         408 :     memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
     429             : 
     430         408 :     items = GETQUERY(out);
     431         408 :     operands = GETOPERAND(out);
     432        2010 :     for (i = 0; i < out->size; i++)
     433             :     {
     434        1602 :         QueryOperand *op = (QueryOperand *) &items[i];
     435             : 
     436        1602 :         if (op->type != QI_VAL)
     437         630 :             continue;
     438             : 
     439         972 :         memcpy(operands, GETOPERAND(in) + op->distance, op->length);
     440         972 :         operands[op->length] = '\0';
     441         972 :         op->distance = operands - GETOPERAND(out);
     442         972 :         operands += op->length + 1;
     443             :     }
     444             : 
     445         408 :     return out;
     446             : }

Generated by: LCOV version 1.14