LCOV - code coverage report
Current view: top level - src/backend/tsearch - ts_selfuncs.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 86.5 % 104 90
Test Date: 2026-03-01 22:14:38 Functions: 83.3 % 6 5
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * ts_selfuncs.c
       4              :  *    Selectivity estimation functions for text search operators.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  *
       9              :  * IDENTIFICATION
      10              :  *    src/backend/tsearch/ts_selfuncs.c
      11              :  *
      12              :  *-------------------------------------------------------------------------
      13              :  */
      14              : #include "postgres.h"
      15              : 
      16              : #include "access/htup_details.h"
      17              : #include "catalog/pg_statistic.h"
      18              : #include "catalog/pg_type.h"
      19              : #include "miscadmin.h"
      20              : #include "nodes/nodes.h"
      21              : #include "tsearch/ts_type.h"
      22              : #include "utils/fmgrprotos.h"
      23              : #include "utils/lsyscache.h"
      24              : #include "utils/selfuncs.h"
      25              : 
      26              : 
      27              : /*
      28              :  * The default text search selectivity is chosen to be small enough to
      29              :  * encourage indexscans for typical table densities.  See selfuncs.h and
      30              :  * DEFAULT_EQ_SEL for details.
      31              :  */
      32              : #define DEFAULT_TS_MATCH_SEL 0.005
      33              : 
      34              : /* lookup table type for binary searching through MCELEMs */
      35              : typedef struct
      36              : {
      37              :     text       *element;
      38              :     float4      frequency;
      39              : } TextFreq;
      40              : 
      41              : /* type of keys for bsearch'ing through an array of TextFreqs */
      42              : typedef struct
      43              : {
      44              :     char       *lexeme;
      45              :     int         length;
      46              : } LexemeKey;
      47              : 
      48              : static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
      49              : static Selectivity mcelem_tsquery_selec(TSQuery query,
      50              :                                         const Datum *mcelem, int nmcelem,
      51              :                                         const float4 *numbers, int nnumbers);
      52              : static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
      53              :                                      TextFreq *lookup, int length, float4 minfreq);
      54              : static int  compare_lexeme_textfreq(const void *e1, const void *e2);
      55              : 
      56              : #define tsquery_opr_selec_no_stats(query) \
      57              :     tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
      58              : 
      59              : 
      60              : /*
      61              :  *  tsmatchsel -- Selectivity of "@@"
      62              :  *
      63              :  * restriction selectivity function for tsvector @@ tsquery and
      64              :  * tsquery @@ tsvector
      65              :  */
      66              : Datum
      67          513 : tsmatchsel(PG_FUNCTION_ARGS)
      68              : {
      69          513 :     PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
      70              : 
      71              : #ifdef NOT_USED
      72              :     Oid         operator = PG_GETARG_OID(1);
      73              : #endif
      74          513 :     List       *args = (List *) PG_GETARG_POINTER(2);
      75          513 :     int         varRelid = PG_GETARG_INT32(3);
      76              :     VariableStatData vardata;
      77              :     Node       *other;
      78              :     bool        varonleft;
      79              :     Selectivity selec;
      80              : 
      81              :     /*
      82              :      * If expression is not variable = something or something = variable, then
      83              :      * punt and return a default estimate.
      84              :      */
      85          513 :     if (!get_restriction_variable(root, args, varRelid,
      86              :                                   &vardata, &other, &varonleft))
      87            0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
      88              : 
      89              :     /*
      90              :      * Can't do anything useful if the something is not a constant, either.
      91              :      */
      92          513 :     if (!IsA(other, Const))
      93              :     {
      94            0 :         ReleaseVariableStats(vardata);
      95            0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
      96              :     }
      97              : 
      98              :     /*
      99              :      * The "@@" operator is strict, so we can cope with NULL right away
     100              :      */
     101          513 :     if (((Const *) other)->constisnull)
     102              :     {
     103            0 :         ReleaseVariableStats(vardata);
     104            0 :         PG_RETURN_FLOAT8(0.0);
     105              :     }
     106              : 
     107              :     /*
     108              :      * OK, there's a Var and a Const we're dealing with here.  We need the
     109              :      * Const to be a TSQuery, else we can't do anything useful.  We have to
     110              :      * check this because the Var might be the TSQuery not the TSVector.
     111              :      *
     112              :      * Also check that the Var really is a TSVector, in case this estimator is
     113              :      * mistakenly attached to some other operator.
     114              :      */
     115          513 :     if (((Const *) other)->consttype == TSQUERYOID &&
     116          513 :         vardata.vartype == TSVECTOROID)
     117              :     {
     118              :         /* tsvector @@ tsquery or the other way around */
     119          513 :         selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
     120              :     }
     121              :     else
     122              :     {
     123              :         /* If we can't see the query structure, must punt */
     124            0 :         selec = DEFAULT_TS_MATCH_SEL;
     125              :     }
     126              : 
     127          513 :     ReleaseVariableStats(vardata);
     128              : 
     129          513 :     CLAMP_PROBABILITY(selec);
     130              : 
     131          513 :     PG_RETURN_FLOAT8((float8) selec);
     132              : }
     133              : 
     134              : 
     135              : /*
     136              :  *  tsmatchjoinsel -- join selectivity of "@@"
     137              :  *
     138              :  * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
     139              :  */
     140              : Datum
     141            0 : tsmatchjoinsel(PG_FUNCTION_ARGS)
     142              : {
     143              :     /* for the moment we just punt */
     144            0 :     PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
     145              : }
     146              : 
     147              : 
     148              : /*
     149              :  * @@ selectivity for tsvector var vs tsquery constant
     150              :  */
     151              : static Selectivity
     152          513 : tsquerysel(VariableStatData *vardata, Datum constval)
     153              : {
     154              :     Selectivity selec;
     155              :     TSQuery     query;
     156              : 
     157              :     /* The caller made sure the const is a TSQuery, so get it now */
     158          513 :     query = DatumGetTSQuery(constval);
     159              : 
     160              :     /* Empty query matches nothing */
     161          513 :     if (query->size == 0)
     162            0 :         return (Selectivity) 0.0;
     163              : 
     164          513 :     if (HeapTupleIsValid(vardata->statsTuple))
     165              :     {
     166              :         Form_pg_statistic stats;
     167              :         AttStatsSlot sslot;
     168              : 
     169          495 :         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
     170              : 
     171              :         /* MCELEM will be an array of TEXT elements for a tsvector column */
     172          495 :         if (get_attstatsslot(&sslot, vardata->statsTuple,
     173              :                              STATISTIC_KIND_MCELEM, InvalidOid,
     174              :                              ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
     175              :         {
     176              :             /*
     177              :              * There is a most-common-elements slot for the tsvector Var, so
     178              :              * use that.
     179              :              */
     180          495 :             selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
     181          495 :                                          sslot.numbers, sslot.nnumbers);
     182          495 :             free_attstatsslot(&sslot);
     183              :         }
     184              :         else
     185              :         {
     186              :             /* No most-common-elements info, so do without */
     187            0 :             selec = tsquery_opr_selec_no_stats(query);
     188              :         }
     189              : 
     190              :         /*
     191              :          * MCE stats count only non-null rows, so adjust for null rows.
     192              :          */
     193          495 :         selec *= (1.0 - stats->stanullfrac);
     194              :     }
     195              :     else
     196              :     {
     197              :         /* No stats at all, so do without */
     198           18 :         selec = tsquery_opr_selec_no_stats(query);
     199              :         /* we assume no nulls here, so no stanullfrac correction */
     200              :     }
     201              : 
     202          513 :     return selec;
     203              : }
     204              : 
     205              : /*
     206              :  * Extract data from the pg_statistic arrays into useful format.
     207              :  */
     208              : static Selectivity
     209          495 : mcelem_tsquery_selec(TSQuery query, const Datum *mcelem, int nmcelem,
     210              :                      const float4 *numbers, int nnumbers)
     211              : {
     212              :     float4      minfreq;
     213              :     TextFreq   *lookup;
     214              :     Selectivity selec;
     215              :     int         i;
     216              : 
     217              :     /*
     218              :      * There should be two more Numbers than Values, because the last two
     219              :      * cells are taken for minimal and maximal frequency.  Punt if not.
     220              :      *
     221              :      * (Note: the MCELEM statistics slot definition allows for a third extra
     222              :      * number containing the frequency of nulls, but we're not expecting that
     223              :      * to appear for a tsvector column.)
     224              :      */
     225          495 :     if (nnumbers != nmcelem + 2)
     226            0 :         return tsquery_opr_selec_no_stats(query);
     227              : 
     228              :     /*
     229              :      * Transpose the data into a single array so we can use bsearch().
     230              :      */
     231          495 :     lookup = palloc_array(TextFreq, nmcelem);
     232       495495 :     for (i = 0; i < nmcelem; i++)
     233              :     {
     234              :         /*
     235              :          * The text Datums came from an array, so it cannot be compressed or
     236              :          * stored out-of-line -- it's safe to use VARSIZE_ANY*.
     237              :          */
     238              :         Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(mcelem[i])) && !VARATT_IS_EXTERNAL(DatumGetPointer(mcelem[i])));
     239       495000 :         lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
     240       495000 :         lookup[i].frequency = numbers[i];
     241              :     }
     242              : 
     243              :     /*
     244              :      * Grab the lowest MCE frequency. compute_tsvector_stats() stored it for
     245              :      * us in the one before the last cell of the Numbers array.
     246              :      */
     247          495 :     minfreq = numbers[nnumbers - 2];
     248              : 
     249          495 :     selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
     250              :                               nmcelem, minfreq);
     251              : 
     252          495 :     pfree(lookup);
     253              : 
     254          495 :     return selec;
     255              : }
     256              : 
     257              : /*
     258              :  * Traverse the tsquery in preorder, calculating selectivity as:
     259              :  *
     260              :  *   selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
     261              :  *
     262              :  *   selec(left_oper) + selec(right_oper) -
     263              :  *      selec(left_oper) * selec(right_oper) in OR nodes,
     264              :  *
     265              :  *   1 - select(oper) in NOT nodes
     266              :  *
     267              :  *   histogram-based estimation in prefix VAL nodes
     268              :  *
     269              :  *   freq[val] in exact VAL nodes, if the value is in MCELEM
     270              :  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
     271              :  *
     272              :  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
     273              :  * binary search for determining freq[MCELEM].
     274              :  *
     275              :  * If we don't have stats for the tsvector, we still use this logic,
     276              :  * except we use default estimates for VAL nodes.  This case is signaled
     277              :  * by lookup == NULL.
     278              :  */
     279              : static Selectivity
     280         1539 : tsquery_opr_selec(QueryItem *item, char *operand,
     281              :                   TextFreq *lookup, int length, float4 minfreq)
     282              : {
     283              :     Selectivity selec;
     284              : 
     285              :     /* since this function recurses, it could be driven to stack overflow */
     286         1539 :     check_stack_depth();
     287              : 
     288         1539 :     if (item->type == QI_VAL)
     289              :     {
     290          921 :         QueryOperand *oper = (QueryOperand *) item;
     291              :         LexemeKey   key;
     292              : 
     293              :         /*
     294              :          * Prepare the key for bsearch().
     295              :          */
     296          921 :         key.lexeme = operand + oper->distance;
     297          921 :         key.length = oper->length;
     298              : 
     299          921 :         if (oper->prefix)
     300              :         {
     301              :             /* Prefix match, ie the query item is lexeme:* */
     302              :             Selectivity matched,
     303              :                         allmces;
     304              :             int         i,
     305              :                         n_matched;
     306              : 
     307              :             /*
     308              :              * Our strategy is to scan through the MCELEM list and combine the
     309              :              * frequencies of the ones that match the prefix.  We then
     310              :              * extrapolate the fraction of matching MCELEMs to the remaining
     311              :              * rows, assuming that the MCELEMs are representative of the whole
     312              :              * lexeme population in this respect.  (Compare
     313              :              * histogram_selectivity().)  Note that these are most common
     314              :              * elements not most common values, so they're not mutually
     315              :              * exclusive.  We treat occurrences as independent events.
     316              :              *
     317              :              * This is only a good plan if we have a pretty fair number of
     318              :              * MCELEMs available; we set the threshold at 100.  If no stats or
     319              :              * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
     320              :              */
     321           51 :             if (lookup == NULL || length < 100)
     322           21 :                 return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
     323              : 
     324           36 :             matched = allmces = 0;
     325           36 :             n_matched = 0;
     326        36036 :             for (i = 0; i < length; i++)
     327              :             {
     328        36000 :                 TextFreq   *t = lookup + i;
     329        36000 :                 int         tlen = VARSIZE_ANY_EXHDR(t->element);
     330              : 
     331        36000 :                 if (tlen >= key.length &&
     332        36000 :                     strncmp(key.lexeme, VARDATA_ANY(t->element),
     333        36000 :                             key.length) == 0)
     334              :                 {
     335         1296 :                     matched += t->frequency - matched * t->frequency;
     336         1296 :                     n_matched++;
     337              :                 }
     338        36000 :                 allmces += t->frequency - allmces * t->frequency;
     339              :             }
     340              : 
     341              :             /* Clamp to ensure sanity in the face of roundoff error */
     342           36 :             CLAMP_PROBABILITY(matched);
     343           36 :             CLAMP_PROBABILITY(allmces);
     344              : 
     345           36 :             selec = matched + (1.0 - allmces) * ((double) n_matched / length);
     346              : 
     347              :             /*
     348              :              * In any case, never believe that a prefix match has selectivity
     349              :              * less than we would assign for a non-MCELEM lexeme.  This
     350              :              * preserves the property that "word:*" should be estimated to
     351              :              * match at least as many rows as "word" would be.
     352              :              */
     353           36 :             selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
     354              :         }
     355              :         else
     356              :         {
     357              :             /* Regular exact lexeme match */
     358              :             TextFreq   *searchres;
     359              : 
     360              :             /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
     361          870 :             if (lookup == NULL)
     362            6 :                 return (Selectivity) DEFAULT_TS_MATCH_SEL;
     363              : 
     364          864 :             searchres = (TextFreq *) bsearch(&key, lookup, length,
     365              :                                              sizeof(TextFreq),
     366              :                                              compare_lexeme_textfreq);
     367              : 
     368          864 :             if (searchres)
     369              :             {
     370              :                 /*
     371              :                  * The element is in MCELEM.  Return precise selectivity (or
     372              :                  * at least as precise as ANALYZE could find out).
     373              :                  */
     374          804 :                 selec = searchres->frequency;
     375              :             }
     376              :             else
     377              :             {
     378              :                 /*
     379              :                  * The element is not in MCELEM.  Estimate its frequency as
     380              :                  * half that of the least-frequent MCE.  (We know it cannot be
     381              :                  * more than minfreq, and it could be a great deal less.  Half
     382              :                  * seems like a good compromise.)  For probably-historical
     383              :                  * reasons, clamp to not more than DEFAULT_TS_MATCH_SEL.
     384              :                  */
     385           60 :                 selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
     386              :             }
     387              :         }
     388              :     }
     389              :     else
     390              :     {
     391              :         /* Current TSQuery node is an operator */
     392              :         Selectivity s1,
     393              :                     s2;
     394              : 
     395          618 :         switch (item->qoperator.oper)
     396              :         {
     397          210 :             case OP_NOT:
     398          210 :                 selec = 1.0 - tsquery_opr_selec(item + 1, operand,
     399              :                                                 lookup, length, minfreq);
     400          210 :                 break;
     401              : 
     402          285 :             case OP_PHRASE:
     403              :             case OP_AND:
     404          285 :                 s1 = tsquery_opr_selec(item + 1, operand,
     405              :                                        lookup, length, minfreq);
     406          285 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
     407              :                                        lookup, length, minfreq);
     408          285 :                 selec = s1 * s2;
     409          285 :                 break;
     410              : 
     411          123 :             case OP_OR:
     412          123 :                 s1 = tsquery_opr_selec(item + 1, operand,
     413              :                                        lookup, length, minfreq);
     414          123 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
     415              :                                        lookup, length, minfreq);
     416          123 :                 selec = s1 + s2 - s1 * s2;
     417          123 :                 break;
     418              : 
     419            0 :             default:
     420            0 :                 elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
     421              :                 selec = 0;      /* keep compiler quiet */
     422              :                 break;
     423              :         }
     424              :     }
     425              : 
     426              :     /* Clamp intermediate results to stay sane despite roundoff error */
     427         1518 :     CLAMP_PROBABILITY(selec);
     428              : 
     429         1518 :     return selec;
     430              : }
     431              : 
     432              : /*
     433              :  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
     434              :  * and a TextFreq. Use length, then byte-for-byte comparison, because that's
     435              :  * how ANALYZE code sorted data before storing it in a statistic tuple.
     436              :  * See ts_typanalyze.c for details.
     437              :  */
     438              : static int
     439         6672 : compare_lexeme_textfreq(const void *e1, const void *e2)
     440              : {
     441         6672 :     const LexemeKey *key = (const LexemeKey *) e1;
     442         6672 :     const TextFreq *t = (const TextFreq *) e2;
     443              :     int         len1,
     444              :                 len2;
     445              : 
     446         6672 :     len1 = key->length;
     447         6672 :     len2 = VARSIZE_ANY_EXHDR(t->element);
     448              : 
     449              :     /* Compare lengths first, possibly avoiding a strncmp call */
     450         6672 :     if (len1 > len2)
     451          540 :         return 1;
     452         6132 :     else if (len1 < len2)
     453            0 :         return -1;
     454              : 
     455              :     /* Fall back on byte-for-byte comparison */
     456         6132 :     return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
     457              : }
        

Generated by: LCOV version 2.0-1