LCOV - code coverage report
Current view: top level - src/backend/tsearch - ts_selfuncs.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 88 102 86.3 %
Date: 2024-12-03 07:14:52 Functions: 5 6 83.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * ts_selfuncs.c
       4             :  *    Selectivity estimation functions for text search operators.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/tsearch/ts_selfuncs.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : #include "postgres.h"
      15             : 
      16             : #include "access/htup_details.h"
      17             : #include "catalog/pg_statistic.h"
      18             : #include "catalog/pg_type.h"
      19             : #include "miscadmin.h"
      20             : #include "nodes/nodes.h"
      21             : #include "tsearch/ts_type.h"
      22             : #include "utils/fmgrprotos.h"
      23             : #include "utils/lsyscache.h"
      24             : #include "utils/selfuncs.h"
      25             : 
      26             : 
      27             : /*
      28             :  * The default text search selectivity is chosen to be small enough to
      29             :  * encourage indexscans for typical table densities.  See selfuncs.h and
      30             :  * DEFAULT_EQ_SEL for details.
      31             :  */
      32             : #define DEFAULT_TS_MATCH_SEL 0.005
      33             : 
      34             : /* lookup table type for binary searching through MCELEMs */
      35             : typedef struct
      36             : {
      37             :     text       *element;
      38             :     float4      frequency;
      39             : } TextFreq;
      40             : 
      41             : /* type of keys for bsearch'ing through an array of TextFreqs */
      42             : typedef struct
      43             : {
      44             :     char       *lexeme;
      45             :     int         length;
      46             : } LexemeKey;
      47             : 
      48             : static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
      49             : static Selectivity mcelem_tsquery_selec(TSQuery query,
      50             :                                         Datum *mcelem, int nmcelem,
      51             :                                         float4 *numbers, int nnumbers);
      52             : static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
      53             :                                      TextFreq *lookup, int length, float4 minfreq);
      54             : static int  compare_lexeme_textfreq(const void *e1, const void *e2);
      55             : 
      56             : #define tsquery_opr_selec_no_stats(query) \
      57             :     tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
      58             : 
      59             : 
      60             : /*
      61             :  *  tsmatchsel -- Selectivity of "@@"
      62             :  *
      63             :  * restriction selectivity function for tsvector @@ tsquery and
      64             :  * tsquery @@ tsvector
      65             :  */
      66             : Datum
      67        1026 : tsmatchsel(PG_FUNCTION_ARGS)
      68             : {
      69        1026 :     PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
      70             : 
      71             : #ifdef NOT_USED
      72             :     Oid         operator = PG_GETARG_OID(1);
      73             : #endif
      74        1026 :     List       *args = (List *) PG_GETARG_POINTER(2);
      75        1026 :     int         varRelid = PG_GETARG_INT32(3);
      76             :     VariableStatData vardata;
      77             :     Node       *other;
      78             :     bool        varonleft;
      79             :     Selectivity selec;
      80             : 
      81             :     /*
      82             :      * If expression is not variable = something or something = variable, then
      83             :      * punt and return a default estimate.
      84             :      */
      85        1026 :     if (!get_restriction_variable(root, args, varRelid,
      86             :                                   &vardata, &other, &varonleft))
      87           0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
      88             : 
      89             :     /*
      90             :      * Can't do anything useful if the something is not a constant, either.
      91             :      */
      92        1026 :     if (!IsA(other, Const))
      93             :     {
      94           0 :         ReleaseVariableStats(vardata);
      95           0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
      96             :     }
      97             : 
      98             :     /*
      99             :      * The "@@" operator is strict, so we can cope with NULL right away
     100             :      */
     101        1026 :     if (((Const *) other)->constisnull)
     102             :     {
     103           0 :         ReleaseVariableStats(vardata);
     104           0 :         PG_RETURN_FLOAT8(0.0);
     105             :     }
     106             : 
     107             :     /*
     108             :      * OK, there's a Var and a Const we're dealing with here.  We need the
     109             :      * Const to be a TSQuery, else we can't do anything useful.  We have to
     110             :      * check this because the Var might be the TSQuery not the TSVector.
     111             :      */
     112        1026 :     if (((Const *) other)->consttype == TSQUERYOID)
     113             :     {
     114             :         /* tsvector @@ tsquery or the other way around */
     115             :         Assert(vardata.vartype == TSVECTOROID);
     116             : 
     117        1026 :         selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
     118             :     }
     119             :     else
     120             :     {
     121             :         /* If we can't see the query structure, must punt */
     122           0 :         selec = DEFAULT_TS_MATCH_SEL;
     123             :     }
     124             : 
     125        1026 :     ReleaseVariableStats(vardata);
     126             : 
     127        1026 :     CLAMP_PROBABILITY(selec);
     128             : 
     129        1026 :     PG_RETURN_FLOAT8((float8) selec);
     130             : }
     131             : 
     132             : 
     133             : /*
     134             :  *  tsmatchjoinsel -- join selectivity of "@@"
     135             :  *
     136             :  * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
     137             :  */
     138             : Datum
     139           0 : tsmatchjoinsel(PG_FUNCTION_ARGS)
     140             : {
     141             :     /* for the moment we just punt */
     142           0 :     PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
     143             : }
     144             : 
     145             : 
     146             : /*
     147             :  * @@ selectivity for tsvector var vs tsquery constant
     148             :  */
     149             : static Selectivity
     150        1026 : tsquerysel(VariableStatData *vardata, Datum constval)
     151             : {
     152             :     Selectivity selec;
     153             :     TSQuery     query;
     154             : 
     155             :     /* The caller made sure the const is a TSQuery, so get it now */
     156        1026 :     query = DatumGetTSQuery(constval);
     157             : 
     158             :     /* Empty query matches nothing */
     159        1026 :     if (query->size == 0)
     160           0 :         return (Selectivity) 0.0;
     161             : 
     162        1026 :     if (HeapTupleIsValid(vardata->statsTuple))
     163             :     {
     164             :         Form_pg_statistic stats;
     165             :         AttStatsSlot sslot;
     166             : 
     167         990 :         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
     168             : 
     169             :         /* MCELEM will be an array of TEXT elements for a tsvector column */
     170         990 :         if (get_attstatsslot(&sslot, vardata->statsTuple,
     171             :                              STATISTIC_KIND_MCELEM, InvalidOid,
     172             :                              ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
     173             :         {
     174             :             /*
     175             :              * There is a most-common-elements slot for the tsvector Var, so
     176             :              * use that.
     177             :              */
     178         990 :             selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
     179             :                                          sslot.numbers, sslot.nnumbers);
     180         990 :             free_attstatsslot(&sslot);
     181             :         }
     182             :         else
     183             :         {
     184             :             /* No most-common-elements info, so do without */
     185           0 :             selec = tsquery_opr_selec_no_stats(query);
     186             :         }
     187             : 
     188             :         /*
     189             :          * MCE stats count only non-null rows, so adjust for null rows.
     190             :          */
     191         990 :         selec *= (1.0 - stats->stanullfrac);
     192             :     }
     193             :     else
     194             :     {
     195             :         /* No stats at all, so do without */
     196          36 :         selec = tsquery_opr_selec_no_stats(query);
     197             :         /* we assume no nulls here, so no stanullfrac correction */
     198             :     }
     199             : 
     200        1026 :     return selec;
     201             : }
     202             : 
     203             : /*
     204             :  * Extract data from the pg_statistic arrays into useful format.
     205             :  */
     206             : static Selectivity
     207         990 : mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
     208             :                      float4 *numbers, int nnumbers)
     209             : {
     210             :     float4      minfreq;
     211             :     TextFreq   *lookup;
     212             :     Selectivity selec;
     213             :     int         i;
     214             : 
     215             :     /*
     216             :      * There should be two more Numbers than Values, because the last two
     217             :      * cells are taken for minimal and maximal frequency.  Punt if not.
     218             :      *
     219             :      * (Note: the MCELEM statistics slot definition allows for a third extra
     220             :      * number containing the frequency of nulls, but we're not expecting that
     221             :      * to appear for a tsvector column.)
     222             :      */
     223         990 :     if (nnumbers != nmcelem + 2)
     224           0 :         return tsquery_opr_selec_no_stats(query);
     225             : 
     226             :     /*
     227             :      * Transpose the data into a single array so we can use bsearch().
     228             :      */
     229         990 :     lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
     230      990990 :     for (i = 0; i < nmcelem; i++)
     231             :     {
     232             :         /*
     233             :          * The text Datums came from an array, so it cannot be compressed or
     234             :          * stored out-of-line -- it's safe to use VARSIZE_ANY*.
     235             :          */
     236             :         Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
     237      990000 :         lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
     238      990000 :         lookup[i].frequency = numbers[i];
     239             :     }
     240             : 
     241             :     /*
     242             :      * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
     243             :      * the one before the last cell of the Numbers array. See ts_typanalyze.c
     244             :      */
     245         990 :     minfreq = numbers[nnumbers - 2];
     246             : 
     247         990 :     selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
     248             :                               nmcelem, minfreq);
     249             : 
     250         990 :     pfree(lookup);
     251             : 
     252         990 :     return selec;
     253             : }
     254             : 
     255             : /*
     256             :  * Traverse the tsquery in preorder, calculating selectivity as:
     257             :  *
     258             :  *   selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
     259             :  *
     260             :  *   selec(left_oper) + selec(right_oper) -
     261             :  *      selec(left_oper) * selec(right_oper) in OR nodes,
     262             :  *
     263             :  *   1 - select(oper) in NOT nodes
     264             :  *
     265             :  *   histogram-based estimation in prefix VAL nodes
     266             :  *
     267             :  *   freq[val] in exact VAL nodes, if the value is in MCELEM
     268             :  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
     269             :  *
     270             :  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
     271             :  * binary search for determining freq[MCELEM].
     272             :  *
     273             :  * If we don't have stats for the tsvector, we still use this logic,
     274             :  * except we use default estimates for VAL nodes.  This case is signaled
     275             :  * by lookup == NULL.
     276             :  */
     277             : static Selectivity
     278        3078 : tsquery_opr_selec(QueryItem *item, char *operand,
     279             :                   TextFreq *lookup, int length, float4 minfreq)
     280             : {
     281             :     Selectivity selec;
     282             : 
     283             :     /* since this function recurses, it could be driven to stack overflow */
     284        3078 :     check_stack_depth();
     285             : 
     286        3078 :     if (item->type == QI_VAL)
     287             :     {
     288        1842 :         QueryOperand *oper = (QueryOperand *) item;
     289             :         LexemeKey   key;
     290             : 
     291             :         /*
     292             :          * Prepare the key for bsearch().
     293             :          */
     294        1842 :         key.lexeme = operand + oper->distance;
     295        1842 :         key.length = oper->length;
     296             : 
     297        1842 :         if (oper->prefix)
     298             :         {
     299             :             /* Prefix match, ie the query item is lexeme:* */
     300             :             Selectivity matched,
     301             :                         allmces;
     302             :             int         i,
     303             :                         n_matched;
     304             : 
     305             :             /*
     306             :              * Our strategy is to scan through the MCELEM list and combine the
     307             :              * frequencies of the ones that match the prefix.  We then
     308             :              * extrapolate the fraction of matching MCELEMs to the remaining
     309             :              * rows, assuming that the MCELEMs are representative of the whole
     310             :              * lexeme population in this respect.  (Compare
     311             :              * histogram_selectivity().)  Note that these are most common
     312             :              * elements not most common values, so they're not mutually
     313             :              * exclusive.  We treat occurrences as independent events.
     314             :              *
     315             :              * This is only a good plan if we have a pretty fair number of
     316             :              * MCELEMs available; we set the threshold at 100.  If no stats or
     317             :              * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
     318             :              */
     319         102 :             if (lookup == NULL || length < 100)
     320          42 :                 return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
     321             : 
     322          72 :             matched = allmces = 0;
     323          72 :             n_matched = 0;
     324       72072 :             for (i = 0; i < length; i++)
     325             :             {
     326       72000 :                 TextFreq   *t = lookup + i;
     327       72000 :                 int         tlen = VARSIZE_ANY_EXHDR(t->element);
     328             : 
     329       72000 :                 if (tlen >= key.length &&
     330       72000 :                     strncmp(key.lexeme, VARDATA_ANY(t->element),
     331       72000 :                             key.length) == 0)
     332             :                 {
     333        2592 :                     matched += t->frequency - matched * t->frequency;
     334        2592 :                     n_matched++;
     335             :                 }
     336       72000 :                 allmces += t->frequency - allmces * t->frequency;
     337             :             }
     338             : 
     339             :             /* Clamp to ensure sanity in the face of roundoff error */
     340          72 :             CLAMP_PROBABILITY(matched);
     341          72 :             CLAMP_PROBABILITY(allmces);
     342             : 
     343          72 :             selec = matched + (1.0 - allmces) * ((double) n_matched / length);
     344             : 
     345             :             /*
     346             :              * In any case, never believe that a prefix match has selectivity
     347             :              * less than we would assign for a non-MCELEM lexeme.  This
     348             :              * preserves the property that "word:*" should be estimated to
     349             :              * match at least as many rows as "word" would be.
     350             :              */
     351          72 :             selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
     352             :         }
     353             :         else
     354             :         {
     355             :             /* Regular exact lexeme match */
     356             :             TextFreq   *searchres;
     357             : 
     358             :             /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
     359        1740 :             if (lookup == NULL)
     360          12 :                 return (Selectivity) DEFAULT_TS_MATCH_SEL;
     361             : 
     362        1728 :             searchres = (TextFreq *) bsearch(&key, lookup, length,
     363             :                                              sizeof(TextFreq),
     364             :                                              compare_lexeme_textfreq);
     365             : 
     366        1728 :             if (searchres)
     367             :             {
     368             :                 /*
     369             :                  * The element is in MCELEM.  Return precise selectivity (or
     370             :                  * at least as precise as ANALYZE could find out).
     371             :                  */
     372        1608 :                 selec = searchres->frequency;
     373             :             }
     374             :             else
     375             :             {
     376             :                 /*
     377             :                  * The element is not in MCELEM.  Punt, but assume that the
     378             :                  * selectivity cannot be more than minfreq / 2.
     379             :                  */
     380         120 :                 selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
     381             :             }
     382             :         }
     383             :     }
     384             :     else
     385             :     {
     386             :         /* Current TSQuery node is an operator */
     387             :         Selectivity s1,
     388             :                     s2;
     389             : 
     390        1236 :         switch (item->qoperator.oper)
     391             :         {
     392         420 :             case OP_NOT:
     393         420 :                 selec = 1.0 - tsquery_opr_selec(item + 1, operand,
     394             :                                                 lookup, length, minfreq);
     395         420 :                 break;
     396             : 
     397         570 :             case OP_PHRASE:
     398             :             case OP_AND:
     399         570 :                 s1 = tsquery_opr_selec(item + 1, operand,
     400             :                                        lookup, length, minfreq);
     401         570 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
     402             :                                        lookup, length, minfreq);
     403         570 :                 selec = s1 * s2;
     404         570 :                 break;
     405             : 
     406         246 :             case OP_OR:
     407         246 :                 s1 = tsquery_opr_selec(item + 1, operand,
     408             :                                        lookup, length, minfreq);
     409         246 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
     410             :                                        lookup, length, minfreq);
     411         246 :                 selec = s1 + s2 - s1 * s2;
     412         246 :                 break;
     413             : 
     414           0 :             default:
     415           0 :                 elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
     416             :                 selec = 0;      /* keep compiler quiet */
     417             :                 break;
     418             :         }
     419             :     }
     420             : 
     421             :     /* Clamp intermediate results to stay sane despite roundoff error */
     422        3036 :     CLAMP_PROBABILITY(selec);
     423             : 
     424        3036 :     return selec;
     425             : }
     426             : 
     427             : /*
     428             :  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
     429             :  * and a TextFreq. Use length, then byte-for-byte comparison, because that's
     430             :  * how ANALYZE code sorted data before storing it in a statistic tuple.
     431             :  * See ts_typanalyze.c for details.
     432             :  */
     433             : static int
     434       13344 : compare_lexeme_textfreq(const void *e1, const void *e2)
     435             : {
     436       13344 :     const LexemeKey *key = (const LexemeKey *) e1;
     437       13344 :     const TextFreq *t = (const TextFreq *) e2;
     438             :     int         len1,
     439             :                 len2;
     440             : 
     441       13344 :     len1 = key->length;
     442       13344 :     len2 = VARSIZE_ANY_EXHDR(t->element);
     443             : 
     444             :     /* Compare lengths first, possibly avoiding a strncmp call */
     445       13344 :     if (len1 > len2)
     446        1080 :         return 1;
     447       12264 :     else if (len1 < len2)
     448           0 :         return -1;
     449             : 
     450             :     /* Fall back on byte-for-byte comparison */
     451       12264 :     return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
     452             : }

Generated by: LCOV version 1.14