LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtutils.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13beta1 Lines: 580 739 78.5 %
Date: 2020-06-01 08:06:25 Functions: 30 32 93.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtutils.c
       4             :  *    Utility code for Postgres btree implementation.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtutils.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include <time.h>
      19             : 
      20             : #include "access/nbtree.h"
      21             : #include "access/reloptions.h"
      22             : #include "access/relscan.h"
      23             : #include "catalog/catalog.h"
      24             : #include "commands/progress.h"
      25             : #include "lib/qunique.h"
      26             : #include "miscadmin.h"
      27             : #include "utils/array.h"
      28             : #include "utils/datum.h"
      29             : #include "utils/lsyscache.h"
      30             : #include "utils/memutils.h"
      31             : #include "utils/rel.h"
      32             : 
      33             : 
      34             : typedef struct BTSortArrayContext
      35             : {
      36             :     FmgrInfo    flinfo;
      37             :     Oid         collation;
      38             :     bool        reverse;
      39             : } BTSortArrayContext;
      40             : 
      41             : static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
      42             :                                       StrategyNumber strat,
      43             :                                       Datum *elems, int nelems);
      44             : static int  _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
      45             :                                     bool reverse,
      46             :                                     Datum *elems, int nelems);
      47             : static int  _bt_compare_array_elements(const void *a, const void *b, void *arg);
      48             : static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
      49             :                                      ScanKey leftarg, ScanKey rightarg,
      50             :                                      bool *result);
      51             : static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
      52             : static void _bt_mark_scankey_required(ScanKey skey);
      53             : static bool _bt_check_rowcompare(ScanKey skey,
      54             :                                  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
      55             :                                  ScanDirection dir, bool *continuescan);
      56             : static int  _bt_keep_natts(Relation rel, IndexTuple lastleft,
      57             :                            IndexTuple firstright, BTScanInsert itup_key);
      58             : 
      59             : 
      60             : /*
      61             :  * _bt_mkscankey
      62             :  *      Build an insertion scan key that contains comparison data from itup
      63             :  *      as well as comparator routines appropriate to the key datatypes.
      64             :  *
      65             :  *      When itup is a non-pivot tuple, the returned insertion scan key is
      66             :  *      suitable for finding a place for it to go on the leaf level.  Pivot
      67             :  *      tuples can be used to re-find leaf page with matching high key, but
      68             :  *      then caller needs to set scan key's pivotsearch field to true.  This
      69             :  *      allows caller to search for a leaf page with a matching high key,
      70             :  *      which is usually to the left of the first leaf page a non-pivot match
      71             :  *      might appear on.
      72             :  *
      73             :  *      The result is intended for use with _bt_compare() and _bt_truncate().
      74             :  *      Callers that don't need to fill out the insertion scankey arguments
      75             :  *      (e.g. they use an ad-hoc comparison routine, or only need a scankey
      76             :  *      for _bt_truncate()) can pass a NULL index tuple.  The scankey will
      77             :  *      be initialized as if an "all truncated" pivot tuple was passed
      78             :  *      instead.
      79             :  *
      80             :  *      Note that we may occasionally have to share lock the metapage to
      81             :  *      determine whether or not the keys in the index are expected to be
      82             :  *      unique (i.e. if this is a "heapkeyspace" index).  We assume a
      83             :  *      heapkeyspace index when caller passes a NULL tuple, allowing index
      84             :  *      build callers to avoid accessing the non-existent metapage.  We
      85             :  *      also assume that the index is _not_ allequalimage when a NULL tuple
      86             :  *      is passed; CREATE INDEX callers call _bt_allequalimage() to set the
      87             :  *      field themselves.
      88             :  */
      89             : BTScanInsert
      90    13714366 : _bt_mkscankey(Relation rel, IndexTuple itup)
      91             : {
      92             :     BTScanInsert key;
      93             :     ScanKey     skey;
      94             :     TupleDesc   itupdesc;
      95             :     int         indnkeyatts;
      96             :     int16      *indoption;
      97             :     int         tupnatts;
      98             :     int         i;
      99             : 
     100    13714366 :     itupdesc = RelationGetDescr(rel);
     101    13714366 :     indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     102    13714366 :     indoption = rel->rd_indoption;
     103    13714366 :     tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
     104             : 
     105             :     Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
     106             : 
     107             :     /*
     108             :      * We'll execute search using scan key constructed on key columns.
     109             :      * Truncated attributes and non-key attributes are omitted from the final
     110             :      * scan key.
     111             :      */
     112    13714366 :     key = palloc(offsetof(BTScanInsertData, scankeys) +
     113    13714366 :                  sizeof(ScanKeyData) * indnkeyatts);
     114    13714366 :     if (itup)
     115    13492970 :         _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
     116             :     else
     117             :     {
     118             :         /* Utility statement callers can set these fields themselves */
     119      221396 :         key->heapkeyspace = true;
     120      221396 :         key->allequalimage = false;
     121             :     }
     122    13714366 :     key->anynullkeys = false;    /* initial assumption */
     123    13714366 :     key->nextkey = false;
     124    13714366 :     key->pivotsearch = false;
     125    13714366 :     key->keysz = Min(indnkeyatts, tupnatts);
     126    13714366 :     key->scantid = key->heapkeyspace && itup ?
     127    27428732 :         BTreeTupleGetHeapTID(itup) : NULL;
     128    13714366 :     skey = key->scankeys;
     129    43867620 :     for (i = 0; i < indnkeyatts; i++)
     130             :     {
     131             :         FmgrInfo   *procinfo;
     132             :         Datum       arg;
     133             :         bool        null;
     134             :         int         flags;
     135             : 
     136             :         /*
     137             :          * We can use the cached (default) support procs since no cross-type
     138             :          * comparison can be needed.
     139             :          */
     140    30153254 :         procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
     141             : 
     142             :         /*
     143             :          * Key arguments built from truncated attributes (or when caller
     144             :          * provides no tuple) are defensively represented as NULL values. They
     145             :          * should never be used.
     146             :          */
     147    30153254 :         if (i < tupnatts)
     148    29765322 :             arg = index_getattr(itup, i + 1, itupdesc, &null);
     149             :         else
     150             :         {
     151      387932 :             arg = (Datum) 0;
     152      387932 :             null = true;
     153             :         }
     154    30153254 :         flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
     155    60306508 :         ScanKeyEntryInitializeWithInfo(&skey[i],
     156             :                                        flags,
     157    30153254 :                                        (AttrNumber) (i + 1),
     158             :                                        InvalidStrategy,
     159             :                                        InvalidOid,
     160    30153254 :                                        rel->rd_indcollation[i],
     161             :                                        procinfo,
     162             :                                        arg);
     163             :         /* Record if any key attribute is NULL (or truncated) */
     164    30153254 :         if (null)
     165      388032 :             key->anynullkeys = true;
     166             :     }
     167             : 
     168    13714366 :     return key;
     169             : }
     170             : 
     171             : /*
     172             :  * free a retracement stack made by _bt_search.
     173             :  */
     174             : void
     175    21461334 : _bt_freestack(BTStack stack)
     176             : {
     177             :     BTStack     ostack;
     178             : 
     179    40595340 :     while (stack != NULL)
     180             :     {
     181    19134006 :         ostack = stack;
     182    19134006 :         stack = stack->bts_parent;
     183    19134006 :         pfree(ostack);
     184             :     }
     185    21461334 : }
     186             : 
     187             : 
     188             : /*
     189             :  *  _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
     190             :  *
     191             :  * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
     192             :  * set up BTArrayKeyInfo info for each one that is an equality-type key.
     193             :  * Prepare modified scan keys in so->arrayKeyData, which will hold the current
     194             :  * array elements during each primitive indexscan operation.  For inequality
     195             :  * array keys, it's sufficient to find the extreme element value and replace
     196             :  * the whole array with that scalar value.
     197             :  *
     198             :  * Note: the reason we need so->arrayKeyData, rather than just scribbling
     199             :  * on scan->keyData, is that callers are permitted to call btrescan without
     200             :  * supplying a new set of scankey data.
     201             :  */
     202             : void
     203    11193068 : _bt_preprocess_array_keys(IndexScanDesc scan)
     204             : {
     205    11193068 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     206    11193068 :     int         numberOfKeys = scan->numberOfKeys;
     207    11193068 :     int16      *indoption = scan->indexRelation->rd_indoption;
     208             :     int         numArrayKeys;
     209             :     ScanKey     cur;
     210             :     int         i;
     211             :     MemoryContext oldContext;
     212             : 
     213             :     /* Quick check to see if there are any array keys */
     214    11193068 :     numArrayKeys = 0;
     215    30414128 :     for (i = 0; i < numberOfKeys; i++)
     216             :     {
     217    19221060 :         cur = &scan->keyData[i];
     218    19221060 :         if (cur->sk_flags & SK_SEARCHARRAY)
     219             :         {
     220         448 :             numArrayKeys++;
     221             :             Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
     222             :             /* If any arrays are null as a whole, we can quit right now. */
     223         448 :             if (cur->sk_flags & SK_ISNULL)
     224             :             {
     225           0 :                 so->numArrayKeys = -1;
     226           0 :                 so->arrayKeyData = NULL;
     227           0 :                 return;
     228             :             }
     229             :         }
     230             :     }
     231             : 
     232             :     /* Quit if nothing to do. */
     233    11193068 :     if (numArrayKeys == 0)
     234             :     {
     235    11192672 :         so->numArrayKeys = 0;
     236    11192672 :         so->arrayKeyData = NULL;
     237    11192672 :         return;
     238             :     }
     239             : 
     240             :     /*
     241             :      * Make a scan-lifespan context to hold array-associated data, or reset it
     242             :      * if we already have one from a previous rescan cycle.
     243             :      */
     244         396 :     if (so->arrayContext == NULL)
     245         396 :         so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
     246             :                                                  "BTree array context",
     247             :                                                  ALLOCSET_SMALL_SIZES);
     248             :     else
     249           0 :         MemoryContextReset(so->arrayContext);
     250             : 
     251         396 :     oldContext = MemoryContextSwitchTo(so->arrayContext);
     252             : 
     253             :     /* Create modifiable copy of scan->keyData in the workspace context */
     254         396 :     so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
     255         792 :     memcpy(so->arrayKeyData,
     256         396 :            scan->keyData,
     257         396 :            scan->numberOfKeys * sizeof(ScanKeyData));
     258             : 
     259             :     /* Allocate space for per-array data in the workspace context */
     260         396 :     so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
     261             : 
     262             :     /* Now process each array key */
     263         396 :     numArrayKeys = 0;
     264         960 :     for (i = 0; i < numberOfKeys; i++)
     265             :     {
     266             :         ArrayType  *arrayval;
     267             :         int16       elmlen;
     268             :         bool        elmbyval;
     269             :         char        elmalign;
     270             :         int         num_elems;
     271             :         Datum      *elem_values;
     272             :         bool       *elem_nulls;
     273             :         int         num_nonnulls;
     274             :         int         j;
     275             : 
     276         564 :         cur = &so->arrayKeyData[i];
     277         564 :         if (!(cur->sk_flags & SK_SEARCHARRAY))
     278         120 :             continue;
     279             : 
     280             :         /*
     281             :          * First, deconstruct the array into elements.  Anything allocated
     282             :          * here (including a possibly detoasted array value) is in the
     283             :          * workspace context.
     284             :          */
     285         448 :         arrayval = DatumGetArrayTypeP(cur->sk_argument);
     286             :         /* We could cache this data, but not clear it's worth it */
     287         448 :         get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
     288             :                              &elmlen, &elmbyval, &elmalign);
     289         448 :         deconstruct_array(arrayval,
     290             :                           ARR_ELEMTYPE(arrayval),
     291             :                           elmlen, elmbyval, elmalign,
     292             :                           &elem_values, &elem_nulls, &num_elems);
     293             : 
     294             :         /*
     295             :          * Compress out any null elements.  We can ignore them since we assume
     296             :          * all btree operators are strict.
     297             :          */
     298         448 :         num_nonnulls = 0;
     299        2372 :         for (j = 0; j < num_elems; j++)
     300             :         {
     301        1924 :             if (!elem_nulls[j])
     302        1924 :                 elem_values[num_nonnulls++] = elem_values[j];
     303             :         }
     304             : 
     305             :         /* We could pfree(elem_nulls) now, but not worth the cycles */
     306             : 
     307             :         /* If there's no non-nulls, the scan qual is unsatisfiable */
     308         448 :         if (num_nonnulls == 0)
     309             :         {
     310           0 :             numArrayKeys = -1;
     311           0 :             break;
     312             :         }
     313             : 
     314             :         /*
     315             :          * If the comparison operator is not equality, then the array qual
     316             :          * degenerates to a simple comparison against the smallest or largest
     317             :          * non-null array element, as appropriate.
     318             :          */
     319         448 :         switch (cur->sk_strategy)
     320             :         {
     321           0 :             case BTLessStrategyNumber:
     322             :             case BTLessEqualStrategyNumber:
     323           0 :                 cur->sk_argument =
     324           0 :                     _bt_find_extreme_element(scan, cur,
     325             :                                              BTGreaterStrategyNumber,
     326             :                                              elem_values, num_nonnulls);
     327           0 :                 continue;
     328         444 :             case BTEqualStrategyNumber:
     329             :                 /* proceed with rest of loop */
     330         444 :                 break;
     331           4 :             case BTGreaterEqualStrategyNumber:
     332             :             case BTGreaterStrategyNumber:
     333           4 :                 cur->sk_argument =
     334           4 :                     _bt_find_extreme_element(scan, cur,
     335             :                                              BTLessStrategyNumber,
     336             :                                              elem_values, num_nonnulls);
     337           4 :                 continue;
     338           0 :             default:
     339           0 :                 elog(ERROR, "unrecognized StrategyNumber: %d",
     340             :                      (int) cur->sk_strategy);
     341             :                 break;
     342             :         }
     343             : 
     344             :         /*
     345             :          * Sort the non-null elements and eliminate any duplicates.  We must
     346             :          * sort in the same ordering used by the index column, so that the
     347             :          * successive primitive indexscans produce data in index order.
     348             :          */
     349        1332 :         num_elems = _bt_sort_array_elements(scan, cur,
     350         444 :                                             (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
     351             :                                             elem_values, num_nonnulls);
     352             : 
     353             :         /*
     354             :          * And set up the BTArrayKeyInfo data.
     355             :          */
     356         444 :         so->arrayKeys[numArrayKeys].scan_key = i;
     357         444 :         so->arrayKeys[numArrayKeys].num_elems = num_elems;
     358         444 :         so->arrayKeys[numArrayKeys].elem_values = elem_values;
     359         444 :         numArrayKeys++;
     360             :     }
     361             : 
     362         396 :     so->numArrayKeys = numArrayKeys;
     363             : 
     364         396 :     MemoryContextSwitchTo(oldContext);
     365             : }
     366             : 
     367             : /*
     368             :  * _bt_find_extreme_element() -- get least or greatest array element
     369             :  *
     370             :  * scan and skey identify the index column, whose opfamily determines the
     371             :  * comparison semantics.  strat should be BTLessStrategyNumber to get the
     372             :  * least element, or BTGreaterStrategyNumber to get the greatest.
     373             :  */
     374             : static Datum
     375           4 : _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
     376             :                          StrategyNumber strat,
     377             :                          Datum *elems, int nelems)
     378             : {
     379           4 :     Relation    rel = scan->indexRelation;
     380             :     Oid         elemtype,
     381             :                 cmp_op;
     382             :     RegProcedure cmp_proc;
     383             :     FmgrInfo    flinfo;
     384             :     Datum       result;
     385             :     int         i;
     386             : 
     387             :     /*
     388             :      * Determine the nominal datatype of the array elements.  We have to
     389             :      * support the convention that sk_subtype == InvalidOid means the opclass
     390             :      * input type; this is a hack to simplify life for ScanKeyInit().
     391             :      */
     392           4 :     elemtype = skey->sk_subtype;
     393           4 :     if (elemtype == InvalidOid)
     394           0 :         elemtype = rel->rd_opcintype[skey->sk_attno - 1];
     395             : 
     396             :     /*
     397             :      * Look up the appropriate comparison operator in the opfamily.
     398             :      *
     399             :      * Note: it's possible that this would fail, if the opfamily is
     400             :      * incomplete, but it seems quite unlikely that an opfamily would omit
     401             :      * non-cross-type comparison operators for any datatype that it supports
     402             :      * at all.
     403             :      */
     404           4 :     cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
     405             :                                  elemtype,
     406             :                                  elemtype,
     407             :                                  strat);
     408           4 :     if (!OidIsValid(cmp_op))
     409           0 :         elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
     410             :              strat, elemtype, elemtype,
     411             :              rel->rd_opfamily[skey->sk_attno - 1]);
     412           4 :     cmp_proc = get_opcode(cmp_op);
     413           4 :     if (!RegProcedureIsValid(cmp_proc))
     414           0 :         elog(ERROR, "missing oprcode for operator %u", cmp_op);
     415             : 
     416           4 :     fmgr_info(cmp_proc, &flinfo);
     417             : 
     418             :     Assert(nelems > 0);
     419           4 :     result = elems[0];
     420           8 :     for (i = 1; i < nelems; i++)
     421             :     {
     422           4 :         if (DatumGetBool(FunctionCall2Coll(&flinfo,
     423             :                                            skey->sk_collation,
     424             :                                            elems[i],
     425             :                                            result)))
     426           0 :             result = elems[i];
     427             :     }
     428             : 
     429           4 :     return result;
     430             : }
     431             : 
     432             : /*
     433             :  * _bt_sort_array_elements() -- sort and de-dup array elements
     434             :  *
     435             :  * The array elements are sorted in-place, and the new number of elements
     436             :  * after duplicate removal is returned.
     437             :  *
     438             :  * scan and skey identify the index column, whose opfamily determines the
     439             :  * comparison semantics.  If reverse is true, we sort in descending order.
     440             :  */
     441             : static int
     442         444 : _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
     443             :                         bool reverse,
     444             :                         Datum *elems, int nelems)
     445             : {
     446         444 :     Relation    rel = scan->indexRelation;
     447             :     Oid         elemtype;
     448             :     RegProcedure cmp_proc;
     449             :     BTSortArrayContext cxt;
     450             : 
     451         444 :     if (nelems <= 1)
     452           8 :         return nelems;          /* no work to do */
     453             : 
     454             :     /*
     455             :      * Determine the nominal datatype of the array elements.  We have to
     456             :      * support the convention that sk_subtype == InvalidOid means the opclass
     457             :      * input type; this is a hack to simplify life for ScanKeyInit().
     458             :      */
     459         436 :     elemtype = skey->sk_subtype;
     460         436 :     if (elemtype == InvalidOid)
     461           0 :         elemtype = rel->rd_opcintype[skey->sk_attno - 1];
     462             : 
     463             :     /*
     464             :      * Look up the appropriate comparison function in the opfamily.
     465             :      *
     466             :      * Note: it's possible that this would fail, if the opfamily is
     467             :      * incomplete, but it seems quite unlikely that an opfamily would omit
     468             :      * non-cross-type support functions for any datatype that it supports at
     469             :      * all.
     470             :      */
     471         436 :     cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
     472             :                                  elemtype,
     473             :                                  elemtype,
     474             :                                  BTORDER_PROC);
     475         436 :     if (!RegProcedureIsValid(cmp_proc))
     476           0 :         elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
     477             :              BTORDER_PROC, elemtype, elemtype,
     478             :              rel->rd_opfamily[skey->sk_attno - 1]);
     479             : 
     480             :     /* Sort the array elements */
     481         436 :     fmgr_info(cmp_proc, &cxt.flinfo);
     482         436 :     cxt.collation = skey->sk_collation;
     483         436 :     cxt.reverse = reverse;
     484         436 :     qsort_arg((void *) elems, nelems, sizeof(Datum),
     485             :               _bt_compare_array_elements, (void *) &cxt);
     486             : 
     487             :     /* Now scan the sorted elements and remove duplicates */
     488         436 :     return qunique_arg(elems, nelems, sizeof(Datum),
     489             :                        _bt_compare_array_elements, &cxt);
     490             : }
     491             : 
     492             : /*
     493             :  * qsort_arg comparator for sorting array elements
     494             :  */
     495             : static int
     496        4156 : _bt_compare_array_elements(const void *a, const void *b, void *arg)
     497             : {
     498        4156 :     Datum       da = *((const Datum *) a);
     499        4156 :     Datum       db = *((const Datum *) b);
     500        4156 :     BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
     501             :     int32       compare;
     502             : 
     503        4156 :     compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo,
     504             :                                               cxt->collation,
     505             :                                               da, db));
     506        4156 :     if (cxt->reverse)
     507           0 :         INVERT_COMPARE_RESULT(compare);
     508        4156 :     return compare;
     509             : }
     510             : 
     511             : /*
     512             :  * _bt_start_array_keys() -- Initialize array keys at start of a scan
     513             :  *
     514             :  * Set up the cur_elem counters and fill in the first sk_argument value for
     515             :  * each array scankey.  We can't do this until we know the scan direction.
     516             :  */
     517             : void
     518         390 : _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
     519             : {
     520         390 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     521             :     int         i;
     522             : 
     523         832 :     for (i = 0; i < so->numArrayKeys; i++)
     524             :     {
     525         442 :         BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
     526         442 :         ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
     527             : 
     528             :         Assert(curArrayKey->num_elems > 0);
     529         442 :         if (ScanDirectionIsBackward(dir))
     530           0 :             curArrayKey->cur_elem = curArrayKey->num_elems - 1;
     531             :         else
     532         442 :             curArrayKey->cur_elem = 0;
     533         442 :         skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
     534             :     }
     535         390 : }
     536             : 
     537             : /*
     538             :  * _bt_advance_array_keys() -- Advance to next set of array elements
     539             :  *
     540             :  * Returns true if there is another set of values to consider, false if not.
     541             :  * On true result, the scankeys are initialized with the next set of values.
     542             :  */
     543             : bool
     544        2352 : _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
     545             : {
     546        2352 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     547        2352 :     bool        found = false;
     548             :     int         i;
     549             : 
     550             :     /*
     551             :      * We must advance the last array key most quickly, since it will
     552             :      * correspond to the lowest-order index column among the available
     553             :      * qualifications. This is necessary to ensure correct ordering of output
     554             :      * when there are multiple array keys.
     555             :      */
     556        3202 :     for (i = so->numArrayKeys - 1; i >= 0; i--)
     557             :     {
     558        2816 :         BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
     559        2816 :         ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
     560        2816 :         int         cur_elem = curArrayKey->cur_elem;
     561        2816 :         int         num_elems = curArrayKey->num_elems;
     562             : 
     563        2816 :         if (ScanDirectionIsBackward(dir))
     564             :         {
     565           0 :             if (--cur_elem < 0)
     566             :             {
     567           0 :                 cur_elem = num_elems - 1;
     568           0 :                 found = false;  /* need to advance next array key */
     569             :             }
     570             :             else
     571           0 :                 found = true;
     572             :         }
     573             :         else
     574             :         {
     575        2816 :             if (++cur_elem >= num_elems)
     576             :             {
     577         850 :                 cur_elem = 0;
     578         850 :                 found = false;  /* need to advance next array key */
     579             :             }
     580             :             else
     581        1966 :                 found = true;
     582             :         }
     583             : 
     584        2816 :         curArrayKey->cur_elem = cur_elem;
     585        2816 :         skey->sk_argument = curArrayKey->elem_values[cur_elem];
     586        2816 :         if (found)
     587        1966 :             break;
     588             :     }
     589             : 
     590             :     /* advance parallel scan */
     591        2352 :     if (scan->parallel_scan != NULL)
     592           0 :         _bt_parallel_advance_array_keys(scan);
     593             : 
     594        2352 :     return found;
     595             : }
     596             : 
     597             : /*
     598             :  * _bt_mark_array_keys() -- Handle array keys during btmarkpos
     599             :  *
     600             :  * Save the current state of the array keys as the "mark" position.
     601             :  */
     602             : void
     603           4 : _bt_mark_array_keys(IndexScanDesc scan)
     604             : {
     605           4 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     606             :     int         i;
     607             : 
     608           8 :     for (i = 0; i < so->numArrayKeys; i++)
     609             :     {
     610           4 :         BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
     611             : 
     612           4 :         curArrayKey->mark_elem = curArrayKey->cur_elem;
     613             :     }
     614           4 : }
     615             : 
     616             : /*
     617             :  * _bt_restore_array_keys() -- Handle array keys during btrestrpos
     618             :  *
     619             :  * Restore the array keys to where they were when the mark was set.
     620             :  */
     621             : void
     622           4 : _bt_restore_array_keys(IndexScanDesc scan)
     623             : {
     624           4 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     625           4 :     bool        changed = false;
     626             :     int         i;
     627             : 
     628             :     /* Restore each array key to its position when the mark was set */
     629           8 :     for (i = 0; i < so->numArrayKeys; i++)
     630             :     {
     631           4 :         BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
     632           4 :         ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
     633           4 :         int         mark_elem = curArrayKey->mark_elem;
     634             : 
     635           4 :         if (curArrayKey->cur_elem != mark_elem)
     636             :         {
     637           0 :             curArrayKey->cur_elem = mark_elem;
     638           0 :             skey->sk_argument = curArrayKey->elem_values[mark_elem];
     639           0 :             changed = true;
     640             :         }
     641             :     }
     642             : 
     643             :     /*
     644             :      * If we changed any keys, we must redo _bt_preprocess_keys.  That might
     645             :      * sound like overkill, but in cases with multiple keys per index column
     646             :      * it seems necessary to do the full set of pushups.
     647             :      */
     648           4 :     if (changed)
     649             :     {
     650           0 :         _bt_preprocess_keys(scan);
     651             :         /* The mark should have been set on a consistent set of keys... */
     652             :         Assert(so->qual_ok);
     653             :     }
     654           4 : }
     655             : 
     656             : 
     657             : /*
     658             :  *  _bt_preprocess_keys() -- Preprocess scan keys
     659             :  *
     660             :  * The given search-type keys (in scan->keyData[] or so->arrayKeyData[])
     661             :  * are copied to so->keyData[] with possible transformation.
     662             :  * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
     663             :  * the number of output keys (possibly less, never greater).
     664             :  *
     665             :  * The output keys are marked with additional sk_flags bits beyond the
     666             :  * system-standard bits supplied by the caller.  The DESC and NULLS_FIRST
     667             :  * indoption bits for the relevant index attribute are copied into the flags.
     668             :  * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
     669             :  * so that the index sorts in the desired direction.
     670             :  *
     671             :  * One key purpose of this routine is to discover which scan keys must be
     672             :  * satisfied to continue the scan.  It also attempts to eliminate redundant
     673             :  * keys and detect contradictory keys.  (If the index opfamily provides
     674             :  * incomplete sets of cross-type operators, we may fail to detect redundant
     675             :  * or contradictory keys, but we can survive that.)
     676             :  *
     677             :  * The output keys must be sorted by index attribute.  Presently we expect
     678             :  * (but verify) that the input keys are already so sorted --- this is done
     679             :  * by match_clauses_to_index() in indxpath.c.  Some reordering of the keys
     680             :  * within each attribute may be done as a byproduct of the processing here,
     681             :  * but no other code depends on that.
     682             :  *
     683             :  * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
     684             :  * if they must be satisfied in order to continue the scan forward or backward
     685             :  * respectively.  _bt_checkkeys uses these flags.  For example, if the quals
     686             :  * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
     687             :  * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
     688             :  * But once we reach tuples like (1,4,z) we can stop scanning because no
     689             :  * later tuples could match.  This is reflected by marking the x and y keys,
     690             :  * but not the z key, with SK_BT_REQFWD.  In general, the keys for leading
     691             :  * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
     692             :  * For the first attribute without an "=" key, any "<" and "<=" keys are
     693             :  * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
     694             :  * This can be seen to be correct by considering the above example.  Note
     695             :  * in particular that if there are no keys for a given attribute, the keys for
     696             :  * subsequent attributes can never be required; for instance "WHERE y = 4"
     697             :  * requires a full-index scan.
     698             :  *
     699             :  * If possible, redundant keys are eliminated: we keep only the tightest
     700             :  * >/>= bound and the tightest </<= bound, and if there's an = key then
     701             :  * that's the only one returned.  (So, we return either a single = key,
     702             :  * or one or two boundary-condition keys for each attr.)  However, if we
     703             :  * cannot compare two keys for lack of a suitable cross-type operator,
     704             :  * we cannot eliminate either.  If there are two such keys of the same
     705             :  * operator strategy, the second one is just pushed into the output array
     706             :  * without further processing here.  We may also emit both >/>= or both
     707             :  * </<= keys if we can't compare them.  The logic about required keys still
     708             :  * works if we don't eliminate redundant keys.
     709             :  *
     710             :  * Note that one reason we need direction-sensitive required-key flags is
     711             :  * precisely that we may not be able to eliminate redundant keys.  Suppose
     712             :  * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
     713             :  * which key is more restrictive for lack of a suitable cross-type operator.
     714             :  * _bt_first will arbitrarily pick one of the keys to do the initial
     715             :  * positioning with.  If it picks x > 4, then the x > 10 condition will fail
     716             :  * until we reach index entries > 10; but we can't stop the scan just because
     717             :  * x > 10 is failing.  On the other hand, if we are scanning backwards, then
     718             :  * failure of either key is indeed enough to stop the scan.  (In general, when
     719             :  * inequality keys are present, the initial-positioning code only promises to
     720             :  * position before the first possible match, not exactly at the first match,
     721             :  * for a forward scan; or after the last match for a backward scan.)
     722             :  *
     723             :  * As a byproduct of this work, we can detect contradictory quals such
     724             :  * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = false,
     725             :  * indicating the scan need not be run at all since no tuples can match.
     726             :  * (In this case we do not bother completing the output key array!)
     727             :  * Again, missing cross-type operators might cause us to fail to prove the
     728             :  * quals contradictory when they really are, but the scan will work correctly.
     729             :  *
     730             :  * Row comparison keys are currently also treated without any smarts:
     731             :  * we just transfer them into the preprocessed array without any
     732             :  * editorialization.  We can treat them the same as an ordinary inequality
     733             :  * comparison on the row's first index column, for the purposes of the logic
     734             :  * about required keys.
     735             :  *
     736             :  * Note: the reason we have to copy the preprocessed scan keys into private
     737             :  * storage is that we are modifying the array based on comparisons of the
     738             :  * key argument values, which could change on a rescan or after moving to
     739             :  * new elements of array keys.  Therefore we can't overwrite the source data.
     740             :  */
     741             : void
     742    11187880 : _bt_preprocess_keys(IndexScanDesc scan)
     743             : {
     744    11187880 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     745    11187880 :     int         numberOfKeys = scan->numberOfKeys;
     746    11187880 :     int16      *indoption = scan->indexRelation->rd_indoption;
     747             :     int         new_numberOfKeys;
     748             :     int         numberOfEqualCols;
     749             :     ScanKey     inkeys;
     750             :     ScanKey     outkeys;
     751             :     ScanKey     cur;
     752             :     ScanKey     xform[BTMaxStrategyNumber];
     753             :     bool        test_result;
     754             :     int         i,
     755             :                 j;
     756             :     AttrNumber  attno;
     757             : 
     758             :     /* initialize result variables */
     759    11187880 :     so->qual_ok = true;
     760    11187880 :     so->numberOfKeys = 0;
     761             : 
     762    11187880 :     if (numberOfKeys < 1)
     763     4096760 :         return;                 /* done if qual-less scan */
     764             : 
     765             :     /*
     766             :      * Read so->arrayKeyData if array keys are present, else scan->keyData
     767             :      */
     768    11182434 :     if (so->arrayKeyData != NULL)
     769        2360 :         inkeys = so->arrayKeyData;
     770             :     else
     771    11180074 :         inkeys = scan->keyData;
     772             : 
     773    11182434 :     outkeys = so->keyData;
     774    11182434 :     cur = &inkeys[0];
     775             :     /* we check that input keys are correctly ordered */
     776    11182434 :     if (cur->sk_attno < 1)
     777           0 :         elog(ERROR, "btree index keys must be ordered by attribute");
     778             : 
     779             :     /* We can short-circuit most of the work if there's just one key */
     780    11182434 :     if (numberOfKeys == 1)
     781             :     {
     782             :         /* Apply indoption to scankey (might change sk_strategy!) */
     783     4091298 :         if (!_bt_fix_scankey_strategy(cur, indoption))
     784         890 :             so->qual_ok = false;
     785     4091298 :         memcpy(outkeys, cur, sizeof(ScanKeyData));
     786     4091298 :         so->numberOfKeys = 1;
     787             :         /* We can mark the qual as required if it's for first index col */
     788     4091298 :         if (cur->sk_attno == 1)
     789     4091144 :             _bt_mark_scankey_required(outkeys);
     790     4091298 :         return;
     791             :     }
     792             : 
     793             :     /*
     794             :      * Otherwise, do the full set of pushups.
     795             :      */
     796     7091136 :     new_numberOfKeys = 0;
     797     7091136 :     numberOfEqualCols = 0;
     798             : 
     799             :     /*
     800             :      * Initialize for processing of keys for attr 1.
     801             :      *
     802             :      * xform[i] points to the currently best scan key of strategy type i+1; it
     803             :      * is NULL if we haven't yet found such a key for this attr.
     804             :      */
     805     7091136 :     attno = 1;
     806     7091136 :     memset(xform, 0, sizeof(xform));
     807             : 
     808             :     /*
     809             :      * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
     810             :      * handle after-last-key processing.  Actual exit from the loop is at the
     811             :      * "break" statement below.
     812             :      */
     813     7091136 :     for (i = 0;; cur++, i++)
     814             :     {
     815    22217628 :         if (i < numberOfKeys)
     816             :         {
     817             :             /* Apply indoption to scankey (might change sk_strategy!) */
     818    15126492 :             if (!_bt_fix_scankey_strategy(cur, indoption))
     819             :             {
     820             :                 /* NULL can't be matched, so give up */
     821           0 :                 so->qual_ok = false;
     822           0 :                 return;
     823             :             }
     824             :         }
     825             : 
     826             :         /*
     827             :          * If we are at the end of the keys for a particular attr, finish up
     828             :          * processing and emit the cleaned-up keys.
     829             :          */
     830    22217628 :         if (i == numberOfKeys || cur->sk_attno != attno)
     831             :         {
     832    15125780 :             int         priorNumberOfEqualCols = numberOfEqualCols;
     833             : 
     834             :             /* check input keys are correctly ordered */
     835    15125780 :             if (i < numberOfKeys && cur->sk_attno < attno)
     836           0 :                 elog(ERROR, "btree index keys must be ordered by attribute");
     837             : 
     838             :             /*
     839             :              * If = has been specified, all other keys can be eliminated as
     840             :              * redundant.  If we have a case like key = 1 AND key > 2, we can
     841             :              * set qual_ok to false and abandon further processing.
     842             :              *
     843             :              * We also have to deal with the case of "key IS NULL", which is
     844             :              * unsatisfiable in combination with any other index condition. By
     845             :              * the time we get here, that's been classified as an equality
     846             :              * check, and we've rejected any combination of it with a regular
     847             :              * equality condition; but not with other types of conditions.
     848             :              */
     849    15125780 :             if (xform[BTEqualStrategyNumber - 1])
     850             :             {
     851    14389006 :                 ScanKey     eq = xform[BTEqualStrategyNumber - 1];
     852             : 
     853    86333956 :                 for (j = BTMaxStrategyNumber; --j >= 0;)
     854             :                 {
     855    71944966 :                     ScanKey     chk = xform[j];
     856             : 
     857    71944966 :                     if (!chk || j == (BTEqualStrategyNumber - 1))
     858    71944898 :                         continue;
     859             : 
     860          68 :                     if (eq->sk_flags & SK_SEARCHNULL)
     861             :                     {
     862             :                         /* IS NULL is contradictory to anything else */
     863          16 :                         so->qual_ok = false;
     864          16 :                         return;
     865             :                     }
     866             : 
     867          52 :                     if (_bt_compare_scankey_args(scan, chk, eq, chk,
     868             :                                                  &test_result))
     869             :                     {
     870          52 :                         if (!test_result)
     871             :                         {
     872             :                             /* keys proven mutually contradictory */
     873           0 :                             so->qual_ok = false;
     874           0 :                             return;
     875             :                         }
     876             :                         /* else discard the redundant non-equality key */
     877          52 :                         xform[j] = NULL;
     878             :                     }
     879             :                     /* else, cannot determine redundancy, keep both keys */
     880             :                 }
     881             :                 /* track number of attrs for which we have "=" keys */
     882    14388990 :                 numberOfEqualCols++;
     883             :             }
     884             : 
     885             :             /* try to keep only one of <, <= */
     886    15125764 :             if (xform[BTLessStrategyNumber - 1]
     887         754 :                 && xform[BTLessEqualStrategyNumber - 1])
     888             :             {
     889           0 :                 ScanKey     lt = xform[BTLessStrategyNumber - 1];
     890           0 :                 ScanKey     le = xform[BTLessEqualStrategyNumber - 1];
     891             : 
     892           0 :                 if (_bt_compare_scankey_args(scan, le, lt, le,
     893             :                                              &test_result))
     894             :                 {
     895           0 :                     if (test_result)
     896           0 :                         xform[BTLessEqualStrategyNumber - 1] = NULL;
     897             :                     else
     898           0 :                         xform[BTLessStrategyNumber - 1] = NULL;
     899             :                 }
     900             :             }
     901             : 
     902             :             /* try to keep only one of >, >= */
     903    15125764 :             if (xform[BTGreaterStrategyNumber - 1]
     904      734050 :                 && xform[BTGreaterEqualStrategyNumber - 1])
     905             :             {
     906           0 :                 ScanKey     gt = xform[BTGreaterStrategyNumber - 1];
     907           0 :                 ScanKey     ge = xform[BTGreaterEqualStrategyNumber - 1];
     908             : 
     909           0 :                 if (_bt_compare_scankey_args(scan, ge, gt, ge,
     910             :                                              &test_result))
     911             :                 {
     912           0 :                     if (test_result)
     913           0 :                         xform[BTGreaterEqualStrategyNumber - 1] = NULL;
     914             :                     else
     915           0 :                         xform[BTGreaterStrategyNumber - 1] = NULL;
     916             :                 }
     917             :             }
     918             : 
     919             :             /*
     920             :              * Emit the cleaned-up keys into the outkeys[] array, and then
     921             :              * mark them if they are required.  They are required (possibly
     922             :              * only in one direction) if all attrs before this one had "=".
     923             :              */
     924    90754584 :             for (j = BTMaxStrategyNumber; --j >= 0;)
     925             :             {
     926    75628820 :                 if (xform[j])
     927             :                 {
     928    15126364 :                     ScanKey     outkey = &outkeys[new_numberOfKeys++];
     929             : 
     930    15126364 :                     memcpy(outkey, xform[j], sizeof(ScanKeyData));
     931    15126364 :                     if (priorNumberOfEqualCols == attno - 1)
     932    15125454 :                         _bt_mark_scankey_required(outkey);
     933             :                 }
     934             :             }
     935             : 
     936             :             /*
     937             :              * Exit loop here if done.
     938             :              */
     939    15125764 :             if (i == numberOfKeys)
     940     7091120 :                 break;
     941             : 
     942             :             /* Re-initialize for new attno */
     943     8034644 :             attno = cur->sk_attno;
     944     8034644 :             memset(xform, 0, sizeof(xform));
     945             :         }
     946             : 
     947             :         /* check strategy this key's operator corresponds to */
     948    15126492 :         j = cur->sk_strategy - 1;
     949             : 
     950             :         /* if row comparison, push it directly to the output array */
     951    15126492 :         if (cur->sk_flags & SK_ROW_HEADER)
     952             :         {
     953           0 :             ScanKey     outkey = &outkeys[new_numberOfKeys++];
     954             : 
     955           0 :             memcpy(outkey, cur, sizeof(ScanKeyData));
     956           0 :             if (numberOfEqualCols == attno - 1)
     957           0 :                 _bt_mark_scankey_required(outkey);
     958             : 
     959             :             /*
     960             :              * We don't support RowCompare using equality; such a qual would
     961             :              * mess up the numberOfEqualCols tracking.
     962             :              */
     963             :             Assert(j != (BTEqualStrategyNumber - 1));
     964           0 :             continue;
     965             :         }
     966             : 
     967             :         /* have we seen one of these before? */
     968    15126492 :         if (xform[j] == NULL)
     969             :         {
     970             :             /* nope, so remember this scankey */
     971    15126448 :             xform[j] = cur;
     972             :         }
     973             :         else
     974             :         {
     975             :             /* yup, keep only the more restrictive key */
     976          44 :             if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
     977             :                                          &test_result))
     978             :             {
     979          44 :                 if (test_result)
     980          40 :                     xform[j] = cur;
     981           4 :                 else if (j == (BTEqualStrategyNumber - 1))
     982             :                 {
     983             :                     /* key == a && key == b, but a != b */
     984           0 :                     so->qual_ok = false;
     985           0 :                     return;
     986             :                 }
     987             :                 /* else old key is more restrictive, keep it */
     988             :             }
     989             :             else
     990             :             {
     991             :                 /*
     992             :                  * We can't determine which key is more restrictive.  Keep the
     993             :                  * previous one in xform[j] and push this one directly to the
     994             :                  * output array.
     995             :                  */
     996           0 :                 ScanKey     outkey = &outkeys[new_numberOfKeys++];
     997             : 
     998           0 :                 memcpy(outkey, cur, sizeof(ScanKeyData));
     999           0 :                 if (numberOfEqualCols == attno - 1)
    1000           0 :                     _bt_mark_scankey_required(outkey);
    1001             :             }
    1002             :         }
    1003             :     }
    1004             : 
    1005     7091120 :     so->numberOfKeys = new_numberOfKeys;
    1006             : }
    1007             : 
    1008             : /*
    1009             :  * Compare two scankey values using a specified operator.
    1010             :  *
    1011             :  * The test we want to perform is logically "leftarg op rightarg", where
    1012             :  * leftarg and rightarg are the sk_argument values in those ScanKeys, and
    1013             :  * the comparison operator is the one in the op ScanKey.  However, in
    1014             :  * cross-data-type situations we may need to look up the correct operator in
    1015             :  * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
    1016             :  * and amoplefttype/amoprighttype equal to the two argument datatypes.
    1017             :  *
    1018             :  * If the opfamily doesn't supply a complete set of cross-type operators we
    1019             :  * may not be able to make the comparison.  If we can make the comparison
    1020             :  * we store the operator result in *result and return true.  We return false
    1021             :  * if the comparison could not be made.
    1022             :  *
    1023             :  * Note: op always points at the same ScanKey as either leftarg or rightarg.
    1024             :  * Since we don't scribble on the scankeys, this aliasing should cause no
    1025             :  * trouble.
    1026             :  *
    1027             :  * Note: this routine needs to be insensitive to any DESC option applied
    1028             :  * to the index column.  For example, "x < 4" is a tighter constraint than
    1029             :  * "x < 5" regardless of which way the index is sorted.
    1030             :  */
    1031             : static bool
    1032          96 : _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
    1033             :                          ScanKey leftarg, ScanKey rightarg,
    1034             :                          bool *result)
    1035             : {
    1036          96 :     Relation    rel = scan->indexRelation;
    1037             :     Oid         lefttype,
    1038             :                 righttype,
    1039             :                 optype,
    1040             :                 opcintype,
    1041             :                 cmp_op;
    1042             :     StrategyNumber strat;
    1043             : 
    1044             :     /*
    1045             :      * First, deal with cases where one or both args are NULL.  This should
    1046             :      * only happen when the scankeys represent IS NULL/NOT NULL conditions.
    1047             :      */
    1048          96 :     if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
    1049             :     {
    1050             :         bool        leftnull,
    1051             :                     rightnull;
    1052             : 
    1053          40 :         if (leftarg->sk_flags & SK_ISNULL)
    1054             :         {
    1055             :             Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
    1056           4 :             leftnull = true;
    1057             :         }
    1058             :         else
    1059          36 :             leftnull = false;
    1060          40 :         if (rightarg->sk_flags & SK_ISNULL)
    1061             :         {
    1062             :             Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
    1063          36 :             rightnull = true;
    1064             :         }
    1065             :         else
    1066           4 :             rightnull = false;
    1067             : 
    1068             :         /*
    1069             :          * We treat NULL as either greater than or less than all other values.
    1070             :          * Since true > false, the tests below work correctly for NULLS LAST
    1071             :          * logic.  If the index is NULLS FIRST, we need to flip the strategy.
    1072             :          */
    1073          40 :         strat = op->sk_strategy;
    1074          40 :         if (op->sk_flags & SK_BT_NULLS_FIRST)
    1075           0 :             strat = BTCommuteStrategyNumber(strat);
    1076             : 
    1077          40 :         switch (strat)
    1078             :         {
    1079          40 :             case BTLessStrategyNumber:
    1080          40 :                 *result = (leftnull < rightnull);
    1081          40 :                 break;
    1082           0 :             case BTLessEqualStrategyNumber:
    1083           0 :                 *result = (leftnull <= rightnull);
    1084           0 :                 break;
    1085           0 :             case BTEqualStrategyNumber:
    1086           0 :                 *result = (leftnull == rightnull);
    1087           0 :                 break;
    1088           0 :             case BTGreaterEqualStrategyNumber:
    1089           0 :                 *result = (leftnull >= rightnull);
    1090           0 :                 break;
    1091           0 :             case BTGreaterStrategyNumber:
    1092           0 :                 *result = (leftnull > rightnull);
    1093           0 :                 break;
    1094           0 :             default:
    1095           0 :                 elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat);
    1096             :                 *result = false;    /* keep compiler quiet */
    1097             :                 break;
    1098             :         }
    1099          40 :         return true;
    1100             :     }
    1101             : 
    1102             :     /*
    1103             :      * The opfamily we need to worry about is identified by the index column.
    1104             :      */
    1105             :     Assert(leftarg->sk_attno == rightarg->sk_attno);
    1106             : 
    1107          56 :     opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
    1108             : 
    1109             :     /*
    1110             :      * Determine the actual datatypes of the ScanKey arguments.  We have to
    1111             :      * support the convention that sk_subtype == InvalidOid means the opclass
    1112             :      * input type; this is a hack to simplify life for ScanKeyInit().
    1113             :      */
    1114          56 :     lefttype = leftarg->sk_subtype;
    1115          56 :     if (lefttype == InvalidOid)
    1116           0 :         lefttype = opcintype;
    1117          56 :     righttype = rightarg->sk_subtype;
    1118          56 :     if (righttype == InvalidOid)
    1119           0 :         righttype = opcintype;
    1120          56 :     optype = op->sk_subtype;
    1121          56 :     if (optype == InvalidOid)
    1122           0 :         optype = opcintype;
    1123             : 
    1124             :     /*
    1125             :      * If leftarg and rightarg match the types expected for the "op" scankey,
    1126             :      * we can use its already-looked-up comparison function.
    1127             :      */
    1128          56 :     if (lefttype == opcintype && righttype == optype)
    1129             :     {
    1130          56 :         *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
    1131             :                                                  op->sk_collation,
    1132             :                                                  leftarg->sk_argument,
    1133             :                                                  rightarg->sk_argument));
    1134          56 :         return true;
    1135             :     }
    1136             : 
    1137             :     /*
    1138             :      * Otherwise, we need to go to the syscache to find the appropriate
    1139             :      * operator.  (This cannot result in infinite recursion, since no
    1140             :      * indexscan initiated by syscache lookup will use cross-data-type
    1141             :      * operators.)
    1142             :      *
    1143             :      * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to
    1144             :      * un-flip it to get the correct opfamily member.
    1145             :      */
    1146           0 :     strat = op->sk_strategy;
    1147           0 :     if (op->sk_flags & SK_BT_DESC)
    1148           0 :         strat = BTCommuteStrategyNumber(strat);
    1149             : 
    1150           0 :     cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
    1151             :                                  lefttype,
    1152             :                                  righttype,
    1153             :                                  strat);
    1154           0 :     if (OidIsValid(cmp_op))
    1155             :     {
    1156           0 :         RegProcedure cmp_proc = get_opcode(cmp_op);
    1157             : 
    1158           0 :         if (RegProcedureIsValid(cmp_proc))
    1159             :         {
    1160           0 :             *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
    1161             :                                                         op->sk_collation,
    1162             :                                                         leftarg->sk_argument,
    1163             :                                                         rightarg->sk_argument));
    1164           0 :             return true;
    1165             :         }
    1166             :     }
    1167             : 
    1168             :     /* Can't make the comparison */
    1169           0 :     *result = false;            /* suppress compiler warnings */
    1170           0 :     return false;
    1171             : }
    1172             : 
    1173             : /*
    1174             :  * Adjust a scankey's strategy and flags setting as needed for indoptions.
    1175             :  *
    1176             :  * We copy the appropriate indoption value into the scankey sk_flags
    1177             :  * (shifting to avoid clobbering system-defined flag bits).  Also, if
    1178             :  * the DESC option is set, commute (flip) the operator strategy number.
    1179             :  *
    1180             :  * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up
    1181             :  * the strategy field correctly for them.
    1182             :  *
    1183             :  * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
    1184             :  * NULL comparison value.  Since all btree operators are assumed strict,
    1185             :  * a NULL means that the qual cannot be satisfied.  We return true if the
    1186             :  * comparison value isn't NULL, or false if the scan should be abandoned.
    1187             :  *
    1188             :  * This function is applied to the *input* scankey structure; therefore
    1189             :  * on a rescan we will be looking at already-processed scankeys.  Hence
    1190             :  * we have to be careful not to re-commute the strategy if we already did it.
    1191             :  * It's a bit ugly to modify the caller's copy of the scankey but in practice
    1192             :  * there shouldn't be any problem, since the index's indoptions are certainly
    1193             :  * not going to change while the scankey survives.
    1194             :  */
    1195             : static bool
    1196    19217790 : _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
    1197             : {
    1198             :     int         addflags;
    1199             : 
    1200    19217790 :     addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
    1201             : 
    1202             :     /*
    1203             :      * We treat all btree operators as strict (even if they're not so marked
    1204             :      * in pg_proc). This means that it is impossible for an operator condition
    1205             :      * with a NULL comparison constant to succeed, and we can reject it right
    1206             :      * away.
    1207             :      *
    1208             :      * However, we now also support "x IS NULL" clauses as search conditions,
    1209             :      * so in that case keep going. The planner has not filled in any
    1210             :      * particular strategy in this case, so set it to BTEqualStrategyNumber
    1211             :      * --- we can treat IS NULL as an equality operator for purposes of search
    1212             :      * strategy.
    1213             :      *
    1214             :      * Likewise, "x IS NOT NULL" is supported.  We treat that as either "less
    1215             :      * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
    1216             :      * FIRST index.
    1217             :      *
    1218             :      * Note: someday we might have to fill in sk_collation from the index
    1219             :      * column's collation.  At the moment this is a non-issue because we'll
    1220             :      * never actually call the comparison operator on a NULL.
    1221             :      */
    1222    19217790 :     if (skey->sk_flags & SK_ISNULL)
    1223             :     {
    1224             :         /* SK_ISNULL shouldn't be set in a row header scankey */
    1225             :         Assert(!(skey->sk_flags & SK_ROW_HEADER));
    1226             : 
    1227             :         /* Set indoption flags in scankey (might be done already) */
    1228       70644 :         skey->sk_flags |= addflags;
    1229             : 
    1230             :         /* Set correct strategy for IS NULL or NOT NULL search */
    1231       70644 :         if (skey->sk_flags & SK_SEARCHNULL)
    1232             :         {
    1233          84 :             skey->sk_strategy = BTEqualStrategyNumber;
    1234          84 :             skey->sk_subtype = InvalidOid;
    1235          84 :             skey->sk_collation = InvalidOid;
    1236             :         }
    1237       70560 :         else if (skey->sk_flags & SK_SEARCHNOTNULL)
    1238             :         {
    1239       69670 :             if (skey->sk_flags & SK_BT_NULLS_FIRST)
    1240          24 :                 skey->sk_strategy = BTGreaterStrategyNumber;
    1241             :             else
    1242       69646 :                 skey->sk_strategy = BTLessStrategyNumber;
    1243       69670 :             skey->sk_subtype = InvalidOid;
    1244       69670 :             skey->sk_collation = InvalidOid;
    1245             :         }
    1246             :         else
    1247             :         {
    1248             :             /* regular qual, so it cannot be satisfied */
    1249         890 :             return false;
    1250             :         }
    1251             : 
    1252             :         /* Needn't do the rest */
    1253       69754 :         return true;
    1254             :     }
    1255             : 
    1256             :     /* Adjust strategy for DESC, if we didn't already */
    1257    19147146 :     if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
    1258           0 :         skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
    1259    19147146 :     skey->sk_flags |= addflags;
    1260             : 
    1261             :     /* If it's a row header, fix row member flags and strategies similarly */
    1262    19147146 :     if (skey->sk_flags & SK_ROW_HEADER)
    1263             :     {
    1264          24 :         ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
    1265             : 
    1266             :         for (;;)
    1267             :         {
    1268          24 :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1269          48 :             addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
    1270          48 :             if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
    1271           0 :                 subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
    1272          48 :             subkey->sk_flags |= addflags;
    1273          48 :             if (subkey->sk_flags & SK_ROW_END)
    1274          24 :                 break;
    1275          24 :             subkey++;
    1276             :         }
    1277             :     }
    1278             : 
    1279    19147146 :     return true;
    1280             : }
    1281             : 
    1282             : /*
    1283             :  * Mark a scankey as "required to continue the scan".
    1284             :  *
    1285             :  * Depending on the operator type, the key may be required for both scan
    1286             :  * directions or just one.  Also, if the key is a row comparison header,
    1287             :  * we have to mark its first subsidiary ScanKey as required.  (Subsequent
    1288             :  * subsidiary ScanKeys are normally for lower-order columns, and thus
    1289             :  * cannot be required, since they're after the first non-equality scankey.)
    1290             :  *
    1291             :  * Note: when we set required-key flag bits in a subsidiary scankey, we are
    1292             :  * scribbling on a data structure belonging to the index AM's caller, not on
    1293             :  * our private copy.  This should be OK because the marking will not change
    1294             :  * from scan to scan within a query, and so we'd just re-mark the same way
    1295             :  * anyway on a rescan.  Something to keep an eye on though.
    1296             :  */
    1297             : static void
    1298    19216598 : _bt_mark_scankey_required(ScanKey skey)
    1299             : {
    1300             :     int         addflags;
    1301             : 
    1302    19216598 :     switch (skey->sk_strategy)
    1303             :     {
    1304       70950 :         case BTLessStrategyNumber:
    1305             :         case BTLessEqualStrategyNumber:
    1306       70950 :             addflags = SK_BT_REQFWD;
    1307       70950 :             break;
    1308    18408550 :         case BTEqualStrategyNumber:
    1309    18408550 :             addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
    1310    18408550 :             break;
    1311      737098 :         case BTGreaterEqualStrategyNumber:
    1312             :         case BTGreaterStrategyNumber:
    1313      737098 :             addflags = SK_BT_REQBKWD;
    1314      737098 :             break;
    1315           0 :         default:
    1316           0 :             elog(ERROR, "unrecognized StrategyNumber: %d",
    1317             :                  (int) skey->sk_strategy);
    1318             :             addflags = 0;       /* keep compiler quiet */
    1319             :             break;
    1320             :     }
    1321             : 
    1322    19216598 :     skey->sk_flags |= addflags;
    1323             : 
    1324    19216598 :     if (skey->sk_flags & SK_ROW_HEADER)
    1325             :     {
    1326          24 :         ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
    1327             : 
    1328             :         /* First subkey should be same column/operator as the header */
    1329             :         Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1330             :         Assert(subkey->sk_attno == skey->sk_attno);
    1331             :         Assert(subkey->sk_strategy == skey->sk_strategy);
    1332          24 :         subkey->sk_flags |= addflags;
    1333             :     }
    1334    19216598 : }
    1335             : 
    1336             : /*
    1337             :  * Test whether an indextuple satisfies all the scankey conditions.
    1338             :  *
    1339             :  * Return true if so, false if not.  If the tuple fails to pass the qual,
    1340             :  * we also determine whether there's any need to continue the scan beyond
    1341             :  * this tuple, and set *continuescan accordingly.  See comments for
    1342             :  * _bt_preprocess_keys(), above, about how this is done.
    1343             :  *
    1344             :  * Forward scan callers can pass a high key tuple in the hopes of having
    1345             :  * us set *continuescan to false, and avoiding an unnecessary visit to
    1346             :  * the page to the right.
    1347             :  *
    1348             :  * scan: index scan descriptor (containing a search-type scankey)
    1349             :  * tuple: index tuple to test
    1350             :  * tupnatts: number of attributes in tupnatts (high key may be truncated)
    1351             :  * dir: direction we are scanning in
    1352             :  * continuescan: output parameter (will be set correctly in all cases)
    1353             :  */
    1354             : bool
    1355    85008490 : _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
    1356             :               ScanDirection dir, bool *continuescan)
    1357             : {
    1358             :     TupleDesc   tupdesc;
    1359             :     BTScanOpaque so;
    1360             :     int         keysz;
    1361             :     int         ikey;
    1362             :     ScanKey     key;
    1363             : 
    1364             :     Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
    1365             : 
    1366    85008490 :     *continuescan = true;       /* default assumption */
    1367             : 
    1368    85008490 :     tupdesc = RelationGetDescr(scan->indexRelation);
    1369    85008490 :     so = (BTScanOpaque) scan->opaque;
    1370    85008490 :     keysz = so->numberOfKeys;
    1371             : 
    1372   218756012 :     for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
    1373             :     {
    1374             :         Datum       datum;
    1375             :         bool        isNull;
    1376             :         Datum       test;
    1377             : 
    1378   142399182 :         if (key->sk_attno > tupnatts)
    1379             :         {
    1380             :             /*
    1381             :              * This attribute is truncated (must be high key).  The value for
    1382             :              * this attribute in the first non-pivot tuple on the page to the
    1383             :              * right could be any possible value.  Assume that truncated
    1384             :              * attribute passes the qual.
    1385             :              */
    1386             :             Assert(ScanDirectionIsForward(dir));
    1387             :             Assert(BTreeTupleIsPivot(tuple));
    1388    12613586 :             continue;
    1389             :         }
    1390             : 
    1391             :         /* row-comparison keys need special processing */
    1392   142398180 :         if (key->sk_flags & SK_ROW_HEADER)
    1393             :         {
    1394        1320 :             if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
    1395             :                                      continuescan))
    1396        1284 :                 continue;
    1397     8651660 :             return false;
    1398             :         }
    1399             : 
    1400   142396860 :         datum = index_getattr(tuple,
    1401             :                               key->sk_attno,
    1402             :                               tupdesc,
    1403             :                               &isNull);
    1404             : 
    1405   142396860 :         if (key->sk_flags & SK_ISNULL)
    1406             :         {
    1407             :             /* Handle IS NULL/NOT NULL tests */
    1408    12643348 :             if (key->sk_flags & SK_SEARCHNULL)
    1409             :             {
    1410       32092 :                 if (isNull)
    1411          84 :                     continue;   /* tuple satisfies this qual */
    1412             :             }
    1413             :             else
    1414             :             {
    1415             :                 Assert(key->sk_flags & SK_SEARCHNOTNULL);
    1416    12611256 :                 if (!isNull)
    1417    12611216 :                     continue;   /* tuple satisfies this qual */
    1418             :             }
    1419             : 
    1420             :             /*
    1421             :              * Tuple fails this qual.  If it's a required qual for the current
    1422             :              * scan direction, then we can conclude no further tuples will
    1423             :              * pass, either.
    1424             :              */
    1425       32048 :             if ((key->sk_flags & SK_BT_REQFWD) &&
    1426             :                 ScanDirectionIsForward(dir))
    1427          16 :                 *continuescan = false;
    1428       32032 :             else if ((key->sk_flags & SK_BT_REQBKWD) &&
    1429             :                      ScanDirectionIsBackward(dir))
    1430           0 :                 *continuescan = false;
    1431             : 
    1432             :             /*
    1433             :              * In any case, this indextuple doesn't match the qual.
    1434             :              */
    1435       32048 :             return false;
    1436             :         }
    1437             : 
    1438   129753512 :         if (isNull)
    1439             :         {
    1440          88 :             if (key->sk_flags & SK_BT_NULLS_FIRST)
    1441             :             {
    1442             :                 /*
    1443             :                  * Since NULLs are sorted before non-NULLs, we know we have
    1444             :                  * reached the lower limit of the range of values for this
    1445             :                  * index attr.  On a backward scan, we can stop if this qual
    1446             :                  * is one of the "must match" subset.  We can stop regardless
    1447             :                  * of whether the qual is > or <, so long as it's required,
    1448             :                  * because it's not possible for any future tuples to pass. On
    1449             :                  * a forward scan, however, we must keep going, because we may
    1450             :                  * have initially positioned to the start of the index.
    1451             :                  */
    1452           0 :                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
    1453             :                     ScanDirectionIsBackward(dir))
    1454           0 :                     *continuescan = false;
    1455             :             }
    1456             :             else
    1457             :             {
    1458             :                 /*
    1459             :                  * Since NULLs are sorted after non-NULLs, we know we have
    1460             :                  * reached the upper limit of the range of values for this
    1461             :                  * index attr.  On a forward scan, we can stop if this qual is
    1462             :                  * one of the "must match" subset.  We can stop regardless of
    1463             :                  * whether the qual is > or <, so long as it's required,
    1464             :                  * because it's not possible for any future tuples to pass. On
    1465             :                  * a backward scan, however, we must keep going, because we
    1466             :                  * may have initially positioned to the end of the index.
    1467             :                  */
    1468          88 :                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
    1469             :                     ScanDirectionIsForward(dir))
    1470          56 :                     *continuescan = false;
    1471             :             }
    1472             : 
    1473             :             /*
    1474             :              * In any case, this indextuple doesn't match the qual.
    1475             :              */
    1476          88 :             return false;
    1477             :         }
    1478             : 
    1479   129753424 :         test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
    1480             :                                  datum, key->sk_argument);
    1481             : 
    1482   129753424 :         if (!DatumGetBool(test))
    1483             :         {
    1484             :             /*
    1485             :              * Tuple fails this qual.  If it's a required qual for the current
    1486             :              * scan direction, then we can conclude no further tuples will
    1487             :              * pass, either.
    1488             :              *
    1489             :              * Note: because we stop the scan as soon as any required equality
    1490             :              * qual fails, it is critical that equality quals be used for the
    1491             :              * initial positioning in _bt_first() when they are available. See
    1492             :              * comments in _bt_first().
    1493             :              */
    1494     8619488 :             if ((key->sk_flags & SK_BT_REQFWD) &&
    1495             :                 ScanDirectionIsForward(dir))
    1496     8464214 :                 *continuescan = false;
    1497      155274 :             else if ((key->sk_flags & SK_BT_REQBKWD) &&
    1498             :                      ScanDirectionIsBackward(dir))
    1499          52 :                 *continuescan = false;
    1500             : 
    1501             :             /*
    1502             :              * In any case, this indextuple doesn't match the qual.
    1503             :              */
    1504     8619488 :             return false;
    1505             :         }
    1506             :     }
    1507             : 
    1508             :     /* If we get here, the tuple passes all index quals. */
    1509    76356830 :     return true;
    1510             : }
    1511             : 
    1512             : /*
    1513             :  * Test whether an indextuple satisfies a row-comparison scan condition.
    1514             :  *
    1515             :  * Return true if so, false if not.  If not, also clear *continuescan if
    1516             :  * it's not possible for any future tuples in the current scan direction
    1517             :  * to pass the qual.
    1518             :  *
    1519             :  * This is a subroutine for _bt_checkkeys, which see for more info.
    1520             :  */
    1521             : static bool
    1522        1320 : _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
    1523             :                      TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
    1524             : {
    1525        1320 :     ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
    1526        1320 :     int32       cmpresult = 0;
    1527             :     bool        result;
    1528             : 
    1529             :     /* First subkey should be same as the header says */
    1530             :     Assert(subkey->sk_attno == skey->sk_attno);
    1531             : 
    1532             :     /* Loop over columns of the row condition */
    1533             :     for (;;)
    1534         104 :     {
    1535             :         Datum       datum;
    1536             :         bool        isNull;
    1537             : 
    1538             :         Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1539             : 
    1540        1424 :         if (subkey->sk_attno > tupnatts)
    1541             :         {
    1542             :             /*
    1543             :              * This attribute is truncated (must be high key).  The value for
    1544             :              * this attribute in the first non-pivot tuple on the page to the
    1545             :              * right could be any possible value.  Assume that truncated
    1546             :              * attribute passes the qual.
    1547             :              */
    1548             :             Assert(ScanDirectionIsForward(dir));
    1549             :             Assert(BTreeTupleIsPivot(tuple));
    1550           4 :             cmpresult = 0;
    1551           4 :             if (subkey->sk_flags & SK_ROW_END)
    1552           4 :                 break;
    1553           0 :             subkey++;
    1554           0 :             continue;
    1555             :         }
    1556             : 
    1557        1420 :         datum = index_getattr(tuple,
    1558             :                               subkey->sk_attno,
    1559             :                               tupdesc,
    1560             :                               &isNull);
    1561             : 
    1562        1420 :         if (isNull)
    1563             :         {
    1564          32 :             if (subkey->sk_flags & SK_BT_NULLS_FIRST)
    1565             :             {
    1566             :                 /*
    1567             :                  * Since NULLs are sorted before non-NULLs, we know we have
    1568             :                  * reached the lower limit of the range of values for this
    1569             :                  * index attr.  On a backward scan, we can stop if this qual
    1570             :                  * is one of the "must match" subset.  We can stop regardless
    1571             :                  * of whether the qual is > or <, so long as it's required,
    1572             :                  * because it's not possible for any future tuples to pass. On
    1573             :                  * a forward scan, however, we must keep going, because we may
    1574             :                  * have initially positioned to the start of the index.
    1575             :                  */
    1576           0 :                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
    1577             :                     ScanDirectionIsBackward(dir))
    1578           0 :                     *continuescan = false;
    1579             :             }
    1580             :             else
    1581             :             {
    1582             :                 /*
    1583             :                  * Since NULLs are sorted after non-NULLs, we know we have
    1584             :                  * reached the upper limit of the range of values for this
    1585             :                  * index attr.  On a forward scan, we can stop if this qual is
    1586             :                  * one of the "must match" subset.  We can stop regardless of
    1587             :                  * whether the qual is > or <, so long as it's required,
    1588             :                  * because it's not possible for any future tuples to pass. On
    1589             :                  * a backward scan, however, we must keep going, because we
    1590             :                  * may have initially positioned to the end of the index.
    1591             :                  */
    1592          32 :                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
    1593             :                     ScanDirectionIsForward(dir))
    1594           0 :                     *continuescan = false;
    1595             :             }
    1596             : 
    1597             :             /*
    1598             :              * In any case, this indextuple doesn't match the qual.
    1599             :              */
    1600          32 :             return false;
    1601             :         }
    1602             : 
    1603        1388 :         if (subkey->sk_flags & SK_ISNULL)
    1604             :         {
    1605             :             /*
    1606             :              * Unlike the simple-scankey case, this isn't a disallowed case.
    1607             :              * But it can never match.  If all the earlier row comparison
    1608             :              * columns are required for the scan direction, we can stop the
    1609             :              * scan, because there can't be another tuple that will succeed.
    1610             :              */
    1611           0 :             if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
    1612           0 :                 subkey--;
    1613           0 :             if ((subkey->sk_flags & SK_BT_REQFWD) &&
    1614             :                 ScanDirectionIsForward(dir))
    1615           0 :                 *continuescan = false;
    1616           0 :             else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
    1617             :                      ScanDirectionIsBackward(dir))
    1618           0 :                 *continuescan = false;
    1619           0 :             return false;
    1620             :         }
    1621             : 
    1622             :         /* Perform the test --- three-way comparison not bool operator */
    1623        1388 :         cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
    1624             :                                                     subkey->sk_collation,
    1625             :                                                     datum,
    1626             :                                                     subkey->sk_argument));
    1627             : 
    1628        1388 :         if (subkey->sk_flags & SK_BT_DESC)
    1629           0 :             INVERT_COMPARE_RESULT(cmpresult);
    1630             : 
    1631             :         /* Done comparing if unequal, else advance to next column */
    1632        1388 :         if (cmpresult != 0)
    1633        1284 :             break;
    1634             : 
    1635         104 :         if (subkey->sk_flags & SK_ROW_END)
    1636           0 :             break;
    1637         104 :         subkey++;
    1638             :     }
    1639             : 
    1640             :     /*
    1641             :      * At this point cmpresult indicates the overall result of the row
    1642             :      * comparison, and subkey points to the deciding column (or the last
    1643             :      * column if the result is "=").
    1644             :      */
    1645        1288 :     switch (subkey->sk_strategy)
    1646             :     {
    1647             :             /* EQ and NE cases aren't allowed here */
    1648           0 :         case BTLessStrategyNumber:
    1649           0 :             result = (cmpresult < 0);
    1650           0 :             break;
    1651        1060 :         case BTLessEqualStrategyNumber:
    1652        1060 :             result = (cmpresult <= 0);
    1653        1060 :             break;
    1654         160 :         case BTGreaterEqualStrategyNumber:
    1655         160 :             result = (cmpresult >= 0);
    1656         160 :             break;
    1657          68 :         case BTGreaterStrategyNumber:
    1658          68 :             result = (cmpresult > 0);
    1659          68 :             break;
    1660           0 :         default:
    1661           0 :             elog(ERROR, "unrecognized RowCompareType: %d",
    1662             :                  (int) subkey->sk_strategy);
    1663             :             result = 0;         /* keep compiler quiet */
    1664             :             break;
    1665             :     }
    1666             : 
    1667        1288 :     if (!result)
    1668             :     {
    1669             :         /*
    1670             :          * Tuple fails this qual.  If it's a required qual for the current
    1671             :          * scan direction, then we can conclude no further tuples will pass,
    1672             :          * either.  Note we have to look at the deciding column, not
    1673             :          * necessarily the first or last column of the row condition.
    1674             :          */
    1675           4 :         if ((subkey->sk_flags & SK_BT_REQFWD) &&
    1676             :             ScanDirectionIsForward(dir))
    1677           4 :             *continuescan = false;
    1678           0 :         else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
    1679             :                  ScanDirectionIsBackward(dir))
    1680           0 :             *continuescan = false;
    1681             :     }
    1682             : 
    1683        1288 :     return result;
    1684             : }
    1685             : 
    1686             : /*
    1687             :  * _bt_killitems - set LP_DEAD state for items an indexscan caller has
    1688             :  * told us were killed
    1689             :  *
    1690             :  * scan->opaque, referenced locally through so, contains information about the
    1691             :  * current page and killed tuples thereon (generally, this should only be
    1692             :  * called if so->numKilled > 0).
    1693             :  *
    1694             :  * The caller does not have a lock on the page and may or may not have the
    1695             :  * page pinned in a buffer.  Note that read-lock is sufficient for setting
    1696             :  * LP_DEAD status (which is only a hint).
    1697             :  *
    1698             :  * We match items by heap TID before assuming they are the right ones to
    1699             :  * delete.  We cope with cases where items have moved right due to insertions.
    1700             :  * If an item has moved off the current page due to a split, we'll fail to
    1701             :  * find it and do nothing (this is not an error case --- we assume the item
    1702             :  * will eventually get marked in a future indexscan).
    1703             :  *
    1704             :  * Note that if we hold a pin on the target page continuously from initially
    1705             :  * reading the items until applying this function, VACUUM cannot have deleted
    1706             :  * any items from the page, and so there is no need to search left from the
    1707             :  * recorded offset.  (This observation also guarantees that the item is still
    1708             :  * the right one to delete, which might otherwise be questionable since heap
    1709             :  * TIDs can get recycled.)  This holds true even if the page has been modified
    1710             :  * by inserts and page splits, so there is no need to consult the LSN.
    1711             :  *
    1712             :  * If the pin was released after reading the page, then we re-read it.  If it
    1713             :  * has been modified since we read it (as determined by the LSN), we dare not
    1714             :  * flag any entries because it is possible that the old entry was vacuumed
    1715             :  * away and the TID was re-used by a completely different heap tuple.
    1716             :  */
    1717             : void
    1718       99144 : _bt_killitems(IndexScanDesc scan)
    1719             : {
    1720       99144 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1721             :     Page        page;
    1722             :     BTPageOpaque opaque;
    1723             :     OffsetNumber minoff;
    1724             :     OffsetNumber maxoff;
    1725             :     int         i;
    1726       99144 :     int         numKilled = so->numKilled;
    1727       99144 :     bool        killedsomething = false;
    1728             :     bool        droppedpin PG_USED_FOR_ASSERTS_ONLY;
    1729             : 
    1730             :     Assert(BTScanPosIsValid(so->currPos));
    1731             : 
    1732             :     /*
    1733             :      * Always reset the scan state, so we don't look for same items on other
    1734             :      * pages.
    1735             :      */
    1736       99144 :     so->numKilled = 0;
    1737             : 
    1738       99144 :     if (BTScanPosIsPinned(so->currPos))
    1739             :     {
    1740             :         /*
    1741             :          * We have held the pin on this page since we read the index tuples,
    1742             :          * so all we need to do is lock it.  The pin will have prevented
    1743             :          * re-use of any TID on the page, so there is no need to check the
    1744             :          * LSN.
    1745             :          */
    1746        4976 :         droppedpin = false;
    1747        4976 :         LockBuffer(so->currPos.buf, BT_READ);
    1748             : 
    1749        4976 :         page = BufferGetPage(so->currPos.buf);
    1750             :     }
    1751             :     else
    1752             :     {
    1753             :         Buffer      buf;
    1754             : 
    1755       94168 :         droppedpin = true;
    1756             :         /* Attempt to re-read the buffer, getting pin and lock. */
    1757       94168 :         buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
    1758             : 
    1759       94168 :         page = BufferGetPage(buf);
    1760       94168 :         if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
    1761       94128 :             so->currPos.buf = buf;
    1762             :         else
    1763             :         {
    1764             :             /* Modified while not pinned means hinting is not safe. */
    1765          40 :             _bt_relbuf(scan->indexRelation, buf);
    1766          40 :             return;
    1767             :         }
    1768             :     }
    1769             : 
    1770       99104 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1771       99104 :     minoff = P_FIRSTDATAKEY(opaque);
    1772       99104 :     maxoff = PageGetMaxOffsetNumber(page);
    1773             : 
    1774      228960 :     for (i = 0; i < numKilled; i++)
    1775             :     {
    1776      129856 :         int         itemIndex = so->killedItems[i];
    1777      129856 :         BTScanPosItem *kitem = &so->currPos.items[itemIndex];
    1778      129856 :         OffsetNumber offnum = kitem->indexOffset;
    1779             : 
    1780             :         Assert(itemIndex >= so->currPos.firstItem &&
    1781             :                itemIndex <= so->currPos.lastItem);
    1782      129856 :         if (offnum < minoff)
    1783           0 :             continue;           /* pure paranoia */
    1784      591436 :         while (offnum <= maxoff)
    1785             :         {
    1786      582586 :             ItemId      iid = PageGetItemId(page, offnum);
    1787      582586 :             IndexTuple  ituple = (IndexTuple) PageGetItem(page, iid);
    1788      582586 :             bool        killtuple = false;
    1789             : 
    1790      582586 :             if (BTreeTupleIsPosting(ituple))
    1791             :             {
    1792      274094 :                 int         pi = i + 1;
    1793      274094 :                 int         nposting = BTreeTupleGetNPosting(ituple);
    1794             :                 int         j;
    1795             : 
    1796             :                 /*
    1797             :                  * We rely on the convention that heap TIDs in the scanpos
    1798             :                  * items array are stored in ascending heap TID order for a
    1799             :                  * group of TIDs that originally came from a posting list
    1800             :                  * tuple.  This convention even applies during backwards
    1801             :                  * scans, where returning the TIDs in descending order might
    1802             :                  * seem more natural.  This is about effectiveness, not
    1803             :                  * correctness.
    1804             :                  *
    1805             :                  * Note that the page may have been modified in almost any way
    1806             :                  * since we first read it (in the !droppedpin case), so it's
    1807             :                  * possible that this posting list tuple wasn't a posting list
    1808             :                  * tuple when we first encountered its heap TIDs.
    1809             :                  */
    1810      282986 :                 for (j = 0; j < nposting; j++)
    1811             :                 {
    1812      282798 :                     ItemPointer item = BTreeTupleGetPostingN(ituple, j);
    1813             : 
    1814      282798 :                     if (!ItemPointerEquals(item, &kitem->heapTid))
    1815      273906 :                         break;  /* out of posting list loop */
    1816             : 
    1817             :                     /*
    1818             :                      * kitem must have matching offnum when heap TIDs match,
    1819             :                      * though only in the common case where the page can't
    1820             :                      * have been concurrently modified
    1821             :                      */
    1822             :                     Assert(kitem->indexOffset == offnum || !droppedpin);
    1823             : 
    1824             :                     /*
    1825             :                      * Read-ahead to later kitems here.
    1826             :                      *
    1827             :                      * We rely on the assumption that not advancing kitem here
    1828             :                      * will prevent us from considering the posting list tuple
    1829             :                      * fully dead by not matching its next heap TID in next
    1830             :                      * loop iteration.
    1831             :                      *
    1832             :                      * If, on the other hand, this is the final heap TID in
    1833             :                      * the posting list tuple, then tuple gets killed
    1834             :                      * regardless (i.e. we handle the case where the last
    1835             :                      * kitem is also the last heap TID in the last index tuple
    1836             :                      * correctly -- posting tuple still gets killed).
    1837             :                      */
    1838        8892 :                     if (pi < numKilled)
    1839        5936 :                         kitem = &so->currPos.items[so->killedItems[pi++]];
    1840             :                 }
    1841             : 
    1842             :                 /*
    1843             :                  * Don't bother advancing the outermost loop's int iterator to
    1844             :                  * avoid processing killed items that relate to the same
    1845             :                  * offnum/posting list tuple.  This micro-optimization hardly
    1846             :                  * seems worth it.  (Further iterations of the outermost loop
    1847             :                  * will fail to match on this same posting list's first heap
    1848             :                  * TID instead, so we'll advance to the next offnum/index
    1849             :                  * tuple pretty quickly.)
    1850             :                  */
    1851      274094 :                 if (j == nposting)
    1852         188 :                     killtuple = true;
    1853             :             }
    1854      308492 :             else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
    1855      121120 :                 killtuple = true;
    1856             : 
    1857             :             /*
    1858             :              * Mark index item as dead, if it isn't already.  Since this
    1859             :              * happens while holding a buffer lock possibly in shared mode,
    1860             :              * it's possible that multiple processes attempt to do this
    1861             :              * simultaneously, leading to multiple full-page images being sent
    1862             :              * to WAL (if wal_log_hints or data checksums are enabled), which
    1863             :              * is undesirable.
    1864             :              */
    1865      582586 :             if (killtuple && !ItemIdIsDead(iid))
    1866             :             {
    1867             :                 /* found the item/all posting list items */
    1868      121006 :                 ItemIdMarkDead(iid);
    1869      121006 :                 killedsomething = true;
    1870      121006 :                 break;          /* out of inner search loop */
    1871             :             }
    1872      461580 :             offnum = OffsetNumberNext(offnum);
    1873             :         }
    1874             :     }
    1875             : 
    1876             :     /*
    1877             :      * Since this can be redone later if needed, mark as dirty hint.
    1878             :      *
    1879             :      * Whenever we mark anything LP_DEAD, we also set the page's
    1880             :      * BTP_HAS_GARBAGE flag, which is likewise just a hint.
    1881             :      */
    1882       99104 :     if (killedsomething)
    1883             :     {
    1884       96110 :         opaque->btpo_flags |= BTP_HAS_GARBAGE;
    1885       96110 :         MarkBufferDirtyHint(so->currPos.buf, true);
    1886             :     }
    1887             : 
    1888       99104 :     LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
    1889             : }
    1890             : 
    1891             : 
    1892             : /*
    1893             :  * The following routines manage a shared-memory area in which we track
    1894             :  * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
    1895             :  * operations.  There is a single counter which increments each time we
    1896             :  * start a vacuum to assign it a cycle ID.  Since multiple vacuums could
    1897             :  * be active concurrently, we have to track the cycle ID for each active
    1898             :  * vacuum; this requires at most MaxBackends entries (usually far fewer).
    1899             :  * We assume at most one vacuum can be active for a given index.
    1900             :  *
    1901             :  * Access to the shared memory area is controlled by BtreeVacuumLock.
    1902             :  * In principle we could use a separate lmgr locktag for each index,
    1903             :  * but a single LWLock is much cheaper, and given the short time that
    1904             :  * the lock is ever held, the concurrency hit should be minimal.
    1905             :  */
    1906             : 
    1907             : typedef struct BTOneVacInfo
    1908             : {
    1909             :     LockRelId   relid;          /* global identifier of an index */
    1910             :     BTCycleId   cycleid;        /* cycle ID for its active VACUUM */
    1911             : } BTOneVacInfo;
    1912             : 
    1913             : typedef struct BTVacInfo
    1914             : {
    1915             :     BTCycleId   cycle_ctr;      /* cycle ID most recently assigned */
    1916             :     int         num_vacuums;    /* number of currently active VACUUMs */
    1917             :     int         max_vacuums;    /* allocated length of vacuums[] array */
    1918             :     BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
    1919             : } BTVacInfo;
    1920             : 
    1921             : static BTVacInfo *btvacinfo;
    1922             : 
    1923             : 
    1924             : /*
    1925             :  * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
    1926             :  *      or zero if there is no active VACUUM
    1927             :  *
    1928             :  * Note: for correct interlocking, the caller must already hold pin and
    1929             :  * exclusive lock on each buffer it will store the cycle ID into.  This
    1930             :  * ensures that even if a VACUUM starts immediately afterwards, it cannot
    1931             :  * process those pages until the page split is complete.
    1932             :  */
    1933             : BTCycleId
    1934       46432 : _bt_vacuum_cycleid(Relation rel)
    1935             : {
    1936       46432 :     BTCycleId   result = 0;
    1937             :     int         i;
    1938             : 
    1939             :     /* Share lock is enough since this is a read-only operation */
    1940       46432 :     LWLockAcquire(BtreeVacuumLock, LW_SHARED);
    1941             : 
    1942       46436 :     for (i = 0; i < btvacinfo->num_vacuums; i++)
    1943             :     {
    1944           6 :         BTOneVacInfo *vac = &btvacinfo->vacuums[i];
    1945             : 
    1946           6 :         if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
    1947           2 :             vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
    1948             :         {
    1949           2 :             result = vac->cycleid;
    1950           2 :             break;
    1951             :         }
    1952             :     }
    1953             : 
    1954       46432 :     LWLockRelease(BtreeVacuumLock);
    1955       46432 :     return result;
    1956             : }
    1957             : 
    1958             : /*
    1959             :  * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
    1960             :  *
    1961             :  * Note: the caller must guarantee that it will eventually call
    1962             :  * _bt_end_vacuum, else we'll permanently leak an array slot.  To ensure
    1963             :  * that this happens even in elog(FATAL) scenarios, the appropriate coding
    1964             :  * is not just a PG_TRY, but
    1965             :  *      PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
    1966             :  */
    1967             : BTCycleId
    1968        4008 : _bt_start_vacuum(Relation rel)
    1969             : {
    1970             :     BTCycleId   result;
    1971             :     int         i;
    1972             :     BTOneVacInfo *vac;
    1973             : 
    1974        4008 :     LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
    1975             : 
    1976             :     /*
    1977             :      * Assign the next cycle ID, being careful to avoid zero as well as the
    1978             :      * reserved high values.
    1979             :      */
    1980        4008 :     result = ++(btvacinfo->cycle_ctr);
    1981        4008 :     if (result == 0 || result > MAX_BT_CYCLE_ID)
    1982           0 :         result = btvacinfo->cycle_ctr = 1;
    1983             : 
    1984             :     /* Let's just make sure there's no entry already for this index */
    1985        4008 :     for (i = 0; i < btvacinfo->num_vacuums; i++)
    1986             :     {
    1987           0 :         vac = &btvacinfo->vacuums[i];
    1988           0 :         if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
    1989           0 :             vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
    1990             :         {
    1991             :             /*
    1992             :              * Unlike most places in the backend, we have to explicitly
    1993             :              * release our LWLock before throwing an error.  This is because
    1994             :              * we expect _bt_end_vacuum() to be called before transaction
    1995             :              * abort cleanup can run to release LWLocks.
    1996             :              */
    1997           0 :             LWLockRelease(BtreeVacuumLock);
    1998           0 :             elog(ERROR, "multiple active vacuums for index \"%s\"",
    1999             :                  RelationGetRelationName(rel));
    2000             :         }
    2001             :     }
    2002             : 
    2003             :     /* OK, add an entry */
    2004        4008 :     if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
    2005             :     {
    2006           0 :         LWLockRelease(BtreeVacuumLock);
    2007           0 :         elog(ERROR, "out of btvacinfo slots");
    2008             :     }
    2009        4008 :     vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
    2010        4008 :     vac->relid = rel->rd_lockInfo.lockRelId;
    2011        4008 :     vac->cycleid = result;
    2012        4008 :     btvacinfo->num_vacuums++;
    2013             : 
    2014        4008 :     LWLockRelease(BtreeVacuumLock);
    2015        4008 :     return result;
    2016             : }
    2017             : 
    2018             : /*
    2019             :  * _bt_end_vacuum --- mark a btree VACUUM operation as done
    2020             :  *
    2021             :  * Note: this is deliberately coded not to complain if no entry is found;
    2022             :  * this allows the caller to put PG_TRY around the start_vacuum operation.
    2023             :  */
    2024             : void
    2025        4008 : _bt_end_vacuum(Relation rel)
    2026             : {
    2027             :     int         i;
    2028             : 
    2029        4008 :     LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
    2030             : 
    2031             :     /* Find the array entry */
    2032        4008 :     for (i = 0; i < btvacinfo->num_vacuums; i++)
    2033             :     {
    2034        4008 :         BTOneVacInfo *vac = &btvacinfo->vacuums[i];
    2035             : 
    2036        4008 :         if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
    2037        4008 :             vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
    2038             :         {
    2039             :             /* Remove it by shifting down the last entry */
    2040        4008 :             *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
    2041        4008 :             btvacinfo->num_vacuums--;
    2042        4008 :             break;
    2043             :         }
    2044             :     }
    2045             : 
    2046        4008 :     LWLockRelease(BtreeVacuumLock);
    2047        4008 : }
    2048             : 
    2049             : /*
    2050             :  * _bt_end_vacuum wrapped as an on_shmem_exit callback function
    2051             :  */
    2052             : void
    2053           0 : _bt_end_vacuum_callback(int code, Datum arg)
    2054             : {
    2055           0 :     _bt_end_vacuum((Relation) DatumGetPointer(arg));
    2056           0 : }
    2057             : 
    2058             : /*
    2059             :  * BTreeShmemSize --- report amount of shared memory space needed
    2060             :  */
    2061             : Size
    2062        4344 : BTreeShmemSize(void)
    2063             : {
    2064             :     Size        size;
    2065             : 
    2066        4344 :     size = offsetof(BTVacInfo, vacuums);
    2067        4344 :     size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
    2068        4344 :     return size;
    2069             : }
    2070             : 
    2071             : /*
    2072             :  * BTreeShmemInit --- initialize this module's shared memory
    2073             :  */
    2074             : void
    2075        2170 : BTreeShmemInit(void)
    2076             : {
    2077             :     bool        found;
    2078             : 
    2079        2170 :     btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
    2080             :                                               BTreeShmemSize(),
    2081             :                                               &found);
    2082             : 
    2083        2170 :     if (!IsUnderPostmaster)
    2084             :     {
    2085             :         /* Initialize shared memory area */
    2086             :         Assert(!found);
    2087             : 
    2088             :         /*
    2089             :          * It doesn't really matter what the cycle counter starts at, but
    2090             :          * having it always start the same doesn't seem good.  Seed with
    2091             :          * low-order bits of time() instead.
    2092             :          */
    2093        2170 :         btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
    2094             : 
    2095        2170 :         btvacinfo->num_vacuums = 0;
    2096        2170 :         btvacinfo->max_vacuums = MaxBackends;
    2097             :     }
    2098             :     else
    2099             :         Assert(found);
    2100        2170 : }
    2101             : 
    2102             : bytea *
    2103         258 : btoptions(Datum reloptions, bool validate)
    2104             : {
    2105             :     static const relopt_parse_elt tab[] = {
    2106             :         {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
    2107             :         {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
    2108             :         offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
    2109             :         {"deduplicate_items", RELOPT_TYPE_BOOL,
    2110             :         offsetof(BTOptions, deduplicate_items)}
    2111             : 
    2112             :     };
    2113             : 
    2114         258 :     return (bytea *) build_reloptions(reloptions, validate,
    2115             :                                       RELOPT_KIND_BTREE,
    2116             :                                       sizeof(BTOptions),
    2117             :                                       tab, lengthof(tab));
    2118             : 
    2119             : }
    2120             : 
    2121             : /*
    2122             :  *  btproperty() -- Check boolean properties of indexes.
    2123             :  *
    2124             :  * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
    2125             :  * to call btcanreturn.
    2126             :  */
    2127             : bool
    2128         504 : btproperty(Oid index_oid, int attno,
    2129             :            IndexAMProperty prop, const char *propname,
    2130             :            bool *res, bool *isnull)
    2131             : {
    2132         504 :     switch (prop)
    2133             :     {
    2134          28 :         case AMPROP_RETURNABLE:
    2135             :             /* answer only for columns, not AM or whole index */
    2136          28 :             if (attno == 0)
    2137           8 :                 return false;
    2138             :             /* otherwise, btree can always return data */
    2139          20 :             *res = true;
    2140          20 :             return true;
    2141             : 
    2142         476 :         default:
    2143         476 :             return false;       /* punt to generic code */
    2144             :     }
    2145             : }
    2146             : 
    2147             : /*
    2148             :  *  btbuildphasename() -- Return name of index build phase.
    2149             :  */
    2150             : char *
    2151           0 : btbuildphasename(int64 phasenum)
    2152             : {
    2153           0 :     switch (phasenum)
    2154             :     {
    2155           0 :         case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
    2156           0 :             return "initializing";
    2157           0 :         case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
    2158           0 :             return "scanning table";
    2159           0 :         case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
    2160           0 :             return "sorting live tuples";
    2161           0 :         case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
    2162           0 :             return "sorting dead tuples";
    2163           0 :         case PROGRESS_BTREE_PHASE_LEAF_LOAD:
    2164           0 :             return "loading tuples in tree";
    2165           0 :         default:
    2166           0 :             return NULL;
    2167             :     }
    2168             : }
    2169             : 
    2170             : /*
    2171             :  *  _bt_truncate() -- create tuple without unneeded suffix attributes.
    2172             :  *
    2173             :  * Returns truncated pivot index tuple allocated in caller's memory context,
    2174             :  * with key attributes copied from caller's firstright argument.  If rel is
    2175             :  * an INCLUDE index, non-key attributes will definitely be truncated away,
    2176             :  * since they're not part of the key space.  More aggressive suffix
    2177             :  * truncation can take place when it's clear that the returned tuple does not
    2178             :  * need one or more suffix key attributes.  We only need to keep firstright
    2179             :  * attributes up to and including the first non-lastleft-equal attribute.
    2180             :  * Caller's insertion scankey is used to compare the tuples; the scankey's
    2181             :  * argument values are not considered here.
    2182             :  *
    2183             :  * Note that returned tuple's t_tid offset will hold the number of attributes
    2184             :  * present, so the original item pointer offset is not represented.  Caller
    2185             :  * should only change truncated tuple's downlink.  Note also that truncated
    2186             :  * key attributes are treated as containing "minus infinity" values by
    2187             :  * _bt_compare().
    2188             :  *
    2189             :  * In the worst case (when a heap TID must be appended to distinguish lastleft
    2190             :  * from firstright), the size of the returned tuple is the size of firstright
    2191             :  * plus the size of an additional MAXALIGN()'d item pointer.  This guarantee
    2192             :  * is important, since callers need to stay under the 1/3 of a page
    2193             :  * restriction on tuple size.  If this routine is ever taught to truncate
    2194             :  * within an attribute/datum, it will need to avoid returning an enlarged
    2195             :  * tuple to caller when truncation + TOAST compression ends up enlarging the
    2196             :  * final datum.
    2197             :  */
    2198             : IndexTuple
    2199       97992 : _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
    2200             :              BTScanInsert itup_key)
    2201             : {
    2202       97992 :     TupleDesc   itupdesc = RelationGetDescr(rel);
    2203       97992 :     int16       nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    2204             :     int         keepnatts;
    2205             :     IndexTuple  pivot;
    2206             :     IndexTuple  tidpivot;
    2207             :     ItemPointer pivotheaptid;
    2208             :     Size        newsize;
    2209             : 
    2210             :     /*
    2211             :      * We should only ever truncate non-pivot tuples from leaf pages.  It's
    2212             :      * never okay to truncate when splitting an internal page.
    2213             :      */
    2214             :     Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
    2215             : 
    2216             :     /* Determine how many attributes must be kept in truncated tuple */
    2217       97992 :     keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
    2218             : 
    2219             : #ifdef DEBUG_NO_TRUNCATE
    2220             :     /* Force truncation to be ineffective for testing purposes */
    2221             :     keepnatts = nkeyatts + 1;
    2222             : #endif
    2223             : 
    2224       97992 :     pivot = index_truncate_tuple(itupdesc, firstright,
    2225             :                                  Min(keepnatts, nkeyatts));
    2226             : 
    2227       97992 :     if (BTreeTupleIsPosting(pivot))
    2228             :     {
    2229             :         /*
    2230             :          * index_truncate_tuple() just returns a straight copy of firstright
    2231             :          * when it has no attributes to truncate.  When that happens, we may
    2232             :          * need to truncate away a posting list here instead.
    2233             :          */
    2234             :         Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
    2235             :         Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
    2236         688 :         pivot->t_info &= ~INDEX_SIZE_MASK;
    2237         688 :         pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
    2238             :     }
    2239             : 
    2240             :     /*
    2241             :      * If there is a distinguishing key attribute within pivot tuple, we're
    2242             :      * done
    2243             :      */
    2244       97992 :     if (keepnatts <= nkeyatts)
    2245             :     {
    2246       88080 :         BTreeTupleSetNAtts(pivot, keepnatts, false);
    2247       88080 :         return pivot;
    2248             :     }
    2249             : 
    2250             :     /*
    2251             :      * We have to store a heap TID in the new pivot tuple, since no non-TID
    2252             :      * key attribute value in firstright distinguishes the right side of the
    2253             :      * split from the left side.  nbtree conceptualizes this case as an
    2254             :      * inability to truncate away any key attributes, since heap TID is
    2255             :      * treated as just another key attribute (despite lacking a pg_attribute
    2256             :      * entry).
    2257             :      *
    2258             :      * Use enlarged space that holds a copy of pivot.  We need the extra space
    2259             :      * to store a heap TID at the end (using the special pivot tuple
    2260             :      * representation).  Note that the original pivot already has firstright's
    2261             :      * possible posting list/non-key attribute values removed at this point.
    2262             :      */
    2263        9912 :     newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
    2264        9912 :     tidpivot = palloc0(newsize);
    2265        9912 :     memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
    2266             :     /* Cannot leak memory here */
    2267        9912 :     pfree(pivot);
    2268             : 
    2269             :     /*
    2270             :      * Store all of firstright's key attribute values plus a tiebreaker heap
    2271             :      * TID value in enlarged pivot tuple
    2272             :      */
    2273        9912 :     tidpivot->t_info &= ~INDEX_SIZE_MASK;
    2274        9912 :     tidpivot->t_info |= newsize;
    2275        9912 :     BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
    2276        9912 :     pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
    2277             : 
    2278             :     /*
    2279             :      * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
    2280             :      * consider suffix truncation.  It seems like a good idea to follow that
    2281             :      * example in cases where no truncation takes place -- use lastleft's heap
    2282             :      * TID.  (This is also the closest value to negative infinity that's
    2283             :      * legally usable.)
    2284             :      */
    2285        9912 :     ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
    2286             : 
    2287             :     /*
    2288             :      * We're done.  Assert() that heap TID invariants hold before returning.
    2289             :      *
    2290             :      * Lehman and Yao require that the downlink to the right page, which is to
    2291             :      * be inserted into the parent page in the second phase of a page split be
    2292             :      * a strict lower bound on items on the right page, and a non-strict upper
    2293             :      * bound for items on the left page.  Assert that heap TIDs follow these
    2294             :      * invariants, since a heap TID value is apparently needed as a
    2295             :      * tiebreaker.
    2296             :      */
    2297             : #ifndef DEBUG_NO_TRUNCATE
    2298             :     Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
    2299             :                               BTreeTupleGetHeapTID(firstright)) < 0);
    2300             :     Assert(ItemPointerCompare(pivotheaptid,
    2301             :                               BTreeTupleGetHeapTID(lastleft)) >= 0);
    2302             :     Assert(ItemPointerCompare(pivotheaptid,
    2303             :                               BTreeTupleGetHeapTID(firstright)) < 0);
    2304             : #else
    2305             : 
    2306             :     /*
    2307             :      * Those invariants aren't guaranteed to hold for lastleft + firstright
    2308             :      * heap TID attribute values when they're considered here only because
    2309             :      * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
    2310             :      * needed as a tiebreaker).  DEBUG_NO_TRUNCATE must therefore use a heap
    2311             :      * TID value that always works as a strict lower bound for items to the
    2312             :      * right.  In particular, it must avoid using firstright's leading key
    2313             :      * attribute values along with lastleft's heap TID value when lastleft's
    2314             :      * TID happens to be greater than firstright's TID.
    2315             :      */
    2316             :     ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
    2317             : 
    2318             :     /*
    2319             :      * Pivot heap TID should never be fully equal to firstright.  Note that
    2320             :      * the pivot heap TID will still end up equal to lastleft's heap TID when
    2321             :      * that's the only usable value.
    2322             :      */
    2323             :     ItemPointerSetOffsetNumber(pivotheaptid,
    2324             :                                OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
    2325             :     Assert(ItemPointerCompare(pivotheaptid,
    2326             :                               BTreeTupleGetHeapTID(firstright)) < 0);
    2327             : #endif
    2328             : 
    2329        9912 :     return tidpivot;
    2330             : }
    2331             : 
    2332             : /*
    2333             :  * _bt_keep_natts - how many key attributes to keep when truncating.
    2334             :  *
    2335             :  * Caller provides two tuples that enclose a split point.  Caller's insertion
    2336             :  * scankey is used to compare the tuples; the scankey's argument values are
    2337             :  * not considered here.
    2338             :  *
    2339             :  * This can return a number of attributes that is one greater than the
    2340             :  * number of key attributes for the index relation.  This indicates that the
    2341             :  * caller must use a heap TID as a unique-ifier in new pivot tuple.
    2342             :  */
    2343             : static int
    2344       97992 : _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
    2345             :                BTScanInsert itup_key)
    2346             : {
    2347       97992 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    2348       97992 :     TupleDesc   itupdesc = RelationGetDescr(rel);
    2349             :     int         keepnatts;
    2350             :     ScanKey     scankey;
    2351             : 
    2352             :     /*
    2353             :      * _bt_compare() treats truncated key attributes as having the value minus
    2354             :      * infinity, which would break searches within !heapkeyspace indexes.  We
    2355             :      * must still truncate away non-key attribute values, though.
    2356             :      */
    2357       97992 :     if (!itup_key->heapkeyspace)
    2358           0 :         return nkeyatts;
    2359             : 
    2360       97992 :     scankey = itup_key->scankeys;
    2361       97992 :     keepnatts = 1;
    2362      156052 :     for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
    2363             :     {
    2364             :         Datum       datum1,
    2365             :                     datum2;
    2366             :         bool        isNull1,
    2367             :                     isNull2;
    2368             : 
    2369      146140 :         datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
    2370      146140 :         datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
    2371             : 
    2372      146140 :         if (isNull1 != isNull2)
    2373       88080 :             break;
    2374             : 
    2375      146140 :         if (!isNull1 &&
    2376      146140 :             DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
    2377             :                                             scankey->sk_collation,
    2378             :                                             datum1,
    2379             :                                             datum2)) != 0)
    2380       88080 :             break;
    2381             : 
    2382       58060 :         keepnatts++;
    2383             :     }
    2384             : 
    2385             :     /*
    2386             :      * Assert that _bt_keep_natts_fast() agrees with us in passing.  This is
    2387             :      * expected in an allequalimage index.
    2388             :      */
    2389             :     Assert(!itup_key->allequalimage ||
    2390             :            keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
    2391             : 
    2392       97992 :     return keepnatts;
    2393             : }
    2394             : 
    2395             : /*
    2396             :  * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
    2397             :  *
    2398             :  * This is exported so that a candidate split point can have its effect on
    2399             :  * suffix truncation inexpensively evaluated ahead of time when finding a
    2400             :  * split location.  A naive bitwise approach to datum comparisons is used to
    2401             :  * save cycles.
    2402             :  *
    2403             :  * The approach taken here usually provides the same answer as _bt_keep_natts
    2404             :  * will (for the same pair of tuples from a heapkeyspace index), since the
    2405             :  * majority of btree opclasses can never indicate that two datums are equal
    2406             :  * unless they're bitwise equal after detoasting.  When an index only has
    2407             :  * "equal image" columns, routine is guaranteed to give the same result as
    2408             :  * _bt_keep_natts would.
    2409             :  *
    2410             :  * Callers can rely on the fact that attributes considered equal here are
    2411             :  * definitely also equal according to _bt_keep_natts, even when the index uses
    2412             :  * an opclass or collation that is not "allequalimage"/deduplication-safe.
    2413             :  * This weaker guarantee is good enough for nbtsplitloc.c caller, since false
    2414             :  * negatives generally only have the effect of making leaf page splits use a
    2415             :  * more balanced split point.
    2416             :  */
    2417             : int
    2418     7575964 : _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
    2419             : {
    2420     7575964 :     TupleDesc   itupdesc = RelationGetDescr(rel);
    2421     7575964 :     int         keysz = IndexRelationGetNumberOfKeyAttributes(rel);
    2422             :     int         keepnatts;
    2423             : 
    2424     7575964 :     keepnatts = 1;
    2425     9565492 :     for (int attnum = 1; attnum <= keysz; attnum++)
    2426             :     {
    2427             :         Datum       datum1,
    2428             :                     datum2;
    2429             :         bool        isNull1,
    2430             :                     isNull2;
    2431             :         Form_pg_attribute att;
    2432             : 
    2433     8483158 :         datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
    2434     8483158 :         datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
    2435     8483158 :         att = TupleDescAttr(itupdesc, attnum - 1);
    2436             : 
    2437     8483158 :         if (isNull1 != isNull2)
    2438     6493630 :             break;
    2439             : 
    2440     8483042 :         if (!isNull1 &&
    2441     8478874 :             !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
    2442     6493514 :             break;
    2443             : 
    2444     1989528 :         keepnatts++;
    2445             :     }
    2446             : 
    2447     7575964 :     return keepnatts;
    2448             : }
    2449             : 
    2450             : /*
    2451             :  *  _bt_check_natts() -- Verify tuple has expected number of attributes.
    2452             :  *
    2453             :  * Returns value indicating if the expected number of attributes were found
    2454             :  * for a particular offset on page.  This can be used as a general purpose
    2455             :  * sanity check.
    2456             :  *
    2457             :  * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
    2458             :  * preferred to calling here.  That's usually more convenient, and is always
    2459             :  * more explicit.  Call here instead when offnum's tuple may be a negative
    2460             :  * infinity tuple that uses the pre-v11 on-disk representation, or when a low
    2461             :  * context check is appropriate.  This routine is as strict as possible about
    2462             :  * what is expected on each version of btree.
    2463             :  */
    2464             : bool
    2465     2012390 : _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
    2466             : {
    2467     2012390 :     int16       natts = IndexRelationGetNumberOfAttributes(rel);
    2468     2012390 :     int16       nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    2469     2012390 :     BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2470             :     IndexTuple  itup;
    2471             :     int         tupnatts;
    2472             : 
    2473             :     /*
    2474             :      * We cannot reliably test a deleted or half-dead page, since they have
    2475             :      * dummy high keys
    2476             :      */
    2477     2012390 :     if (P_IGNORE(opaque))
    2478           0 :         return true;
    2479             : 
    2480             :     Assert(offnum >= FirstOffsetNumber &&
    2481             :            offnum <= PageGetMaxOffsetNumber(page));
    2482             : 
    2483             :     /*
    2484             :      * Mask allocated for number of keys in index tuple must be able to fit
    2485             :      * maximum possible number of index attributes
    2486             :      */
    2487             :     StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
    2488             :                      "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
    2489             : 
    2490     2012390 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    2491     2012390 :     tupnatts = BTreeTupleGetNAtts(itup, rel);
    2492             : 
    2493             :     /* !heapkeyspace indexes do not support deduplication */
    2494     2012390 :     if (!heapkeyspace && BTreeTupleIsPosting(itup))
    2495           0 :         return false;
    2496             : 
    2497             :     /* Posting list tuples should never have "pivot heap TID" bit set */
    2498     2012390 :     if (BTreeTupleIsPosting(itup) &&
    2499          20 :         (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
    2500             :          BT_PIVOT_HEAP_TID_ATTR) != 0)
    2501           0 :         return false;
    2502             : 
    2503             :     /* INCLUDE indexes do not support deduplication */
    2504     2012390 :     if (natts != nkeyatts && BTreeTupleIsPosting(itup))
    2505           0 :         return false;
    2506             : 
    2507     2012390 :     if (P_ISLEAF(opaque))
    2508             :     {
    2509     2006230 :         if (offnum >= P_FIRSTDATAKEY(opaque))
    2510             :         {
    2511             :             /*
    2512             :              * Non-pivot tuple should never be explicitly marked as a pivot
    2513             :              * tuple
    2514             :              */
    2515     2000108 :             if (BTreeTupleIsPivot(itup))
    2516           0 :                 return false;
    2517             : 
    2518             :             /*
    2519             :              * Leaf tuples that are not the page high key (non-pivot tuples)
    2520             :              * should never be truncated.  (Note that tupnatts must have been
    2521             :              * inferred, even with a posting list tuple, because only pivot
    2522             :              * tuples store tupnatts directly.)
    2523             :              */
    2524     2000108 :             return tupnatts == natts;
    2525             :         }
    2526             :         else
    2527             :         {
    2528             :             /*
    2529             :              * Rightmost page doesn't contain a page high key, so tuple was
    2530             :              * checked above as ordinary leaf tuple
    2531             :              */
    2532             :             Assert(!P_RIGHTMOST(opaque));
    2533             : 
    2534             :             /*
    2535             :              * !heapkeyspace high key tuple contains only key attributes. Note
    2536             :              * that tupnatts will only have been explicitly represented in
    2537             :              * !heapkeyspace indexes that happen to have non-key attributes.
    2538             :              */
    2539        6122 :             if (!heapkeyspace)
    2540           0 :                 return tupnatts == nkeyatts;
    2541             : 
    2542             :             /* Use generic heapkeyspace pivot tuple handling */
    2543             :         }
    2544             :     }
    2545             :     else                        /* !P_ISLEAF(opaque) */
    2546             :     {
    2547        6160 :         if (offnum == P_FIRSTDATAKEY(opaque))
    2548             :         {
    2549             :             /*
    2550             :              * The first tuple on any internal page (possibly the first after
    2551             :              * its high key) is its negative infinity tuple.  Negative
    2552             :              * infinity tuples are always truncated to zero attributes.  They
    2553             :              * are a particular kind of pivot tuple.
    2554             :              */
    2555          34 :             if (heapkeyspace)
    2556          34 :                 return tupnatts == 0;
    2557             : 
    2558             :             /*
    2559             :              * The number of attributes won't be explicitly represented if the
    2560             :              * negative infinity tuple was generated during a page split that
    2561             :              * occurred with a version of Postgres before v11.  There must be
    2562             :              * a problem when there is an explicit representation that is
    2563             :              * non-zero, or when there is no explicit representation and the
    2564             :              * tuple is evidently not a pre-pg_upgrade tuple.
    2565             :              *
    2566             :              * Prior to v11, downlinks always had P_HIKEY as their offset.
    2567             :              * Accept that as an alternative indication of a valid
    2568             :              * !heapkeyspace negative infinity tuple.
    2569             :              */
    2570           0 :             return tupnatts == 0 ||
    2571           0 :                 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
    2572             :         }
    2573             :         else
    2574             :         {
    2575             :             /*
    2576             :              * !heapkeyspace downlink tuple with separator key contains only
    2577             :              * key attributes.  Note that tupnatts will only have been
    2578             :              * explicitly represented in !heapkeyspace indexes that happen to
    2579             :              * have non-key attributes.
    2580             :              */
    2581        6126 :             if (!heapkeyspace)
    2582           0 :                 return tupnatts == nkeyatts;
    2583             : 
    2584             :             /* Use generic heapkeyspace pivot tuple handling */
    2585             :         }
    2586             : 
    2587             :     }
    2588             : 
    2589             :     /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
    2590             :     Assert(heapkeyspace);
    2591             : 
    2592             :     /*
    2593             :      * Explicit representation of the number of attributes is mandatory with
    2594             :      * heapkeyspace index pivot tuples, regardless of whether or not there are
    2595             :      * non-key attributes.
    2596             :      */
    2597       12248 :     if (!BTreeTupleIsPivot(itup))
    2598           0 :         return false;
    2599             : 
    2600             :     /* Pivot tuple should not use posting list representation (redundant) */
    2601       12248 :     if (BTreeTupleIsPosting(itup))
    2602           0 :         return false;
    2603             : 
    2604             :     /*
    2605             :      * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
    2606             :      * when any other key attribute is truncated
    2607             :      */
    2608       12248 :     if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
    2609           0 :         return false;
    2610             : 
    2611             :     /*
    2612             :      * Pivot tuple must have at least one untruncated key attribute (minus
    2613             :      * infinity pivot tuples are the only exception).  Pivot tuples can never
    2614             :      * represent that there is a value present for a key attribute that
    2615             :      * exceeds pg_index.indnkeyatts for the index.
    2616             :      */
    2617       12248 :     return tupnatts > 0 && tupnatts <= nkeyatts;
    2618             : }
    2619             : 
    2620             : /*
    2621             :  *
    2622             :  *  _bt_check_third_page() -- check whether tuple fits on a btree page at all.
    2623             :  *
    2624             :  * We actually need to be able to fit three items on every page, so restrict
    2625             :  * any one item to 1/3 the per-page available space.  Note that itemsz should
    2626             :  * not include the ItemId overhead.
    2627             :  *
    2628             :  * It might be useful to apply TOAST methods rather than throw an error here.
    2629             :  * Using out of line storage would break assumptions made by suffix truncation
    2630             :  * and by contrib/amcheck, though.
    2631             :  */
    2632             : void
    2633         176 : _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
    2634             :                      Page page, IndexTuple newtup)
    2635             : {
    2636             :     Size        itemsz;
    2637             :     BTPageOpaque opaque;
    2638             : 
    2639         176 :     itemsz = MAXALIGN(IndexTupleSize(newtup));
    2640             : 
    2641             :     /* Double check item size against limit */
    2642         176 :     if (itemsz <= BTMaxItemSize(page))
    2643           0 :         return;
    2644             : 
    2645             :     /*
    2646             :      * Tuple is probably too large to fit on page, but it's possible that the
    2647             :      * index uses version 2 or version 3, or that page is an internal page, in
    2648             :      * which case a slightly higher limit applies.
    2649             :      */
    2650         176 :     if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
    2651         176 :         return;
    2652             : 
    2653             :     /*
    2654             :      * Internal page insertions cannot fail here, because that would mean that
    2655             :      * an earlier leaf level insertion that should have failed didn't
    2656             :      */
    2657           0 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2658           0 :     if (!P_ISLEAF(opaque))
    2659           0 :         elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
    2660             :              itemsz, RelationGetRelationName(rel));
    2661             : 
    2662           0 :     ereport(ERROR,
    2663             :             (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    2664             :              errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
    2665             :                     itemsz,
    2666             :                     needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
    2667             :                     needheaptidspace ? BTMaxItemSize(page) :
    2668             :                     BTMaxItemSizeNoHeapTid(page),
    2669             :                     RelationGetRelationName(rel)),
    2670             :              errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
    2671             :                        ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
    2672             :                        ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
    2673             :                        RelationGetRelationName(heap)),
    2674             :              errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
    2675             :                      "Consider a function index of an MD5 hash of the value, "
    2676             :                      "or use full text indexing."),
    2677             :              errtableconstraint(heap, RelationGetRelationName(rel))));
    2678             : }
    2679             : 
    2680             : /*
    2681             :  * Are all attributes in rel "equality is image equality" attributes?
    2682             :  *
    2683             :  * We use each attribute's BTEQUALIMAGE_PROC opclass procedure.  If any
    2684             :  * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
    2685             :  * return false; otherwise we return true.
    2686             :  *
    2687             :  * Returned boolean value is stored in index metapage during index builds.
    2688             :  * Deduplication can only be used when we return true.
    2689             :  */
    2690             : bool
    2691       76568 : _bt_allequalimage(Relation rel, bool debugmessage)
    2692             : {
    2693       76568 :     bool        allequalimage = true;
    2694             : 
    2695             :     /* INCLUDE indexes don't support deduplication */
    2696      153136 :     if (IndexRelationGetNumberOfAttributes(rel) !=
    2697       76568 :         IndexRelationGetNumberOfKeyAttributes(rel))
    2698         208 :         return false;
    2699             : 
    2700             :     /*
    2701             :      * There is no special reason why deduplication cannot work with system
    2702             :      * relations (i.e. with system catalog indexes and TOAST indexes).  We
    2703             :      * deem deduplication unsafe for these indexes all the same, since the
    2704             :      * alternative is to force users to always use deduplication, without
    2705             :      * being able to opt out.  (ALTER INDEX is not supported with system
    2706             :      * indexes, so users would have no way to set the deduplicate_items
    2707             :      * storage parameter to 'off'.)
    2708             :      */
    2709       76360 :     if (IsSystemRelation(rel))
    2710       68136 :         return false;
    2711             : 
    2712       17356 :     for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
    2713             :     {
    2714        9292 :         Oid         opfamily = rel->rd_opfamily[i];
    2715        9292 :         Oid         opcintype = rel->rd_opcintype[i];
    2716        9292 :         Oid         collation = rel->rd_indcollation[i];
    2717             :         Oid         equalimageproc;
    2718             : 
    2719        9292 :         equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
    2720             :                                            BTEQUALIMAGE_PROC);
    2721             : 
    2722             :         /*
    2723             :          * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
    2724             :          * be unsafe.  Otherwise, actually call proc and see what it says.
    2725             :          */
    2726       18424 :         if (!OidIsValid(equalimageproc) ||
    2727        9132 :             !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
    2728             :                                                ObjectIdGetDatum(opcintype))))
    2729             :         {
    2730         160 :             allequalimage = false;
    2731         160 :             break;
    2732             :         }
    2733             :     }
    2734             : 
    2735             :     /*
    2736             :      * Don't elog() until here to avoid reporting on a system relation index
    2737             :      * or an INCLUDE index
    2738             :      */
    2739        8224 :     if (debugmessage)
    2740             :     {
    2741        8150 :         if (allequalimage)
    2742        7990 :             elog(DEBUG1, "index \"%s\" can safely use deduplication",
    2743             :                  RelationGetRelationName(rel));
    2744             :         else
    2745         160 :             elog(DEBUG1, "index \"%s\" cannot use deduplication",
    2746             :                  RelationGetRelationName(rel));
    2747             :     }
    2748             : 
    2749        8224 :     return allequalimage;
    2750             : }

Generated by: LCOV version 1.13