LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsearch.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 627 721 87.0 %
Date: 2024-04-26 10:11:36 Functions: 19 20 95.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsearch.c
       4             :  *    Search code for postgres btrees.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtsearch.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include "access/nbtree.h"
      19             : #include "access/relscan.h"
      20             : #include "access/xact.h"
      21             : #include "miscadmin.h"
      22             : #include "pgstat.h"
      23             : #include "storage/predicate.h"
      24             : #include "utils/lsyscache.h"
      25             : #include "utils/rel.h"
      26             : 
      27             : 
      28             : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
      29             : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
      30             : static int  _bt_binsrch_posting(BTScanInsert key, Page page,
      31             :                                 OffsetNumber offnum);
      32             : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
      33             :                          OffsetNumber offnum, bool firstPage);
      34             : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
      35             :                          OffsetNumber offnum, IndexTuple itup);
      36             : static int  _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
      37             :                                   OffsetNumber offnum, ItemPointer heapTid,
      38             :                                   IndexTuple itup);
      39             : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
      40             :                                        OffsetNumber offnum,
      41             :                                        ItemPointer heapTid, int tupleOffset);
      42             : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
      43             : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
      44             : static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
      45             :                                   ScanDirection dir);
      46             : static Buffer _bt_walk_left(Relation rel, Buffer buf);
      47             : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
      48             : static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
      49             : 
      50             : 
      51             : /*
      52             :  *  _bt_drop_lock_and_maybe_pin()
      53             :  *
      54             :  * Unlock the buffer; and if it is safe to release the pin, do that, too.
      55             :  * This will prevent vacuum from stalling in a blocked state trying to read a
      56             :  * page when a cursor is sitting on it.
      57             :  *
      58             :  * See nbtree/README section on making concurrent TID recycling safe.
      59             :  */
      60             : static void
      61     7733942 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
      62             : {
      63     7733942 :     _bt_unlockbuf(scan->indexRelation, sp->buf);
      64             : 
      65     7733942 :     if (IsMVCCSnapshot(scan->xs_snapshot) &&
      66     7430428 :         RelationNeedsWAL(scan->indexRelation) &&
      67     7425234 :         !scan->xs_want_itup)
      68             :     {
      69     7319932 :         ReleaseBuffer(sp->buf);
      70     7319932 :         sp->buf = InvalidBuffer;
      71             :     }
      72     7733942 : }
      73             : 
      74             : /*
      75             :  *  _bt_search() -- Search the tree for a particular scankey,
      76             :  *      or more precisely for the first leaf page it could be on.
      77             :  *
      78             :  * The passed scankey is an insertion-type scankey (see nbtree/README),
      79             :  * but it can omit the rightmost column(s) of the index.
      80             :  *
      81             :  * Return value is a stack of parent-page pointers (i.e. there is no entry for
      82             :  * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
      83             :  * which is locked and pinned.  No locks are held on the parent pages,
      84             :  * however!
      85             :  *
      86             :  * The returned buffer is locked according to access parameter.  Additionally,
      87             :  * access = BT_WRITE will allow an empty root page to be created and returned.
      88             :  * When access = BT_READ, an empty index will result in *bufP being set to
      89             :  * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
      90             :  * during the search will be finished.
      91             :  *
      92             :  * heaprel must be provided by callers that pass access = BT_WRITE, since we
      93             :  * might need to allocate a new root page for caller -- see _bt_allocbuf.
      94             :  */
      95             : BTStack
      96    17911694 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
      97             :            int access)
      98             : {
      99    17911694 :     BTStack     stack_in = NULL;
     100    17911694 :     int         page_access = BT_READ;
     101             : 
     102             :     /* heaprel must be set whenever _bt_allocbuf is reachable */
     103             :     Assert(access == BT_READ || access == BT_WRITE);
     104             :     Assert(access == BT_READ || heaprel != NULL);
     105             : 
     106             :     /* Get the root page to start with */
     107    17911694 :     *bufP = _bt_getroot(rel, heaprel, access);
     108             : 
     109             :     /* If index is empty and access = BT_READ, no root page is created. */
     110    17911694 :     if (!BufferIsValid(*bufP))
     111      474068 :         return (BTStack) NULL;
     112             : 
     113             :     /* Loop iterates once per level descended in the tree */
     114             :     for (;;)
     115    14530944 :     {
     116             :         Page        page;
     117             :         BTPageOpaque opaque;
     118             :         OffsetNumber offnum;
     119             :         ItemId      itemid;
     120             :         IndexTuple  itup;
     121             :         BlockNumber child;
     122             :         BTStack     new_stack;
     123             : 
     124             :         /*
     125             :          * Race -- the page we just grabbed may have split since we read its
     126             :          * downlink in its parent page (or the metapage).  If it has, we may
     127             :          * need to move right to its new sibling.  Do that.
     128             :          *
     129             :          * In write-mode, allow _bt_moveright to finish any incomplete splits
     130             :          * along the way.  Strictly speaking, we'd only need to finish an
     131             :          * incomplete split on the leaf page we're about to insert to, not on
     132             :          * any of the upper levels (internal pages with incomplete splits are
     133             :          * also taken care of in _bt_getstackbuf).  But this is a good
     134             :          * opportunity to finish splits of internal pages too.
     135             :          */
     136    31968570 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
     137             :                               stack_in, page_access);
     138             : 
     139             :         /* if this is a leaf page, we're done */
     140    31968570 :         page = BufferGetPage(*bufP);
     141    31968570 :         opaque = BTPageGetOpaque(page);
     142    31968570 :         if (P_ISLEAF(opaque))
     143    17437626 :             break;
     144             : 
     145             :         /*
     146             :          * Find the appropriate pivot tuple on this page.  Its downlink points
     147             :          * to the child page that we're about to descend to.
     148             :          */
     149    14530944 :         offnum = _bt_binsrch(rel, key, *bufP);
     150    14530944 :         itemid = PageGetItemId(page, offnum);
     151    14530944 :         itup = (IndexTuple) PageGetItem(page, itemid);
     152             :         Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
     153    14530944 :         child = BTreeTupleGetDownLink(itup);
     154             : 
     155             :         /*
     156             :          * We need to save the location of the pivot tuple we chose in a new
     157             :          * stack entry for this page/level.  If caller ends up splitting a
     158             :          * page one level down, it usually ends up inserting a new pivot
     159             :          * tuple/downlink immediately after the location recorded here.
     160             :          */
     161    14530944 :         new_stack = (BTStack) palloc(sizeof(BTStackData));
     162    14530944 :         new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
     163    14530944 :         new_stack->bts_offset = offnum;
     164    14530944 :         new_stack->bts_parent = stack_in;
     165             : 
     166             :         /*
     167             :          * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
     168             :          * we're on the level 1 and asked to lock leaf page in write mode,
     169             :          * then lock next page in write mode, because it must be a leaf.
     170             :          */
     171    14530944 :         if (opaque->btpo_level == 1 && access == BT_WRITE)
     172     5466374 :             page_access = BT_WRITE;
     173             : 
     174             :         /* drop the read lock on the page, then acquire one on its child */
     175    14530944 :         *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
     176             : 
     177             :         /* okay, all set to move down a level */
     178    14530944 :         stack_in = new_stack;
     179             :     }
     180             : 
     181             :     /*
     182             :      * If we're asked to lock leaf in write mode, but didn't manage to, then
     183             :      * relock.  This should only happen when the root page is a leaf page (and
     184             :      * the only page in the index other than the metapage).
     185             :      */
     186    17437626 :     if (access == BT_WRITE && page_access == BT_READ)
     187             :     {
     188             :         /* trade in our read lock for a write lock */
     189      739592 :         _bt_unlockbuf(rel, *bufP);
     190      739592 :         _bt_lockbuf(rel, *bufP, BT_WRITE);
     191             : 
     192             :         /*
     193             :          * Race -- the leaf page may have split after we dropped the read lock
     194             :          * but before we acquired a write lock.  If it has, we may need to
     195             :          * move right to its new sibling.  Do that.
     196             :          */
     197      739592 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
     198             :     }
     199             : 
     200    17437626 :     return stack_in;
     201             : }
     202             : 
     203             : /*
     204             :  *  _bt_moveright() -- move right in the btree if necessary.
     205             :  *
     206             :  * When we follow a pointer to reach a page, it is possible that
     207             :  * the page has changed in the meanwhile.  If this happens, we're
     208             :  * guaranteed that the page has "split right" -- that is, that any
     209             :  * data that appeared on the page originally is either on the page
     210             :  * or strictly to the right of it.
     211             :  *
     212             :  * This routine decides whether or not we need to move right in the
     213             :  * tree by examining the high key entry on the page.  If that entry is
     214             :  * strictly less than the scankey, or <= the scankey in the
     215             :  * key.nextkey=true case, then we followed the wrong link and we need
     216             :  * to move right.
     217             :  *
     218             :  * The passed insertion-type scankey can omit the rightmost column(s) of the
     219             :  * index. (see nbtree/README)
     220             :  *
     221             :  * When key.nextkey is false (the usual case), we are looking for the first
     222             :  * item >= key.  When key.nextkey is true, we are looking for the first item
     223             :  * strictly greater than key.
     224             :  *
     225             :  * If forupdate is true, we will attempt to finish any incomplete splits
     226             :  * that we encounter.  This is required when locking a target page for an
     227             :  * insertion, because we don't allow inserting on a page before the split is
     228             :  * completed.  'heaprel' and 'stack' are only used if forupdate is true.
     229             :  *
     230             :  * On entry, we have the buffer pinned and a lock of the type specified by
     231             :  * 'access'.  If we move right, we release the buffer and lock and acquire
     232             :  * the same on the right sibling.  Return value is the buffer we stop at.
     233             :  */
     234             : Buffer
     235    32708162 : _bt_moveright(Relation rel,
     236             :               Relation heaprel,
     237             :               BTScanInsert key,
     238             :               Buffer buf,
     239             :               bool forupdate,
     240             :               BTStack stack,
     241             :               int access)
     242             : {
     243             :     Page        page;
     244             :     BTPageOpaque opaque;
     245             :     int32       cmpval;
     246             : 
     247             :     Assert(!forupdate || heaprel != NULL);
     248             : 
     249             :     /*
     250             :      * When nextkey = false (normal case): if the scan key that brought us to
     251             :      * this page is > the high key stored on the page, then the page has split
     252             :      * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     253             :      * have some duplicates to the right as well as the left, but that's
     254             :      * something that's only ever dealt with on the leaf level, after
     255             :      * _bt_search has found an initial leaf page.)
     256             :      *
     257             :      * When nextkey = true: move right if the scan key is >= page's high key.
     258             :      * (Note that key.scantid cannot be set in this case.)
     259             :      *
     260             :      * The page could even have split more than once, so scan as far as
     261             :      * needed.
     262             :      *
     263             :      * We also have to move right if we followed a link that brought us to a
     264             :      * dead page.
     265             :      */
     266    32708162 :     cmpval = key->nextkey ? 0 : 1;
     267             : 
     268             :     for (;;)
     269             :     {
     270    32709448 :         page = BufferGetPage(buf);
     271    32709448 :         opaque = BTPageGetOpaque(page);
     272             : 
     273    32709448 :         if (P_RIGHTMOST(opaque))
     274    25225104 :             break;
     275             : 
     276             :         /*
     277             :          * Finish any incomplete splits we encounter along the way.
     278             :          */
     279     7484344 :         if (forupdate && P_INCOMPLETE_SPLIT(opaque))
     280             :         {
     281           0 :             BlockNumber blkno = BufferGetBlockNumber(buf);
     282             : 
     283             :             /* upgrade our lock if necessary */
     284           0 :             if (access == BT_READ)
     285             :             {
     286           0 :                 _bt_unlockbuf(rel, buf);
     287           0 :                 _bt_lockbuf(rel, buf, BT_WRITE);
     288             :             }
     289             : 
     290           0 :             if (P_INCOMPLETE_SPLIT(opaque))
     291           0 :                 _bt_finish_split(rel, heaprel, buf, stack);
     292             :             else
     293           0 :                 _bt_relbuf(rel, buf);
     294             : 
     295             :             /* re-acquire the lock in the right mode, and re-check */
     296           0 :             buf = _bt_getbuf(rel, blkno, access);
     297           0 :             continue;
     298             :         }
     299             : 
     300     7484344 :         if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
     301             :         {
     302             :             /* step right one page */
     303        1286 :             buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
     304        1286 :             continue;
     305             :         }
     306             :         else
     307             :             break;
     308             :     }
     309             : 
     310    32708162 :     if (P_IGNORE(opaque))
     311           0 :         elog(ERROR, "fell off the end of index \"%s\"",
     312             :              RelationGetRelationName(rel));
     313             : 
     314    32708162 :     return buf;
     315             : }
     316             : 
     317             : /*
     318             :  *  _bt_binsrch() -- Do a binary search for a key on a particular page.
     319             :  *
     320             :  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
     321             :  * of the last key < given scankey, or last key <= given scankey if nextkey
     322             :  * is true.  (Since _bt_compare treats the first data key of such a page as
     323             :  * minus infinity, there will be at least one key < scankey, so the result
     324             :  * always points at one of the keys on the page.)
     325             :  *
     326             :  * On a leaf page, _bt_binsrch() returns the final result of the initial
     327             :  * positioning process that started with _bt_first's call to _bt_search.
     328             :  * We're returning a non-pivot tuple offset, so things are a little different.
     329             :  * It is possible that we'll return an offset that's either past the last
     330             :  * non-pivot slot, or (in the case of a backward scan) before the first slot.
     331             :  *
     332             :  * This procedure is not responsible for walking right, it just examines
     333             :  * the given page.  _bt_binsrch() has no lock or refcount side effects
     334             :  * on the buffer.
     335             :  */
     336             : static OffsetNumber
     337    25355744 : _bt_binsrch(Relation rel,
     338             :             BTScanInsert key,
     339             :             Buffer buf)
     340             : {
     341             :     Page        page;
     342             :     BTPageOpaque opaque;
     343             :     OffsetNumber low,
     344             :                 high;
     345             :     int32       result,
     346             :                 cmpval;
     347             : 
     348    25355744 :     page = BufferGetPage(buf);
     349    25355744 :     opaque = BTPageGetOpaque(page);
     350             : 
     351             :     /* Requesting nextkey semantics while using scantid seems nonsensical */
     352             :     Assert(!key->nextkey || key->scantid == NULL);
     353             :     /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
     354             :     Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
     355             : 
     356    25355744 :     low = P_FIRSTDATAKEY(opaque);
     357    25355744 :     high = PageGetMaxOffsetNumber(page);
     358             : 
     359             :     /*
     360             :      * If there are no keys on the page, return the first available slot. Note
     361             :      * this covers two cases: the page is really empty (no keys), or it
     362             :      * contains only a high key.  The latter case is possible after vacuuming.
     363             :      * This can never happen on an internal page, however, since they are
     364             :      * never empty (an internal page must have at least one child).
     365             :      */
     366    25355744 :     if (unlikely(high < low))
     367        7642 :         return low;
     368             : 
     369             :     /*
     370             :      * Binary search to find the first key on the page >= scan key, or first
     371             :      * key > scankey when nextkey is true.
     372             :      *
     373             :      * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     374             :      * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     375             :      *
     376             :      * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     377             :      * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     378             :      *
     379             :      * We can fall out when high == low.
     380             :      */
     381    25348102 :     high++;                     /* establish the loop invariant for high */
     382             : 
     383    25348102 :     cmpval = key->nextkey ? 0 : 1;   /* select comparison value */
     384             : 
     385   166375588 :     while (high > low)
     386             :     {
     387   141027486 :         OffsetNumber mid = low + ((high - low) / 2);
     388             : 
     389             :         /* We have low <= mid < high, so mid points at a real slot */
     390             : 
     391   141027486 :         result = _bt_compare(rel, key, page, mid);
     392             : 
     393   141027486 :         if (result >= cmpval)
     394    90962164 :             low = mid + 1;
     395             :         else
     396    50065322 :             high = mid;
     397             :     }
     398             : 
     399             :     /*
     400             :      * At this point we have high == low.
     401             :      *
     402             :      * On a leaf page we always return the first non-pivot tuple >= scan key
     403             :      * (resp. > scan key) for forward scan callers.  For backward scans, it's
     404             :      * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
     405             :      */
     406    25348102 :     if (P_ISLEAF(opaque))
     407             :     {
     408             :         /*
     409             :          * In the backward scan case we're supposed to locate the last
     410             :          * matching tuple on the leaf level -- not the first matching tuple
     411             :          * (the last tuple will be the first one returned by the scan).
     412             :          *
     413             :          * At this point we've located the first non-pivot tuple immediately
     414             :          * after the last matching tuple (which might just be maxoff + 1).
     415             :          * Compensate by stepping back.
     416             :          */
     417    10817158 :         if (key->backward)
     418       49502 :             return OffsetNumberPrev(low);
     419             : 
     420    10767656 :         return low;
     421             :     }
     422             : 
     423             :     /*
     424             :      * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     425             :      * There must be one if _bt_compare() is playing by the rules.
     426             :      *
     427             :      * _bt_compare() will seldom see any exactly-matching pivot tuples, since
     428             :      * a truncated -inf heap TID is usually enough to prevent it altogether.
     429             :      * Even omitted scan key entries are treated as > truncated attributes.
     430             :      *
     431             :      * However, during backward scans _bt_compare() interprets omitted scan
     432             :      * key attributes as == corresponding truncated -inf attributes instead.
     433             :      * This works just like < would work here.  Under this scheme, < strategy
     434             :      * backward scans will always directly descend to the correct leaf page.
     435             :      * In particular, they will never incur an "extra" leaf page access with a
     436             :      * scan key that happens to contain the same prefix of values as some
     437             :      * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when
     438             :      * it uses a leaf page high key to "re-find" a page undergoing deletion.
     439             :      */
     440             :     Assert(low > P_FIRSTDATAKEY(opaque));
     441             : 
     442    14530944 :     return OffsetNumberPrev(low);
     443             : }
     444             : 
     445             : /*
     446             :  *
     447             :  *  _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
     448             :  *
     449             :  * Like _bt_binsrch(), but with support for caching the binary search
     450             :  * bounds.  Only used during insertion, and only on the leaf page that it
     451             :  * looks like caller will insert tuple on.  Exclusive-locked and pinned
     452             :  * leaf page is contained within insertstate.
     453             :  *
     454             :  * Caches the bounds fields in insertstate so that a subsequent call can
     455             :  * reuse the low and strict high bounds of original binary search.  Callers
     456             :  * that use these fields directly must be prepared for the case where low
     457             :  * and/or stricthigh are not on the same page (one or both exceed maxoff
     458             :  * for the page).  The case where there are no items on the page (high <
     459             :  * low) makes bounds invalid.
     460             :  *
     461             :  * Caller is responsible for invalidating bounds when it modifies the page
     462             :  * before calling here a second time, and for dealing with posting list
     463             :  * tuple matches (callers can use insertstate's postingoff field to
     464             :  * determine which existing heap TID will need to be replaced by a posting
     465             :  * list split).
     466             :  */
     467             : OffsetNumber
     468    11164056 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
     469             : {
     470    11164056 :     BTScanInsert key = insertstate->itup_key;
     471             :     Page        page;
     472             :     BTPageOpaque opaque;
     473             :     OffsetNumber low,
     474             :                 high,
     475             :                 stricthigh;
     476             :     int32       result,
     477             :                 cmpval;
     478             : 
     479    11164056 :     page = BufferGetPage(insertstate->buf);
     480    11164056 :     opaque = BTPageGetOpaque(page);
     481             : 
     482             :     Assert(P_ISLEAF(opaque));
     483             :     Assert(!key->nextkey);
     484             :     Assert(insertstate->postingoff == 0);
     485             : 
     486    11164056 :     if (!insertstate->bounds_valid)
     487             :     {
     488             :         /* Start new binary search */
     489     6701328 :         low = P_FIRSTDATAKEY(opaque);
     490     6701328 :         high = PageGetMaxOffsetNumber(page);
     491             :     }
     492             :     else
     493             :     {
     494             :         /* Restore result of previous binary search against same page */
     495     4462728 :         low = insertstate->low;
     496     4462728 :         high = insertstate->stricthigh;
     497             :     }
     498             : 
     499             :     /* If there are no keys on the page, return the first available slot */
     500    11164056 :     if (unlikely(high < low))
     501             :     {
     502             :         /* Caller can't reuse bounds */
     503       21490 :         insertstate->low = InvalidOffsetNumber;
     504       21490 :         insertstate->stricthigh = InvalidOffsetNumber;
     505       21490 :         insertstate->bounds_valid = false;
     506       21490 :         return low;
     507             :     }
     508             : 
     509             :     /*
     510             :      * Binary search to find the first key on the page >= scan key. (nextkey
     511             :      * is always false when inserting).
     512             :      *
     513             :      * The loop invariant is: all slots before 'low' are < scan key, all slots
     514             :      * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     515             :      * maintained to save additional search effort for caller.
     516             :      *
     517             :      * We can fall out when high == low.
     518             :      */
     519    11142566 :     if (!insertstate->bounds_valid)
     520     6679838 :         high++;                 /* establish the loop invariant for high */
     521    11142566 :     stricthigh = high;          /* high initially strictly higher */
     522             : 
     523    11142566 :     cmpval = 1;                 /* !nextkey comparison value */
     524             : 
     525    60219580 :     while (high > low)
     526             :     {
     527    49077014 :         OffsetNumber mid = low + ((high - low) / 2);
     528             : 
     529             :         /* We have low <= mid < high, so mid points at a real slot */
     530             : 
     531    49077014 :         result = _bt_compare(rel, key, page, mid);
     532             : 
     533    49077014 :         if (result >= cmpval)
     534    37463050 :             low = mid + 1;
     535             :         else
     536             :         {
     537    11613964 :             high = mid;
     538    11613964 :             if (result != 0)
     539    10563210 :                 stricthigh = high;
     540             :         }
     541             : 
     542             :         /*
     543             :          * If tuple at offset located by binary search is a posting list whose
     544             :          * TID range overlaps with caller's scantid, perform posting list
     545             :          * binary search to set postingoff for caller.  Caller must split the
     546             :          * posting list when postingoff is set.  This should happen
     547             :          * infrequently.
     548             :          */
     549    49077014 :         if (unlikely(result == 0 && key->scantid != NULL))
     550             :         {
     551             :             /*
     552             :              * postingoff should never be set more than once per leaf page
     553             :              * binary search.  That would mean that there are duplicate table
     554             :              * TIDs in the index, which is never okay.  Check for that here.
     555             :              */
     556      416058 :             if (insertstate->postingoff != 0)
     557           0 :                 ereport(ERROR,
     558             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
     559             :                          errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
     560             :                                          ItemPointerGetBlockNumber(key->scantid),
     561             :                                          ItemPointerGetOffsetNumber(key->scantid),
     562             :                                          low, stricthigh,
     563             :                                          BufferGetBlockNumber(insertstate->buf),
     564             :                                          RelationGetRelationName(rel))));
     565             : 
     566      416058 :             insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
     567             :         }
     568             :     }
     569             : 
     570             :     /*
     571             :      * On a leaf page, a binary search always returns the first key >= scan
     572             :      * key (at least in !nextkey case), which could be the last slot + 1. This
     573             :      * is also the lower bound of cached search.
     574             :      *
     575             :      * stricthigh may also be the last slot + 1, which prevents caller from
     576             :      * using bounds directly, but is still useful to us if we're called a
     577             :      * second time with cached bounds (cached low will be < stricthigh when
     578             :      * that happens).
     579             :      */
     580    11142566 :     insertstate->low = low;
     581    11142566 :     insertstate->stricthigh = stricthigh;
     582    11142566 :     insertstate->bounds_valid = true;
     583             : 
     584    11142566 :     return low;
     585             : }
     586             : 
     587             : /*----------
     588             :  *  _bt_binsrch_posting() -- posting list binary search.
     589             :  *
     590             :  * Helper routine for _bt_binsrch_insert().
     591             :  *
     592             :  * Returns offset into posting list where caller's scantid belongs.
     593             :  *----------
     594             :  */
     595             : static int
     596      416058 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
     597             : {
     598             :     IndexTuple  itup;
     599             :     ItemId      itemid;
     600             :     int         low,
     601             :                 high,
     602             :                 mid,
     603             :                 res;
     604             : 
     605             :     /*
     606             :      * If this isn't a posting tuple, then the index must be corrupt (if it is
     607             :      * an ordinary non-pivot tuple then there must be an existing tuple with a
     608             :      * heap TID that equals inserter's new heap TID/scantid).  Defensively
     609             :      * check that tuple is a posting list tuple whose posting list range
     610             :      * includes caller's scantid.
     611             :      *
     612             :      * (This is also needed because contrib/amcheck's rootdescend option needs
     613             :      * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     614             :      */
     615      416058 :     itemid = PageGetItemId(page, offnum);
     616      416058 :     itup = (IndexTuple) PageGetItem(page, itemid);
     617      416058 :     if (!BTreeTupleIsPosting(itup))
     618      402196 :         return 0;
     619             : 
     620             :     Assert(key->heapkeyspace && key->allequalimage);
     621             : 
     622             :     /*
     623             :      * In the event that posting list tuple has LP_DEAD bit set, indicate this
     624             :      * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     625             :      * second call to _bt_binsrch_insert() can take place when its caller has
     626             :      * removed the dead item.
     627             :      */
     628       13862 :     if (ItemIdIsDead(itemid))
     629           2 :         return -1;
     630             : 
     631             :     /* "high" is past end of posting list for loop invariant */
     632       13860 :     low = 0;
     633       13860 :     high = BTreeTupleGetNPosting(itup);
     634             :     Assert(high >= 2);
     635             : 
     636      110186 :     while (high > low)
     637             :     {
     638       96326 :         mid = low + ((high - low) / 2);
     639       96326 :         res = ItemPointerCompare(key->scantid,
     640             :                                  BTreeTupleGetPostingN(itup, mid));
     641             : 
     642       96326 :         if (res > 0)
     643       50914 :             low = mid + 1;
     644       45412 :         else if (res < 0)
     645       45412 :             high = mid;
     646             :         else
     647           0 :             return mid;
     648             :     }
     649             : 
     650             :     /* Exact match not found */
     651       13860 :     return low;
     652             : }
     653             : 
     654             : /*----------
     655             :  *  _bt_compare() -- Compare insertion-type scankey to tuple on a page.
     656             :  *
     657             :  *  page/offnum: location of btree item to be compared to.
     658             :  *
     659             :  *      This routine returns:
     660             :  *          <0 if scankey < tuple at offnum;
     661             :  *           0 if scankey == tuple at offnum;
     662             :  *          >0 if scankey > tuple at offnum.
     663             :  *
     664             :  * NULLs in the keys are treated as sortable values.  Therefore
     665             :  * "equality" does not necessarily mean that the item should be returned
     666             :  * to the caller as a matching key.  Similarly, an insertion scankey
     667             :  * with its scantid set is treated as equal to a posting tuple whose TID
     668             :  * range overlaps with their scantid.  There generally won't be a
     669             :  * matching TID in the posting tuple, which caller must handle
     670             :  * themselves (e.g., by splitting the posting list tuple).
     671             :  *
     672             :  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
     673             :  * "minus infinity": this routine will always claim it is less than the
     674             :  * scankey.  The actual key value stored is explicitly truncated to 0
     675             :  * attributes (explicitly minus infinity) with version 3+ indexes, but
     676             :  * that isn't relied upon.  This allows us to implement the Lehman and
     677             :  * Yao convention that the first down-link pointer is before the first
     678             :  * key.  See backend/access/nbtree/README for details.
     679             :  *----------
     680             :  */
     681             : int32
     682   214565662 : _bt_compare(Relation rel,
     683             :             BTScanInsert key,
     684             :             Page page,
     685             :             OffsetNumber offnum)
     686             : {
     687   214565662 :     TupleDesc   itupdesc = RelationGetDescr(rel);
     688   214565662 :     BTPageOpaque opaque = BTPageGetOpaque(page);
     689             :     IndexTuple  itup;
     690             :     ItemPointer heapTid;
     691             :     ScanKey     scankey;
     692             :     int         ncmpkey;
     693             :     int         ntupatts;
     694             :     int32       result;
     695             : 
     696             :     Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
     697             :     Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
     698             :     Assert(key->heapkeyspace || key->scantid == NULL);
     699             : 
     700             :     /*
     701             :      * Force result ">" if target item is first data item on an internal page
     702             :      * --- see NOTE above.
     703             :      */
     704   214565662 :     if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
     705     2313350 :         return 1;
     706             : 
     707   212252312 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     708   212252312 :     ntupatts = BTreeTupleGetNAtts(itup, rel);
     709             : 
     710             :     /*
     711             :      * The scan key is set up with the attribute number associated with each
     712             :      * term in the key.  It is important that, if the index is multi-key, the
     713             :      * scan contain the first k key attributes, and that they be in order.  If
     714             :      * you think about how multi-key ordering works, you'll understand why
     715             :      * this is.
     716             :      *
     717             :      * We don't test for violation of this condition here, however.  The
     718             :      * initial setup for the index scan had better have gotten it right (see
     719             :      * _bt_first).
     720             :      */
     721             : 
     722   212252312 :     ncmpkey = Min(ntupatts, key->keysz);
     723             :     Assert(key->heapkeyspace || ncmpkey == key->keysz);
     724             :     Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
     725   212252312 :     scankey = key->scankeys;
     726   269028626 :     for (int i = 1; i <= ncmpkey; i++)
     727             :     {
     728             :         Datum       datum;
     729             :         bool        isNull;
     730             : 
     731   249622082 :         datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
     732             : 
     733   249622082 :         if (scankey->sk_flags & SK_ISNULL)   /* key is NULL */
     734             :         {
     735      538758 :             if (isNull)
     736      157024 :                 result = 0;     /* NULL "=" NULL */
     737      381734 :             else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     738         264 :                 result = -1;    /* NULL "<" NOT_NULL */
     739             :             else
     740      381470 :                 result = 1;     /* NULL ">" NOT_NULL */
     741             :         }
     742   249083324 :         else if (isNull)        /* key is NOT_NULL and item is NULL */
     743             :         {
     744         222 :             if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     745           0 :                 result = 1;     /* NOT_NULL ">" NULL */
     746             :             else
     747         222 :                 result = -1;    /* NOT_NULL "<" NULL */
     748             :         }
     749             :         else
     750             :         {
     751             :             /*
     752             :              * The sk_func needs to be passed the index value as left arg and
     753             :              * the sk_argument as right arg (they might be of different
     754             :              * types).  Since it is convenient for callers to think of
     755             :              * _bt_compare as comparing the scankey to the index item, we have
     756             :              * to flip the sign of the comparison result.  (Unless it's a DESC
     757             :              * column, in which case we *don't* flip the sign.)
     758             :              */
     759   249083102 :             result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
     760             :                                                      scankey->sk_collation,
     761             :                                                      datum,
     762             :                                                      scankey->sk_argument));
     763             : 
     764   249083102 :             if (!(scankey->sk_flags & SK_BT_DESC))
     765   249083036 :                 INVERT_COMPARE_RESULT(result);
     766             :         }
     767             : 
     768             :         /* if the keys are unequal, return the difference */
     769   249622082 :         if (result != 0)
     770   192845768 :             return result;
     771             : 
     772    56776314 :         scankey++;
     773             :     }
     774             : 
     775             :     /*
     776             :      * All non-truncated attributes (other than heap TID) were found to be
     777             :      * equal.  Treat truncated attributes as minus infinity when scankey has a
     778             :      * key attribute value that would otherwise be compared directly.
     779             :      *
     780             :      * Note: it doesn't matter if ntupatts includes non-key attributes;
     781             :      * scankey won't, so explicitly excluding non-key attributes isn't
     782             :      * necessary.
     783             :      */
     784    19406544 :     if (key->keysz > ntupatts)
     785      199080 :         return 1;
     786             : 
     787             :     /*
     788             :      * Use the heap TID attribute and scantid to try to break the tie.  The
     789             :      * rules are the same as any other key attribute -- only the
     790             :      * representation differs.
     791             :      */
     792    19207464 :     heapTid = BTreeTupleGetHeapTID(itup);
     793    19207464 :     if (key->scantid == NULL)
     794             :     {
     795             :         /*
     796             :          * Forward scans have a scankey that is considered greater than a
     797             :          * truncated pivot tuple if and when the scankey has equal values for
     798             :          * attributes up to and including the least significant untruncated
     799             :          * attribute in tuple.  Even attributes that were omitted from the
     800             :          * scan key are considered greater than -inf truncated attributes.
     801             :          * (See _bt_binsrch for an explanation of our backward scan behavior.)
     802             :          *
     803             :          * For example, if an index has the minimum two attributes (single
     804             :          * user key attribute, plus heap TID attribute), and a page's high key
     805             :          * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
     806             :          * will not descend to the page to the left.  The search will descend
     807             :          * right instead.  The truncated attribute in pivot tuple means that
     808             :          * all non-pivot tuples on the page to the left are strictly < 'foo',
     809             :          * so it isn't necessary to descend left.  In other words, search
     810             :          * doesn't have to descend left because it isn't interested in a match
     811             :          * that has a heap TID value of -inf.
     812             :          *
     813             :          * Note: the heap TID part of the test ensures that scankey is being
     814             :          * compared to a pivot tuple with one or more truncated -inf key
     815             :          * attributes.  The heap TID attribute is the last key attribute in
     816             :          * every index, of course, but other than that it isn't special.
     817             :          */
     818    15324500 :         if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
     819        9180 :             key->heapkeyspace)
     820        9180 :             return 1;
     821             : 
     822             :         /* All provided scankey arguments found to be equal */
     823    15315320 :         return 0;
     824             :     }
     825             : 
     826             :     /*
     827             :      * Treat truncated heap TID as minus infinity, since scankey has a key
     828             :      * attribute value (scantid) that would otherwise be compared directly
     829             :      */
     830             :     Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
     831     3882964 :     if (heapTid == NULL)
     832        3920 :         return 1;
     833             : 
     834             :     /*
     835             :      * Scankey must be treated as equal to a posting list tuple if its scantid
     836             :      * value falls within the range of the posting list.  In all other cases
     837             :      * there can only be a single heap TID value, which is compared directly
     838             :      * with scantid.
     839             :      */
     840             :     Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
     841     3879044 :     result = ItemPointerCompare(key->scantid, heapTid);
     842     3879044 :     if (result <= 0 || !BTreeTupleIsPosting(itup))
     843     3738496 :         return result;
     844             :     else
     845             :     {
     846      140548 :         result = ItemPointerCompare(key->scantid,
     847             :                                     BTreeTupleGetMaxHeapTID(itup));
     848      140548 :         if (result > 0)
     849      126686 :             return 1;
     850             :     }
     851             : 
     852       13862 :     return 0;
     853             : }
     854             : 
     855             : /*
     856             :  *  _bt_first() -- Find the first item in a scan.
     857             :  *
     858             :  *      We need to be clever about the direction of scan, the search
     859             :  *      conditions, and the tree ordering.  We find the first item (or,
     860             :  *      if backwards scan, the last item) in the tree that satisfies the
     861             :  *      qualifications in the scan key.  On success exit, the page containing
     862             :  *      the current index tuple is pinned but not locked, and data about
     863             :  *      the matching tuple(s) on the page has been loaded into so->currPos.
     864             :  *      scan->xs_heaptid is set to the heap TID of the current tuple, and if
     865             :  *      requested, scan->xs_itup points to a copy of the index tuple.
     866             :  *
     867             :  * If there are no matching items in the index, we return false, with no
     868             :  * pins or locks held.
     869             :  *
     870             :  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
     871             :  * are both search-type scankeys (see nbtree/README for more about this).
     872             :  * Within this routine, we build a temporary insertion-type scankey to use
     873             :  * in locating the scan start position.
     874             :  */
     875             : bool
     876    11371980 : _bt_first(IndexScanDesc scan, ScanDirection dir)
     877             : {
     878    11371980 :     Relation    rel = scan->indexRelation;
     879    11371980 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     880             :     Buffer      buf;
     881             :     BTStack     stack;
     882             :     OffsetNumber offnum;
     883             :     StrategyNumber strat;
     884             :     BTScanInsertData inskey;
     885             :     ScanKey     startKeys[INDEX_MAX_KEYS];
     886             :     ScanKeyData notnullkeys[INDEX_MAX_KEYS];
     887    11371980 :     int         keysz = 0;
     888             :     int         i;
     889             :     bool        status;
     890             :     StrategyNumber strat_total;
     891             :     BTScanPosItem *currItem;
     892             :     BlockNumber blkno;
     893             : 
     894             :     Assert(!BTScanPosIsValid(so->currPos));
     895             : 
     896    11371980 :     pgstat_count_index_scan(rel);
     897             : 
     898             :     /*
     899             :      * Examine the scan keys and eliminate any redundant keys; also mark the
     900             :      * keys that must be matched to continue the scan.
     901             :      */
     902    11371980 :     _bt_preprocess_keys(scan);
     903             : 
     904             :     /*
     905             :      * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     906             :      * never be satisfied (eg, x == 1 AND x > 2).
     907             :      */
     908    11371980 :     if (!so->qual_ok)
     909             :     {
     910        2708 :         _bt_parallel_done(scan);
     911        2708 :         return false;
     912             :     }
     913             : 
     914             :     /*
     915             :      * For parallel scans, get the starting page from shared state. If the
     916             :      * scan has not started, proceed to find out first leaf page in the usual
     917             :      * way while keeping other participating processes waiting.  If the scan
     918             :      * has already begun, use the page number from the shared structure.
     919             :      *
     920             :      * When a parallel scan has another primitive index scan scheduled, a
     921             :      * parallel worker will seize the scan for that purpose now.  This is
     922             :      * similar to the case where the top-level scan hasn't started.
     923             :      */
     924    11369272 :     if (scan->parallel_scan != NULL)
     925             :     {
     926         434 :         status = _bt_parallel_seize(scan, &blkno, true);
     927             : 
     928             :         /*
     929             :          * Initialize arrays (when _bt_parallel_seize didn't already set up
     930             :          * the next primitive index scan)
     931             :          */
     932         434 :         if (so->numArrayKeys && !so->needPrimScan)
     933          30 :             _bt_start_array_keys(scan, dir);
     934             : 
     935         434 :         if (!status)
     936         316 :             return false;
     937         118 :         else if (blkno == P_NONE)
     938             :         {
     939           0 :             _bt_parallel_done(scan);
     940           0 :             return false;
     941             :         }
     942         118 :         else if (blkno != InvalidBlockNumber)
     943             :         {
     944           0 :             if (!_bt_parallel_readpage(scan, blkno, dir))
     945           0 :                 return false;
     946           0 :             goto readcomplete;
     947             :         }
     948             :     }
     949    11368838 :     else if (so->numArrayKeys && !so->needPrimScan)
     950             :     {
     951             :         /*
     952             :          * First _bt_first call (for current btrescan) without parallelism.
     953             :          *
     954             :          * Initialize arrays, and the corresponding scan keys that were just
     955             :          * output by _bt_preprocess_keys.
     956             :          */
     957         730 :         _bt_start_array_keys(scan, dir);
     958             :     }
     959             : 
     960             :     /*----------
     961             :      * Examine the scan keys to discover where we need to start the scan.
     962             :      *
     963             :      * We want to identify the keys that can be used as starting boundaries;
     964             :      * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     965             :      * a backwards scan.  We can use keys for multiple attributes so long as
     966             :      * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     967             :      * a > or < boundary or find an attribute with no boundary (which can be
     968             :      * thought of as the same as "> -infinity"), we can't use keys for any
     969             :      * attributes to its right, because it would break our simplistic notion
     970             :      * of what initial positioning strategy to use.
     971             :      *
     972             :      * When the scan keys include cross-type operators, _bt_preprocess_keys
     973             :      * may not be able to eliminate redundant keys; in such cases we will
     974             :      * arbitrarily pick a usable one for each attribute.  This is correct
     975             :      * but possibly not optimal behavior.  (For example, with keys like
     976             :      * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     977             :      * x=5 would be more efficient.)  Since the situation only arises given
     978             :      * a poorly-worded query plus an incomplete opfamily, live with it.
     979             :      *
     980             :      * When both equality and inequality keys appear for a single attribute
     981             :      * (again, only possible when cross-type operators appear), we *must*
     982             :      * select one of the equality keys for the starting point, because
     983             :      * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     984             :      * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     985             :      * start at x=4, we will fail and stop before reaching x=10.  If multiple
     986             :      * equality quals survive preprocessing, however, it doesn't matter which
     987             :      * one we use --- by definition, they are either redundant or
     988             :      * contradictory.
     989             :      *
     990             :      * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     991             :      * If the index stores nulls at the end of the index we'll be starting
     992             :      * from, and we have no boundary key for the column (which means the key
     993             :      * we deduced NOT NULL from is an inequality key that constrains the other
     994             :      * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     995             :      * use as a boundary key.  If we didn't do this, we might find ourselves
     996             :      * traversing a lot of null entries at the start of the scan.
     997             :      *
     998             :      * In this loop, row-comparison keys are treated the same as keys on their
     999             :      * first (leftmost) columns.  We'll add on lower-order columns of the row
    1000             :      * comparison below, if possible.
    1001             :      *
    1002             :      * The selected scan keys (at most one per index column) are remembered by
    1003             :      * storing their addresses into the local startKeys[] array.
    1004             :      *
    1005             :      * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
    1006             :      * the next primitive index scan (for scans with array keys) based in part
    1007             :      * on an understanding of how it'll enable us to reposition the scan.
    1008             :      * They're directly aware of how we'll sometimes cons up an explicit
    1009             :      * SK_SEARCHNOTNULL key.  They'll even end primitive scans by applying a
    1010             :      * symmetric "deduce NOT NULL" rule of their own.  This allows top-level
    1011             :      * scans to skip large groups of NULLs through repeated deductions about
    1012             :      * key strictness (for a required inequality key) and whether NULLs in the
    1013             :      * key's index column are stored last or first (relative to non-NULLs).
    1014             :      * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
    1015             :      * need to be kept in sync.
    1016             :      *----------
    1017             :      */
    1018    11368956 :     strat_total = BTEqualStrategyNumber;
    1019    11368956 :     if (so->numberOfKeys > 0)
    1020             :     {
    1021             :         AttrNumber  curattr;
    1022             :         ScanKey     chosen;
    1023             :         ScanKey     impliesNN;
    1024             :         ScanKey     cur;
    1025             : 
    1026             :         /*
    1027             :          * chosen is the so-far-chosen key for the current attribute, if any.
    1028             :          * We don't cast the decision in stone until we reach keys for the
    1029             :          * next attribute.
    1030             :          */
    1031    11358336 :         curattr = 1;
    1032    11358336 :         chosen = NULL;
    1033             :         /* Also remember any scankey that implies a NOT NULL constraint */
    1034    11358336 :         impliesNN = NULL;
    1035             : 
    1036             :         /*
    1037             :          * Loop iterates from 0 to numberOfKeys inclusive; we use the last
    1038             :          * pass to handle after-last-key processing.  Actual exit from the
    1039             :          * loop is at one of the "break" statements below.
    1040             :          */
    1041    11358336 :         for (cur = so->keyData, i = 0;; cur++, i++)
    1042             :         {
    1043    29823266 :             if (i >= so->numberOfKeys || cur->sk_attno != curattr)
    1044             :             {
    1045             :                 /*
    1046             :                  * Done looking at keys for curattr.  If we didn't find a
    1047             :                  * usable boundary key, see if we can deduce a NOT NULL key.
    1048             :                  */
    1049    18528412 :                 if (chosen == NULL && impliesNN != NULL &&
    1050       62622 :                     ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1051             :                      ScanDirectionIsForward(dir) :
    1052             :                      ScanDirectionIsBackward(dir)))
    1053             :                 {
    1054             :                     /* Yes, so build the key in notnullkeys[keysz] */
    1055           6 :                     chosen = &notnullkeys[keysz];
    1056           6 :                     ScanKeyEntryInitialize(chosen,
    1057             :                                            (SK_SEARCHNOTNULL | SK_ISNULL |
    1058           6 :                                             (impliesNN->sk_flags &
    1059             :                                              (SK_BT_DESC | SK_BT_NULLS_FIRST))),
    1060             :                                            curattr,
    1061           6 :                                            ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1062             :                                             BTGreaterStrategyNumber :
    1063             :                                             BTLessStrategyNumber),
    1064             :                                            InvalidOid,
    1065             :                                            InvalidOid,
    1066             :                                            InvalidOid,
    1067             :                                            (Datum) 0);
    1068             :                 }
    1069             : 
    1070             :                 /*
    1071             :                  * If we still didn't find a usable boundary key, quit; else
    1072             :                  * save the boundary key pointer in startKeys.
    1073             :                  */
    1074    18465790 :                 if (chosen == NULL)
    1075       65308 :                     break;
    1076    18400482 :                 startKeys[keysz++] = chosen;
    1077             : 
    1078             :                 /*
    1079             :                  * Adjust strat_total, and quit if we have stored a > or <
    1080             :                  * key.
    1081             :                  */
    1082    18400482 :                 strat = chosen->sk_strategy;
    1083    18400482 :                 if (strat != BTEqualStrategyNumber)
    1084             :                 {
    1085     1079352 :                     strat_total = strat;
    1086     1079352 :                     if (strat == BTGreaterStrategyNumber ||
    1087             :                         strat == BTLessStrategyNumber)
    1088             :                         break;
    1089             :                 }
    1090             : 
    1091             :                 /*
    1092             :                  * Done if that was the last attribute, or if next key is not
    1093             :                  * in sequence (implying no boundary key is available for the
    1094             :                  * next attribute).
    1095             :                  */
    1096    17325458 :                 if (i >= so->numberOfKeys ||
    1097     7107934 :                     cur->sk_attno != curattr + 1)
    1098             :                     break;
    1099             : 
    1100             :                 /*
    1101             :                  * Reset for next attr.
    1102             :                  */
    1103     7107454 :                 curattr = cur->sk_attno;
    1104     7107454 :                 chosen = NULL;
    1105     7107454 :                 impliesNN = NULL;
    1106             :             }
    1107             : 
    1108             :             /*
    1109             :              * Can we use this key as a starting boundary for this attr?
    1110             :              *
    1111             :              * If not, does it imply a NOT NULL constraint?  (Because
    1112             :              * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
    1113             :              * *any* inequality key works for that; we need not test.)
    1114             :              */
    1115    18464930 :             switch (cur->sk_strategy)
    1116             :             {
    1117      113786 :                 case BTLessStrategyNumber:
    1118             :                 case BTLessEqualStrategyNumber:
    1119      113786 :                     if (chosen == NULL)
    1120             :                     {
    1121      111966 :                         if (ScanDirectionIsBackward(dir))
    1122       49362 :                             chosen = cur;
    1123             :                         else
    1124       62604 :                             impliesNN = cur;
    1125             :                     }
    1126      113786 :                     break;
    1127    17321130 :                 case BTEqualStrategyNumber:
    1128             :                     /* override any non-equality choice */
    1129    17321130 :                     chosen = cur;
    1130    17321130 :                     break;
    1131     1030014 :                 case BTGreaterEqualStrategyNumber:
    1132             :                 case BTGreaterStrategyNumber:
    1133     1030014 :                     if (chosen == NULL)
    1134             :                     {
    1135     1030014 :                         if (ScanDirectionIsForward(dir))
    1136     1029984 :                             chosen = cur;
    1137             :                         else
    1138          30 :                             impliesNN = cur;
    1139             :                     }
    1140     1030014 :                     break;
    1141             :             }
    1142    18464930 :         }
    1143             :     }
    1144             : 
    1145             :     /*
    1146             :      * If we found no usable boundary keys, we have to start from one end of
    1147             :      * the tree.  Walk down that edge to the first or last key, and scan from
    1148             :      * there.
    1149             :      */
    1150    11368956 :     if (keysz == 0)
    1151             :     {
    1152             :         bool        match;
    1153             : 
    1154       75776 :         match = _bt_endpoint(scan, dir);
    1155             : 
    1156       75776 :         if (!match)
    1157             :         {
    1158             :             /* No match, so mark (parallel) scan finished */
    1159        8026 :             _bt_parallel_done(scan);
    1160             :         }
    1161             : 
    1162       75776 :         return match;
    1163             :     }
    1164             : 
    1165             :     /*
    1166             :      * We want to start the scan somewhere within the index.  Set up an
    1167             :      * insertion scankey we can use to search for the boundary point we
    1168             :      * identified above.  The insertion scankey is built using the keys
    1169             :      * identified by startKeys[].  (Remaining insertion scankey fields are
    1170             :      * initialized after initial-positioning strategy is finalized.)
    1171             :      */
    1172             :     Assert(keysz <= INDEX_MAX_KEYS);
    1173    29693638 :     for (i = 0; i < keysz; i++)
    1174             :     {
    1175    18400482 :         ScanKey     cur = startKeys[i];
    1176             : 
    1177             :         Assert(cur->sk_attno == i + 1);
    1178             : 
    1179    18400482 :         if (cur->sk_flags & SK_ROW_HEADER)
    1180             :         {
    1181             :             /*
    1182             :              * Row comparison header: look to the first row member instead.
    1183             :              *
    1184             :              * The member scankeys are already in insertion format (ie, they
    1185             :              * have sk_func = 3-way-comparison function), but we have to watch
    1186             :              * out for nulls, which _bt_preprocess_keys didn't check. A null
    1187             :              * in the first row member makes the condition unmatchable, just
    1188             :              * like qual_ok = false.
    1189             :              */
    1190          24 :             ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
    1191             : 
    1192             :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1193          24 :             if (subkey->sk_flags & SK_ISNULL)
    1194             :             {
    1195           0 :                 _bt_parallel_done(scan);
    1196           0 :                 return false;
    1197             :             }
    1198          24 :             memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
    1199             : 
    1200             :             /*
    1201             :              * If the row comparison is the last positioning key we accepted,
    1202             :              * try to add additional keys from the lower-order row members.
    1203             :              * (If we accepted independent conditions on additional index
    1204             :              * columns, we use those instead --- doesn't seem worth trying to
    1205             :              * determine which is more restrictive.)  Note that this is OK
    1206             :              * even if the row comparison is of ">" or "<" type, because the
    1207             :              * condition applied to all but the last row member is effectively
    1208             :              * ">=" or "<=", and so the extra keys don't break the positioning
    1209             :              * scheme.  But, by the same token, if we aren't able to use all
    1210             :              * the row members, then the part of the row comparison that we
    1211             :              * did use has to be treated as just a ">=" or "<=" condition, and
    1212             :              * so we'd better adjust strat_total accordingly.
    1213             :              */
    1214          24 :             if (i == keysz - 1)
    1215             :             {
    1216          24 :                 bool        used_all_subkeys = false;
    1217             : 
    1218             :                 Assert(!(subkey->sk_flags & SK_ROW_END));
    1219             :                 for (;;)
    1220             :                 {
    1221          24 :                     subkey++;
    1222             :                     Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1223          24 :                     if (subkey->sk_attno != keysz + 1)
    1224           0 :                         break;  /* out-of-sequence, can't use it */
    1225          24 :                     if (subkey->sk_strategy != cur->sk_strategy)
    1226           0 :                         break;  /* wrong direction, can't use it */
    1227          24 :                     if (subkey->sk_flags & SK_ISNULL)
    1228           0 :                         break;  /* can't use null keys */
    1229             :                     Assert(keysz < INDEX_MAX_KEYS);
    1230          24 :                     memcpy(inskey.scankeys + keysz, subkey,
    1231             :                            sizeof(ScanKeyData));
    1232          24 :                     keysz++;
    1233          24 :                     if (subkey->sk_flags & SK_ROW_END)
    1234             :                     {
    1235          24 :                         used_all_subkeys = true;
    1236          24 :                         break;
    1237             :                     }
    1238             :                 }
    1239          24 :                 if (!used_all_subkeys)
    1240             :                 {
    1241           0 :                     switch (strat_total)
    1242             :                     {
    1243           0 :                         case BTLessStrategyNumber:
    1244           0 :                             strat_total = BTLessEqualStrategyNumber;
    1245           0 :                             break;
    1246           0 :                         case BTGreaterStrategyNumber:
    1247           0 :                             strat_total = BTGreaterEqualStrategyNumber;
    1248           0 :                             break;
    1249             :                     }
    1250          24 :                 }
    1251          24 :                 break;          /* done with outer loop */
    1252             :             }
    1253             :         }
    1254             :         else
    1255             :         {
    1256             :             /*
    1257             :              * Ordinary comparison key.  Transform the search-style scan key
    1258             :              * to an insertion scan key by replacing the sk_func with the
    1259             :              * appropriate btree comparison function.
    1260             :              *
    1261             :              * If scankey operator is not a cross-type comparison, we can use
    1262             :              * the cached comparison function; otherwise gotta look it up in
    1263             :              * the catalogs.  (That can't lead to infinite recursion, since no
    1264             :              * indexscan initiated by syscache lookup will use cross-data-type
    1265             :              * operators.)
    1266             :              *
    1267             :              * We support the convention that sk_subtype == InvalidOid means
    1268             :              * the opclass input type; this is a hack to simplify life for
    1269             :              * ScanKeyInit().
    1270             :              */
    1271    18400458 :             if (cur->sk_subtype == rel->rd_opcintype[i] ||
    1272    17795268 :                 cur->sk_subtype == InvalidOid)
    1273    18386800 :             {
    1274             :                 FmgrInfo   *procinfo;
    1275             : 
    1276    18386800 :                 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
    1277    18386800 :                 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
    1278             :                                                cur->sk_flags,
    1279    18386800 :                                                cur->sk_attno,
    1280             :                                                InvalidStrategy,
    1281             :                                                cur->sk_subtype,
    1282             :                                                cur->sk_collation,
    1283             :                                                procinfo,
    1284             :                                                cur->sk_argument);
    1285             :             }
    1286             :             else
    1287             :             {
    1288             :                 RegProcedure cmp_proc;
    1289             : 
    1290       13658 :                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
    1291       13658 :                                              rel->rd_opcintype[i],
    1292             :                                              cur->sk_subtype,
    1293             :                                              BTORDER_PROC);
    1294       13658 :                 if (!RegProcedureIsValid(cmp_proc))
    1295           0 :                     elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
    1296             :                          BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
    1297             :                          cur->sk_attno, RelationGetRelationName(rel));
    1298       13658 :                 ScanKeyEntryInitialize(inskey.scankeys + i,
    1299             :                                        cur->sk_flags,
    1300       13658 :                                        cur->sk_attno,
    1301             :                                        InvalidStrategy,
    1302             :                                        cur->sk_subtype,
    1303             :                                        cur->sk_collation,
    1304             :                                        cmp_proc,
    1305             :                                        cur->sk_argument);
    1306             :             }
    1307             :         }
    1308             :     }
    1309             : 
    1310             :     /*----------
    1311             :      * Examine the selected initial-positioning strategy to determine exactly
    1312             :      * where we need to start the scan, and set flag variables to control the
    1313             :      * initial descent by _bt_search (and our _bt_binsrch call for the leaf
    1314             :      * page _bt_search returns).
    1315             :      *----------
    1316             :      */
    1317    11293180 :     _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
    1318    11293180 :     inskey.anynullkeys = false; /* unused */
    1319    11293180 :     inskey.scantid = NULL;
    1320    11293180 :     inskey.keysz = keysz;
    1321    11293180 :     switch (strat_total)
    1322             :     {
    1323       49362 :         case BTLessStrategyNumber:
    1324             : 
    1325       49362 :             inskey.nextkey = false;
    1326       49362 :             inskey.backward = true;
    1327       49362 :             break;
    1328             : 
    1329           6 :         case BTLessEqualStrategyNumber:
    1330             : 
    1331           6 :             inskey.nextkey = true;
    1332           6 :             inskey.backward = true;
    1333           6 :             break;
    1334             : 
    1335    10213828 :         case BTEqualStrategyNumber:
    1336             : 
    1337             :             /*
    1338             :              * If a backward scan was specified, need to start with last equal
    1339             :              * item not first one.
    1340             :              */
    1341    10213828 :             if (ScanDirectionIsBackward(dir))
    1342             :             {
    1343             :                 /*
    1344             :                  * This is the same as the <= strategy
    1345             :                  */
    1346         148 :                 inskey.nextkey = true;
    1347         148 :                 inskey.backward = true;
    1348             :             }
    1349             :             else
    1350             :             {
    1351             :                 /*
    1352             :                  * This is the same as the >= strategy
    1353             :                  */
    1354    10213680 :                 inskey.nextkey = false;
    1355    10213680 :                 inskey.backward = false;
    1356             :             }
    1357    10213828 :             break;
    1358             : 
    1359        4322 :         case BTGreaterEqualStrategyNumber:
    1360             : 
    1361             :             /*
    1362             :              * Find first item >= scankey
    1363             :              */
    1364        4322 :             inskey.nextkey = false;
    1365        4322 :             inskey.backward = false;
    1366        4322 :             break;
    1367             : 
    1368     1025662 :         case BTGreaterStrategyNumber:
    1369             : 
    1370             :             /*
    1371             :              * Find first item > scankey
    1372             :              */
    1373     1025662 :             inskey.nextkey = true;
    1374     1025662 :             inskey.backward = false;
    1375     1025662 :             break;
    1376             : 
    1377           0 :         default:
    1378             :             /* can't get here, but keep compiler quiet */
    1379           0 :             elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
    1380             :             return false;
    1381             :     }
    1382             : 
    1383             :     /*
    1384             :      * Use the manufactured insertion scan key to descend the tree and
    1385             :      * position ourselves on the target leaf page.
    1386             :      */
    1387             :     Assert(ScanDirectionIsBackward(dir) == inskey.backward);
    1388    11293180 :     stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
    1389             : 
    1390             :     /* don't need to keep the stack around... */
    1391    11293180 :     _bt_freestack(stack);
    1392             : 
    1393    11293180 :     if (!BufferIsValid(buf))
    1394             :     {
    1395             :         /*
    1396             :          * We only get here if the index is completely empty. Lock relation
    1397             :          * because nothing finer to lock exists.  Without a buffer lock, it's
    1398             :          * possible for another transaction to insert data between
    1399             :          * _bt_search() and PredicateLockRelation().  We have to try again
    1400             :          * after taking the relation-level predicate lock, to close a narrow
    1401             :          * window where we wouldn't scan concurrently inserted tuples, but the
    1402             :          * writer wouldn't see our predicate lock.
    1403             :          */
    1404      468380 :         if (IsolationIsSerializable())
    1405             :         {
    1406        5688 :             PredicateLockRelation(rel, scan->xs_snapshot);
    1407        5688 :             stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
    1408        5688 :             _bt_freestack(stack);
    1409             :         }
    1410             : 
    1411      468380 :         if (!BufferIsValid(buf))
    1412             :         {
    1413             :             /*
    1414             :              * Mark parallel scan as done, so that all the workers can finish
    1415             :              * their scan.
    1416             :              */
    1417      468380 :             _bt_parallel_done(scan);
    1418      468380 :             BTScanPosInvalidate(so->currPos);
    1419      468380 :             return false;
    1420             :         }
    1421             :     }
    1422             : 
    1423    10824800 :     PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
    1424             : 
    1425    10824800 :     _bt_initialize_more_data(so, dir);
    1426             : 
    1427             :     /* position to the precise item on the page */
    1428    10824800 :     offnum = _bt_binsrch(rel, &inskey, buf);
    1429             :     Assert(!BTScanPosIsValid(so->currPos));
    1430    10824800 :     so->currPos.buf = buf;
    1431             : 
    1432             :     /*
    1433             :      * Now load data from the first page of the scan.
    1434             :      *
    1435             :      * If inskey.nextkey = false and inskey.backward = false, offnum is
    1436             :      * positioned at the first non-pivot tuple >= inskey.scankeys.
    1437             :      *
    1438             :      * If inskey.nextkey = false and inskey.backward = true, offnum is
    1439             :      * positioned at the last non-pivot tuple < inskey.scankeys.
    1440             :      *
    1441             :      * If inskey.nextkey = true and inskey.backward = false, offnum is
    1442             :      * positioned at the first non-pivot tuple > inskey.scankeys.
    1443             :      *
    1444             :      * If inskey.nextkey = true and inskey.backward = true, offnum is
    1445             :      * positioned at the last non-pivot tuple <= inskey.scankeys.
    1446             :      *
    1447             :      * It's possible that _bt_binsrch returned an offnum that is out of bounds
    1448             :      * for the page.  For example, when inskey is both < the leaf page's high
    1449             :      * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
    1450             :      */
    1451    10824800 :     if (!_bt_readpage(scan, dir, offnum, true))
    1452             :     {
    1453             :         /*
    1454             :          * There's no actually-matching data on this page.  Try to advance to
    1455             :          * the next page.  Return false if there's no matching data at all.
    1456             :          */
    1457     3187342 :         _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
    1458     3187342 :         if (!_bt_steppage(scan, dir))
    1459     3187290 :             return false;
    1460             :     }
    1461             :     else
    1462             :     {
    1463             :         /* We have at least one item to return as scan's first item */
    1464     7637458 :         _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    1465             :     }
    1466             : 
    1467     7637510 : readcomplete:
    1468             :     /* OK, itemIndex says what to return */
    1469     7637510 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    1470     7637510 :     scan->xs_heaptid = currItem->heapTid;
    1471     7637510 :     if (scan->xs_want_itup)
    1472      142302 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1473             : 
    1474     7637510 :     return true;
    1475             : }
    1476             : 
    1477             : /*
    1478             :  *  _bt_next() -- Get the next item in a scan.
    1479             :  *
    1480             :  *      On entry, so->currPos describes the current page, which may be pinned
    1481             :  *      but is not locked, and so->currPos.itemIndex identifies which item was
    1482             :  *      previously returned.
    1483             :  *
    1484             :  *      On successful exit, scan->xs_heaptid is set to the TID of the next
    1485             :  *      heap tuple, and if requested, scan->xs_itup points to a copy of the
    1486             :  *      index tuple.  so->currPos is updated as needed.
    1487             :  *
    1488             :  *      On failure exit (no more tuples), we release pin and set
    1489             :  *      so->currPos.buf to InvalidBuffer.
    1490             :  */
    1491             : bool
    1492    15261244 : _bt_next(IndexScanDesc scan, ScanDirection dir)
    1493             : {
    1494    15261244 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1495             :     BTScanPosItem *currItem;
    1496             : 
    1497             :     /*
    1498             :      * Advance to next tuple on current page; or if there's no more, try to
    1499             :      * step to the next page with data.
    1500             :      */
    1501    15261244 :     if (ScanDirectionIsForward(dir))
    1502             :     {
    1503    15242838 :         if (++so->currPos.itemIndex > so->currPos.lastItem)
    1504             :         {
    1505     1903918 :             if (!_bt_steppage(scan, dir))
    1506     1875264 :                 return false;
    1507             :         }
    1508             :     }
    1509             :     else
    1510             :     {
    1511       18406 :         if (--so->currPos.itemIndex < so->currPos.firstItem)
    1512             :         {
    1513         108 :             if (!_bt_steppage(scan, dir))
    1514          80 :                 return false;
    1515             :         }
    1516             :     }
    1517             : 
    1518             :     /* OK, itemIndex says what to return */
    1519    13385900 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    1520    13385900 :     scan->xs_heaptid = currItem->heapTid;
    1521    13385900 :     if (scan->xs_want_itup)
    1522     3647610 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1523             : 
    1524    13385900 :     return true;
    1525             : }
    1526             : 
    1527             : /*
    1528             :  *  _bt_readpage() -- Load data from current index page into so->currPos
    1529             :  *
    1530             :  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
    1531             :  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
    1532             :  * they are updated as appropriate.  All other fields of so->currPos are
    1533             :  * initialized from scratch here.
    1534             :  *
    1535             :  * We scan the current page starting at offnum and moving in the indicated
    1536             :  * direction.  All items matching the scan keys are loaded into currPos.items.
    1537             :  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
    1538             :  * that there can be no more matching tuples in the current scan direction
    1539             :  * (could just be for the current primitive index scan when scan has arrays).
    1540             :  *
    1541             :  * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
    1542             :  * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
    1543             :  * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
    1544             :  * its scan key was marked required), so _bt_first _must_ pass us an offnum
    1545             :  * exactly at the beginning of where equal tuples are to be found.  When we're
    1546             :  * passed an offnum past the end of the page, we might still manage to stop
    1547             :  * the scan on this page by calling _bt_checkkeys against the high key.
    1548             :  *
    1549             :  * In the case of a parallel scan, caller must have called _bt_parallel_seize
    1550             :  * prior to calling this function; this function will invoke
    1551             :  * _bt_parallel_release before returning.
    1552             :  *
    1553             :  * Returns true if any matching items found on the page, false if none.
    1554             :  */
    1555             : static bool
    1556    10924590 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
    1557             :              bool firstPage)
    1558             : {
    1559    10924590 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1560             :     Page        page;
    1561             :     BTPageOpaque opaque;
    1562             :     OffsetNumber minoff;
    1563             :     OffsetNumber maxoff;
    1564             :     BTReadPageState pstate;
    1565             :     bool        arrayKeys;
    1566             :     int         itemIndex,
    1567             :                 indnatts;
    1568             : 
    1569             :     /*
    1570             :      * We must have the buffer pinned and locked, but the usual macro can't be
    1571             :      * used here; this function is what makes it good for currPos.
    1572             :      */
    1573             :     Assert(BufferIsValid(so->currPos.buf));
    1574             : 
    1575    10924590 :     page = BufferGetPage(so->currPos.buf);
    1576    10924590 :     opaque = BTPageGetOpaque(page);
    1577             : 
    1578             :     /* allow next page be processed by parallel worker */
    1579    10924590 :     if (scan->parallel_scan)
    1580             :     {
    1581        1330 :         if (ScanDirectionIsForward(dir))
    1582        1330 :             pstate.prev_scan_page = opaque->btpo_next;
    1583             :         else
    1584           0 :             pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
    1585             : 
    1586        1330 :         _bt_parallel_release(scan, pstate.prev_scan_page);
    1587             :     }
    1588             : 
    1589    10924590 :     indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
    1590    10924590 :     arrayKeys = so->numArrayKeys != 0;
    1591    10924590 :     minoff = P_FIRSTDATAKEY(opaque);
    1592    10924590 :     maxoff = PageGetMaxOffsetNumber(page);
    1593             : 
    1594             :     /* initialize page-level state that we'll pass to _bt_checkkeys */
    1595    10924590 :     pstate.dir = dir;
    1596    10924590 :     pstate.minoff = minoff;
    1597    10924590 :     pstate.maxoff = maxoff;
    1598    10924590 :     pstate.finaltup = NULL;
    1599    10924590 :     pstate.page = page;
    1600    10924590 :     pstate.offnum = InvalidOffsetNumber;
    1601    10924590 :     pstate.skip = InvalidOffsetNumber;
    1602    10924590 :     pstate.continuescan = true; /* default assumption */
    1603    10924590 :     pstate.prechecked = false;
    1604    10924590 :     pstate.firstmatch = false;
    1605    10924590 :     pstate.rechecks = 0;
    1606    10924590 :     pstate.targetdistance = 0;
    1607             : 
    1608             :     /*
    1609             :      * We note the buffer's block number so that we can release the pin later.
    1610             :      * This allows us to re-read the buffer if it is needed again for hinting.
    1611             :      */
    1612    10924590 :     so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
    1613             : 
    1614             :     /*
    1615             :      * We save the LSN of the page as we read it, so that we know whether it
    1616             :      * safe to apply LP_DEAD hints to the page later.  This allows us to drop
    1617             :      * the pin for MVCC scans, which allows vacuum to avoid blocking.
    1618             :      */
    1619    10924590 :     so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
    1620             : 
    1621             :     /*
    1622             :      * we must save the page's right-link while scanning it; this tells us
    1623             :      * where to step right to after we're done with these items.  There is no
    1624             :      * corresponding need for the left-link, since splits always go right.
    1625             :      */
    1626    10924590 :     so->currPos.nextPage = opaque->btpo_next;
    1627             : 
    1628             :     /* initialize tuple workspace to empty */
    1629    10924590 :     so->currPos.nextTupleOffset = 0;
    1630             : 
    1631             :     /*
    1632             :      * Now that the current page has been made consistent, the macro should be
    1633             :      * good.
    1634             :      */
    1635             :     Assert(BTScanPosIsPinned(so->currPos));
    1636             : 
    1637             :     /*
    1638             :      * Prechecking the value of the continuescan flag for the last item on the
    1639             :      * page (for backwards scan it will be the first item on a page).  If we
    1640             :      * observe it to be true, then it should be true for all other items. This
    1641             :      * allows us to do significant optimizations in the _bt_checkkeys()
    1642             :      * function for all the items on the page.
    1643             :      *
    1644             :      * With the forward scan, we do this check for the last item on the page
    1645             :      * instead of the high key.  It's relatively likely that the most
    1646             :      * significant column in the high key will be different from the
    1647             :      * corresponding value from the last item on the page.  So checking with
    1648             :      * the last item on the page would give a more precise answer.
    1649             :      *
    1650             :      * We skip this for the first page read by each (primitive) scan, to avoid
    1651             :      * slowing down point queries.  They typically don't stand to gain much
    1652             :      * when the optimization can be applied, and are more likely to notice the
    1653             :      * overhead of the precheck.
    1654             :      *
    1655             :      * The optimization is unsafe and must be avoided whenever _bt_checkkeys
    1656             :      * just set a low-order required array's key to the best available match
    1657             :      * for a truncated -inf attribute value from the prior page's high key
    1658             :      * (array element 0 is always the best available match in this scenario).
    1659             :      * It's quite likely that matches for array element 0 begin on this page,
    1660             :      * but the start of matches won't necessarily align with page boundaries.
    1661             :      * When the start of matches is somewhere in the middle of this page, it
    1662             :      * would be wrong to treat page's final non-pivot tuple as representative.
    1663             :      * Doing so might lead us to treat some of the page's earlier tuples as
    1664             :      * being part of a group of tuples thought to satisfy the required keys.
    1665             :      *
    1666             :      * Note: Conversely, in the case where the scan's arrays just advanced
    1667             :      * using the prior page's HIKEY _without_ advancement setting scanBehind,
    1668             :      * the start of matches must be aligned with page boundaries, which makes
    1669             :      * it safe to attempt the optimization here now.  It's also safe when the
    1670             :      * prior page's HIKEY simply didn't need to advance any required array. In
    1671             :      * both cases we can safely assume that the _first_ tuple from this page
    1672             :      * must be >= the current set of array keys/equality constraints. And so
    1673             :      * if the final tuple is == those same keys (and also satisfies any
    1674             :      * required < or <= strategy scan keys) during the precheck, we can safely
    1675             :      * assume that this must also be true of all earlier tuples from the page.
    1676             :      */
    1677    10924590 :     if (!firstPage && !so->scanBehind && minoff < maxoff)
    1678             :     {
    1679             :         ItemId      iid;
    1680             :         IndexTuple  itup;
    1681             : 
    1682       30454 :         iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
    1683       30454 :         itup = (IndexTuple) PageGetItem(page, iid);
    1684             : 
    1685             :         /* Call with arrayKeys=false to avoid undesirable side-effects */
    1686       30454 :         _bt_checkkeys(scan, &pstate, false, itup, indnatts);
    1687       30454 :         pstate.prechecked = pstate.continuescan;
    1688       30454 :         pstate.continuescan = true; /* reset */
    1689             :     }
    1690             : 
    1691    10924590 :     if (ScanDirectionIsForward(dir))
    1692             :     {
    1693             :         /* SK_SEARCHARRAY forward scans must provide high key up front */
    1694    10874952 :         if (arrayKeys && !P_RIGHTMOST(opaque))
    1695             :         {
    1696        1080 :             ItemId      iid = PageGetItemId(page, P_HIKEY);
    1697             : 
    1698        1080 :             pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
    1699             :         }
    1700             : 
    1701             :         /* load items[] in ascending order */
    1702    10874952 :         itemIndex = 0;
    1703             : 
    1704    10874952 :         offnum = Max(offnum, minoff);
    1705             : 
    1706    45424904 :         while (offnum <= maxoff)
    1707             :         {
    1708    42748452 :             ItemId      iid = PageGetItemId(page, offnum);
    1709             :             IndexTuple  itup;
    1710             :             bool        passes_quals;
    1711             : 
    1712             :             /*
    1713             :              * If the scan specifies not to return killed tuples, then we
    1714             :              * treat a killed tuple as not passing the qual
    1715             :              */
    1716    42748452 :             if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    1717             :             {
    1718     2298884 :                 offnum = OffsetNumberNext(offnum);
    1719     2298884 :                 continue;
    1720             :             }
    1721             : 
    1722    40449568 :             itup = (IndexTuple) PageGetItem(page, iid);
    1723             :             Assert(!BTreeTupleIsPivot(itup));
    1724             : 
    1725    40449568 :             pstate.offnum = offnum;
    1726    40449568 :             passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
    1727             :                                          itup, indnatts);
    1728             : 
    1729             :             /*
    1730             :              * Check if we need to skip ahead to a later tuple (only possible
    1731             :              * when the scan uses array keys)
    1732             :              */
    1733    40449568 :             if (arrayKeys && OffsetNumberIsValid(pstate.skip))
    1734             :             {
    1735             :                 Assert(!passes_quals && pstate.continuescan);
    1736             :                 Assert(offnum < pstate.skip);
    1737             : 
    1738         458 :                 offnum = pstate.skip;
    1739         458 :                 pstate.skip = InvalidOffsetNumber;
    1740         458 :                 continue;
    1741             :             }
    1742             : 
    1743    40449110 :             if (passes_quals)
    1744             :             {
    1745             :                 /* tuple passes all scan key conditions */
    1746    31847684 :                 pstate.firstmatch = true;
    1747    31847684 :                 if (!BTreeTupleIsPosting(itup))
    1748             :                 {
    1749             :                     /* Remember it */
    1750    31382288 :                     _bt_saveitem(so, itemIndex, offnum, itup);
    1751    31382288 :                     itemIndex++;
    1752             :                 }
    1753             :                 else
    1754             :                 {
    1755             :                     int         tupleOffset;
    1756             : 
    1757             :                     /*
    1758             :                      * Set up state to return posting list, and remember first
    1759             :                      * TID
    1760             :                      */
    1761             :                     tupleOffset =
    1762      465396 :                         _bt_setuppostingitems(so, itemIndex, offnum,
    1763             :                                               BTreeTupleGetPostingN(itup, 0),
    1764             :                                               itup);
    1765      465396 :                     itemIndex++;
    1766             :                     /* Remember additional TIDs */
    1767     2719996 :                     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
    1768             :                     {
    1769     2254600 :                         _bt_savepostingitem(so, itemIndex, offnum,
    1770             :                                             BTreeTupleGetPostingN(itup, i),
    1771             :                                             tupleOffset);
    1772     2254600 :                         itemIndex++;
    1773             :                     }
    1774             :                 }
    1775             :             }
    1776             :             /* When !continuescan, there can't be any more matches, so stop */
    1777    40449110 :             if (!pstate.continuescan)
    1778     8198500 :                 break;
    1779             : 
    1780    32250610 :             offnum = OffsetNumberNext(offnum);
    1781             :         }
    1782             : 
    1783             :         /*
    1784             :          * We don't need to visit page to the right when the high key
    1785             :          * indicates that no more matches will be found there.
    1786             :          *
    1787             :          * Checking the high key like this works out more often than you might
    1788             :          * think.  Leaf page splits pick a split point between the two most
    1789             :          * dissimilar tuples (this is weighed against the need to evenly share
    1790             :          * free space).  Leaf pages with high key attribute values that can
    1791             :          * only appear on non-pivot tuples on the right sibling page are
    1792             :          * common.
    1793             :          */
    1794    10874952 :         if (pstate.continuescan && !P_RIGHTMOST(opaque))
    1795             :         {
    1796      117692 :             ItemId      iid = PageGetItemId(page, P_HIKEY);
    1797      117692 :             IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);
    1798             :             int         truncatt;
    1799             : 
    1800      117692 :             truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
    1801      117692 :             pstate.prechecked = false;  /* precheck didn't cover HIKEY */
    1802      117692 :             _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
    1803             :         }
    1804             : 
    1805    10874952 :         if (!pstate.continuescan)
    1806     8258402 :             so->currPos.moreRight = false;
    1807             : 
    1808             :         Assert(itemIndex <= MaxTIDsPerBTreePage);
    1809    10874952 :         so->currPos.firstItem = 0;
    1810    10874952 :         so->currPos.lastItem = itemIndex - 1;
    1811    10874952 :         so->currPos.itemIndex = 0;
    1812             :     }
    1813             :     else
    1814             :     {
    1815             :         /* SK_SEARCHARRAY backward scans must provide final tuple up front */
    1816       49638 :         if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
    1817             :         {
    1818          24 :             ItemId      iid = PageGetItemId(page, minoff);
    1819             : 
    1820          24 :             pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
    1821             :         }
    1822             : 
    1823             :         /* load items[] in descending order */
    1824       49638 :         itemIndex = MaxTIDsPerBTreePage;
    1825             : 
    1826       49638 :         offnum = Min(offnum, maxoff);
    1827             : 
    1828     8706212 :         while (offnum >= minoff)
    1829             :         {
    1830     8656670 :             ItemId      iid = PageGetItemId(page, offnum);
    1831             :             IndexTuple  itup;
    1832             :             bool        tuple_alive;
    1833             :             bool        passes_quals;
    1834             : 
    1835             :             /*
    1836             :              * If the scan specifies not to return killed tuples, then we
    1837             :              * treat a killed tuple as not passing the qual.  Most of the
    1838             :              * time, it's a win to not bother examining the tuple's index
    1839             :              * keys, but just skip to the next tuple (previous, actually,
    1840             :              * since we're scanning backwards).  However, if this is the first
    1841             :              * tuple on the page, we do check the index keys, to prevent
    1842             :              * uselessly advancing to the page to the left.  This is similar
    1843             :              * to the high key optimization used by forward scans.
    1844             :              */
    1845     8656670 :             if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    1846             :             {
    1847             :                 Assert(offnum >= P_FIRSTDATAKEY(opaque));
    1848      905418 :                 if (offnum > P_FIRSTDATAKEY(opaque))
    1849             :                 {
    1850      904784 :                     offnum = OffsetNumberPrev(offnum);
    1851      904784 :                     continue;
    1852             :                 }
    1853             : 
    1854         634 :                 tuple_alive = false;
    1855             :             }
    1856             :             else
    1857     7751252 :                 tuple_alive = true;
    1858             : 
    1859     7751886 :             itup = (IndexTuple) PageGetItem(page, iid);
    1860             :             Assert(!BTreeTupleIsPivot(itup));
    1861             : 
    1862     7751886 :             pstate.offnum = offnum;
    1863     7751886 :             passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
    1864             :                                          itup, indnatts);
    1865             : 
    1866             :             /*
    1867             :              * Check if we need to skip ahead to a later tuple (only possible
    1868             :              * when the scan uses array keys)
    1869             :              */
    1870     7751886 :             if (arrayKeys && OffsetNumberIsValid(pstate.skip))
    1871             :             {
    1872             :                 Assert(!passes_quals && pstate.continuescan);
    1873             :                 Assert(offnum > pstate.skip);
    1874             : 
    1875           6 :                 offnum = pstate.skip;
    1876           6 :                 pstate.skip = InvalidOffsetNumber;
    1877           6 :                 continue;
    1878             :             }
    1879             : 
    1880     7751880 :             if (passes_quals && tuple_alive)
    1881             :             {
    1882             :                 /* tuple passes all scan key conditions */
    1883     7751024 :                 pstate.firstmatch = true;
    1884     7751024 :                 if (!BTreeTupleIsPosting(itup))
    1885             :                 {
    1886             :                     /* Remember it */
    1887     7705816 :                     itemIndex--;
    1888     7705816 :                     _bt_saveitem(so, itemIndex, offnum, itup);
    1889             :                 }
    1890             :                 else
    1891             :                 {
    1892             :                     int         tupleOffset;
    1893             : 
    1894             :                     /*
    1895             :                      * Set up state to return posting list, and remember first
    1896             :                      * TID.
    1897             :                      *
    1898             :                      * Note that we deliberately save/return items from
    1899             :                      * posting lists in ascending heap TID order for backwards
    1900             :                      * scans.  This allows _bt_killitems() to make a
    1901             :                      * consistent assumption about the order of items
    1902             :                      * associated with the same posting list tuple.
    1903             :                      */
    1904       45208 :                     itemIndex--;
    1905             :                     tupleOffset =
    1906       45208 :                         _bt_setuppostingitems(so, itemIndex, offnum,
    1907             :                                               BTreeTupleGetPostingN(itup, 0),
    1908             :                                               itup);
    1909             :                     /* Remember additional TIDs */
    1910      178206 :                     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
    1911             :                     {
    1912      132998 :                         itemIndex--;
    1913      132998 :                         _bt_savepostingitem(so, itemIndex, offnum,
    1914             :                                             BTreeTupleGetPostingN(itup, i),
    1915             :                                             tupleOffset);
    1916             :                     }
    1917             :                 }
    1918             :             }
    1919     7751880 :             if (!pstate.continuescan)
    1920             :             {
    1921             :                 /* there can't be any more matches, so stop */
    1922          96 :                 so->currPos.moreLeft = false;
    1923          96 :                 break;
    1924             :             }
    1925             : 
    1926     7751784 :             offnum = OffsetNumberPrev(offnum);
    1927             :         }
    1928             : 
    1929             :         Assert(itemIndex >= 0);
    1930       49638 :         so->currPos.firstItem = itemIndex;
    1931       49638 :         so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
    1932       49638 :         so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
    1933             :     }
    1934             : 
    1935    10924590 :     return (so->currPos.firstItem <= so->currPos.lastItem);
    1936             : }
    1937             : 
    1938             : /* Save an index item into so->currPos.items[itemIndex] */
    1939             : static void
    1940    39088104 : _bt_saveitem(BTScanOpaque so, int itemIndex,
    1941             :              OffsetNumber offnum, IndexTuple itup)
    1942             : {
    1943    39088104 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1944             : 
    1945             :     Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
    1946             : 
    1947    39088104 :     currItem->heapTid = itup->t_tid;
    1948    39088104 :     currItem->indexOffset = offnum;
    1949    39088104 :     if (so->currTuples)
    1950             :     {
    1951    21185814 :         Size        itupsz = IndexTupleSize(itup);
    1952             : 
    1953    21185814 :         currItem->tupleOffset = so->currPos.nextTupleOffset;
    1954    21185814 :         memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
    1955    21185814 :         so->currPos.nextTupleOffset += MAXALIGN(itupsz);
    1956             :     }
    1957    39088104 : }
    1958             : 
    1959             : /*
    1960             :  * Setup state to save TIDs/items from a single posting list tuple.
    1961             :  *
    1962             :  * Saves an index item into so->currPos.items[itemIndex] for TID that is
    1963             :  * returned to scan first.  Second or subsequent TIDs for posting list should
    1964             :  * be saved by calling _bt_savepostingitem().
    1965             :  *
    1966             :  * Returns an offset into tuple storage space that main tuple is stored at if
    1967             :  * needed.
    1968             :  */
    1969             : static int
    1970      510604 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
    1971             :                       ItemPointer heapTid, IndexTuple itup)
    1972             : {
    1973      510604 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1974             : 
    1975             :     Assert(BTreeTupleIsPosting(itup));
    1976             : 
    1977      510604 :     currItem->heapTid = *heapTid;
    1978      510604 :     currItem->indexOffset = offnum;
    1979      510604 :     if (so->currTuples)
    1980             :     {
    1981             :         /* Save base IndexTuple (truncate posting list) */
    1982             :         IndexTuple  base;
    1983      265874 :         Size        itupsz = BTreeTupleGetPostingOffset(itup);
    1984             : 
    1985      265874 :         itupsz = MAXALIGN(itupsz);
    1986      265874 :         currItem->tupleOffset = so->currPos.nextTupleOffset;
    1987      265874 :         base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
    1988      265874 :         memcpy(base, itup, itupsz);
    1989             :         /* Defensively reduce work area index tuple header size */
    1990      265874 :         base->t_info &= ~INDEX_SIZE_MASK;
    1991      265874 :         base->t_info |= itupsz;
    1992      265874 :         so->currPos.nextTupleOffset += itupsz;
    1993             : 
    1994      265874 :         return currItem->tupleOffset;
    1995             :     }
    1996             : 
    1997      244730 :     return 0;
    1998             : }
    1999             : 
    2000             : /*
    2001             :  * Save an index item into so->currPos.items[itemIndex] for current posting
    2002             :  * tuple.
    2003             :  *
    2004             :  * Assumes that _bt_setuppostingitems() has already been called for current
    2005             :  * posting list tuple.  Caller passes its return value as tupleOffset.
    2006             :  */
    2007             : static inline void
    2008     2387598 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
    2009             :                     ItemPointer heapTid, int tupleOffset)
    2010             : {
    2011     2387598 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    2012             : 
    2013     2387598 :     currItem->heapTid = *heapTid;
    2014     2387598 :     currItem->indexOffset = offnum;
    2015             : 
    2016             :     /*
    2017             :      * Have index-only scans return the same base IndexTuple for every TID
    2018             :      * that originates from the same posting list
    2019             :      */
    2020     2387598 :     if (so->currTuples)
    2021     1019650 :         currItem->tupleOffset = tupleOffset;
    2022     2387598 : }
    2023             : 
    2024             : /*
    2025             :  *  _bt_steppage() -- Step to next page containing valid data for scan
    2026             :  *
    2027             :  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
    2028             :  * if pinned, we'll drop the pin before moving to next page.  The buffer is
    2029             :  * not locked on entry.
    2030             :  *
    2031             :  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
    2032             :  * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
    2033             :  * to InvalidBuffer.  We return true to indicate success.
    2034             :  */
    2035             : static bool
    2036     5093034 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
    2037             : {
    2038     5093034 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2039     5093034 :     BlockNumber blkno = InvalidBlockNumber;
    2040             :     bool        status;
    2041             : 
    2042             :     Assert(BTScanPosIsValid(so->currPos));
    2043             : 
    2044             :     /* Before leaving current page, deal with any killed items */
    2045     5093034 :     if (so->numKilled > 0)
    2046       70338 :         _bt_killitems(scan);
    2047             : 
    2048             :     /*
    2049             :      * Before we modify currPos, make a copy of the page data if there was a
    2050             :      * mark position that needs it.
    2051             :      */
    2052     5093034 :     if (so->markItemIndex >= 0)
    2053             :     {
    2054             :         /* bump pin on current buffer for assignment to mark buffer */
    2055         362 :         if (BTScanPosIsPinned(so->currPos))
    2056         348 :             IncrBufferRefCount(so->currPos.buf);
    2057         362 :         memcpy(&so->markPos, &so->currPos,
    2058             :                offsetof(BTScanPosData, items[1]) +
    2059         362 :                so->currPos.lastItem * sizeof(BTScanPosItem));
    2060         362 :         if (so->markTuples)
    2061         348 :             memcpy(so->markTuples, so->currTuples,
    2062         348 :                    so->currPos.nextTupleOffset);
    2063         362 :         so->markPos.itemIndex = so->markItemIndex;
    2064         362 :         so->markItemIndex = -1;
    2065             : 
    2066             :         /*
    2067             :          * If we're just about to start the next primitive index scan
    2068             :          * (possible with a scan that has arrays keys, and needs to skip to
    2069             :          * continue in the current scan direction), moreLeft/moreRight only
    2070             :          * indicate the end of the current primitive index scan.  They must
    2071             :          * never be taken to indicate that the top-level index scan has ended
    2072             :          * (that would be wrong).
    2073             :          *
    2074             :          * We could handle this case by treating the current array keys as
    2075             :          * markPos state.  But depending on the current array state like this
    2076             :          * would add complexity.  Instead, we just unset markPos's copy of
    2077             :          * moreRight or moreLeft (whichever might be affected), while making
    2078             :          * btrestpos reset the scan's arrays to their initial scan positions.
    2079             :          * In effect, btrestpos leaves advancing the arrays up to the first
    2080             :          * _bt_readpage call (that takes place after it has restored markPos).
    2081             :          */
    2082             :         Assert(so->markPos.dir == dir);
    2083         362 :         if (so->needPrimScan)
    2084             :         {
    2085           0 :             if (ScanDirectionIsForward(dir))
    2086           0 :                 so->markPos.moreRight = true;
    2087             :             else
    2088           0 :                 so->markPos.moreLeft = true;
    2089             :         }
    2090             :     }
    2091             : 
    2092     5093034 :     if (ScanDirectionIsForward(dir))
    2093             :     {
    2094             :         /* Walk right to the next page with data */
    2095     5092884 :         if (scan->parallel_scan != NULL)
    2096             :         {
    2097             :             /*
    2098             :              * Seize the scan to get the next block number; if the scan has
    2099             :              * ended already, bail out.
    2100             :              */
    2101        1330 :             status = _bt_parallel_seize(scan, &blkno, false);
    2102        1330 :             if (!status)
    2103             :             {
    2104             :                 /* release the previous buffer, if pinned */
    2105          36 :                 BTScanPosUnpinIfPinned(so->currPos);
    2106          36 :                 BTScanPosInvalidate(so->currPos);
    2107          36 :                 return false;
    2108             :             }
    2109             :         }
    2110             :         else
    2111             :         {
    2112             :             /* Not parallel, so use the previously-saved nextPage link. */
    2113     5091554 :             blkno = so->currPos.nextPage;
    2114             :         }
    2115             : 
    2116             :         /* Remember we left a page with data */
    2117     5092848 :         so->currPos.moreLeft = true;
    2118             : 
    2119             :         /* release the previous buffer, if pinned */
    2120     5092848 :         BTScanPosUnpinIfPinned(so->currPos);
    2121             :     }
    2122             :     else
    2123             :     {
    2124             :         /* Remember we left a page with data */
    2125         150 :         so->currPos.moreRight = true;
    2126             : 
    2127         150 :         if (scan->parallel_scan != NULL)
    2128             :         {
    2129             :             /*
    2130             :              * Seize the scan to get the current block number; if the scan has
    2131             :              * ended already, bail out.
    2132             :              */
    2133           0 :             status = _bt_parallel_seize(scan, &blkno, false);
    2134           0 :             BTScanPosUnpinIfPinned(so->currPos);
    2135           0 :             if (!status)
    2136             :             {
    2137           0 :                 BTScanPosInvalidate(so->currPos);
    2138           0 :                 return false;
    2139             :             }
    2140             :         }
    2141             :         else
    2142             :         {
    2143             :             /* Not parallel, so just use our own notion of the current page */
    2144         150 :             blkno = so->currPos.currPage;
    2145             :         }
    2146             :     }
    2147             : 
    2148     5092998 :     if (!_bt_readnextpage(scan, blkno, dir))
    2149     5064178 :         return false;
    2150             : 
    2151             :     /* We have at least one item to return as scan's next item */
    2152       28820 :     _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    2153             : 
    2154       28820 :     return true;
    2155             : }
    2156             : 
    2157             : /*
    2158             :  *  _bt_readnextpage() -- Read next page containing valid data for scan
    2159             :  *
    2160             :  * On success exit, so->currPos is updated to contain data from the next
    2161             :  * interesting page, and we return true.  Caller must release the lock (and
    2162             :  * maybe the pin) on the buffer on success exit.
    2163             :  *
    2164             :  * If there are no more matching records in the given direction, we drop all
    2165             :  * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
    2166             :  */
    2167             : static bool
    2168     5092998 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
    2169             : {
    2170     5092998 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2171             :     Relation    rel;
    2172             :     Page        page;
    2173             :     BTPageOpaque opaque;
    2174             :     bool        status;
    2175             : 
    2176     5092998 :     rel = scan->indexRelation;
    2177             : 
    2178     5092998 :     if (ScanDirectionIsForward(dir))
    2179             :     {
    2180             :         for (;;)
    2181             :         {
    2182             :             /*
    2183             :              * if we're at end of scan, give up and mark parallel scan as
    2184             :              * done, so that all the workers can finish their scan
    2185             :              */
    2186     5094482 :             if (blkno == P_NONE || !so->currPos.moreRight)
    2187             :             {
    2188     5064092 :                 _bt_parallel_done(scan);
    2189     5064092 :                 BTScanPosInvalidate(so->currPos);
    2190     5064092 :                 return false;
    2191             :             }
    2192             :             /* check for interrupts while we're not holding any buffer lock */
    2193       30390 :             CHECK_FOR_INTERRUPTS();
    2194             :             /* step right one page */
    2195       30390 :             so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    2196       30390 :             page = BufferGetPage(so->currPos.buf);
    2197       30390 :             opaque = BTPageGetOpaque(page);
    2198             :             /* check for deleted page */
    2199       30390 :             if (!P_IGNORE(opaque))
    2200             :             {
    2201       30390 :                 PredicateLockPage(rel, blkno, scan->xs_snapshot);
    2202             :                 /* see if there are any matches on this page */
    2203             :                 /* note that this will clear moreRight if we can stop */
    2204       30390 :                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
    2205       28756 :                     break;
    2206             :             }
    2207           0 :             else if (scan->parallel_scan != NULL)
    2208             :             {
    2209             :                 /* allow next page be processed by parallel worker */
    2210           0 :                 _bt_parallel_release(scan, opaque->btpo_next);
    2211             :             }
    2212             : 
    2213             :             /* nope, keep going */
    2214        1634 :             if (scan->parallel_scan != NULL)
    2215             :             {
    2216           0 :                 _bt_relbuf(rel, so->currPos.buf);
    2217           0 :                 status = _bt_parallel_seize(scan, &blkno, false);
    2218           0 :                 if (!status)
    2219             :                 {
    2220           0 :                     BTScanPosInvalidate(so->currPos);
    2221           0 :                     return false;
    2222             :                 }
    2223             :             }
    2224             :             else
    2225             :             {
    2226        1634 :                 blkno = opaque->btpo_next;
    2227        1634 :                 _bt_relbuf(rel, so->currPos.buf);
    2228             :             }
    2229             :         }
    2230             :     }
    2231             :     else
    2232             :     {
    2233             :         /*
    2234             :          * Should only happen in parallel cases, when some other backend
    2235             :          * advanced the scan.
    2236             :          */
    2237         150 :         if (so->currPos.currPage != blkno)
    2238             :         {
    2239           0 :             BTScanPosUnpinIfPinned(so->currPos);
    2240           0 :             so->currPos.currPage = blkno;
    2241             :         }
    2242             : 
    2243             :         /*
    2244             :          * Walk left to the next page with data.  This is much more complex
    2245             :          * than the walk-right case because of the possibility that the page
    2246             :          * to our left splits while we are in flight to it, plus the
    2247             :          * possibility that the page we were on gets deleted after we leave
    2248             :          * it.  See nbtree/README for details.
    2249             :          *
    2250             :          * It might be possible to rearrange this code to have less overhead
    2251             :          * in pinning and locking, but that would require capturing the left
    2252             :          * sibling block number when the page is initially read, and then
    2253             :          * optimistically starting there (rather than pinning the page twice).
    2254             :          * It is not clear that this would be worth the complexity.
    2255             :          */
    2256         150 :         if (BTScanPosIsPinned(so->currPos))
    2257         106 :             _bt_lockbuf(rel, so->currPos.buf, BT_READ);
    2258             :         else
    2259          44 :             so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
    2260             : 
    2261             :         for (;;)
    2262             :         {
    2263             :             /* Done if we know there are no matching keys to the left */
    2264         156 :             if (!so->currPos.moreLeft)
    2265             :             {
    2266          54 :                 _bt_relbuf(rel, so->currPos.buf);
    2267          54 :                 _bt_parallel_done(scan);
    2268          54 :                 BTScanPosInvalidate(so->currPos);
    2269          54 :                 return false;
    2270             :             }
    2271             : 
    2272             :             /* Step to next physical page */
    2273         102 :             so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
    2274             : 
    2275             :             /* if we're physically at end of index, return failure */
    2276         102 :             if (so->currPos.buf == InvalidBuffer)
    2277             :             {
    2278          32 :                 _bt_parallel_done(scan);
    2279          32 :                 BTScanPosInvalidate(so->currPos);
    2280          32 :                 return false;
    2281             :             }
    2282             : 
    2283             :             /*
    2284             :              * Okay, we managed to move left to a non-deleted page. Done if
    2285             :              * it's not half-dead and contains matching tuples. Else loop back
    2286             :              * and do it all again.
    2287             :              */
    2288          70 :             page = BufferGetPage(so->currPos.buf);
    2289          70 :             opaque = BTPageGetOpaque(page);
    2290          70 :             if (!P_IGNORE(opaque))
    2291             :             {
    2292          70 :                 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
    2293             :                 /* see if there are any matches on this page */
    2294             :                 /* note that this will clear moreLeft if we can stop */
    2295          70 :                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
    2296          64 :                     break;
    2297             :             }
    2298           0 :             else if (scan->parallel_scan != NULL)
    2299             :             {
    2300             :                 /* allow next page be processed by parallel worker */
    2301           0 :                 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
    2302             :             }
    2303             : 
    2304             :             /*
    2305             :              * For parallel scans, get the last page scanned as it is quite
    2306             :              * possible that by the time we try to seize the scan, some other
    2307             :              * worker has already advanced the scan to a different page.  We
    2308             :              * must continue based on the latest page scanned by any worker.
    2309             :              */
    2310           6 :             if (scan->parallel_scan != NULL)
    2311             :             {
    2312           0 :                 _bt_relbuf(rel, so->currPos.buf);
    2313           0 :                 status = _bt_parallel_seize(scan, &blkno, false);
    2314           0 :                 if (!status)
    2315             :                 {
    2316           0 :                     BTScanPosInvalidate(so->currPos);
    2317           0 :                     return false;
    2318             :                 }
    2319           0 :                 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    2320             :             }
    2321             :         }
    2322             :     }
    2323             : 
    2324       28820 :     return true;
    2325             : }
    2326             : 
    2327             : /*
    2328             :  *  _bt_parallel_readpage() -- Read current page containing valid data for scan
    2329             :  *
    2330             :  * On success, release lock and maybe pin on buffer.  We return true to
    2331             :  * indicate success.
    2332             :  */
    2333             : static bool
    2334           0 : _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
    2335             : {
    2336           0 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2337             : 
    2338             :     Assert(!so->needPrimScan);
    2339             : 
    2340           0 :     _bt_initialize_more_data(so, dir);
    2341             : 
    2342           0 :     if (!_bt_readnextpage(scan, blkno, dir))
    2343           0 :         return false;
    2344             : 
    2345             :     /* We have at least one item to return as scan's next item */
    2346           0 :     _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    2347             : 
    2348           0 :     return true;
    2349             : }
    2350             : 
    2351             : /*
    2352             :  * _bt_walk_left() -- step left one page, if possible
    2353             :  *
    2354             :  * The given buffer must be pinned and read-locked.  This will be dropped
    2355             :  * before stepping left.  On return, we have pin and read lock on the
    2356             :  * returned page, instead.
    2357             :  *
    2358             :  * Returns InvalidBuffer if there is no page to the left (no lock is held
    2359             :  * in that case).
    2360             :  *
    2361             :  * It is possible for the returned leaf page to be half-dead; caller must
    2362             :  * check that condition and step left again when required.
    2363             :  */
    2364             : static Buffer
    2365         102 : _bt_walk_left(Relation rel, Buffer buf)
    2366             : {
    2367             :     Page        page;
    2368             :     BTPageOpaque opaque;
    2369             : 
    2370         102 :     page = BufferGetPage(buf);
    2371         102 :     opaque = BTPageGetOpaque(page);
    2372             : 
    2373             :     for (;;)
    2374           0 :     {
    2375             :         BlockNumber obknum;
    2376             :         BlockNumber lblkno;
    2377             :         BlockNumber blkno;
    2378             :         int         tries;
    2379             : 
    2380             :         /* if we're at end of tree, release buf and return failure */
    2381         102 :         if (P_LEFTMOST(opaque))
    2382             :         {
    2383          32 :             _bt_relbuf(rel, buf);
    2384          32 :             break;
    2385             :         }
    2386             :         /* remember original page we are stepping left from */
    2387          70 :         obknum = BufferGetBlockNumber(buf);
    2388             :         /* step left */
    2389          70 :         blkno = lblkno = opaque->btpo_prev;
    2390          70 :         _bt_relbuf(rel, buf);
    2391             :         /* check for interrupts while we're not holding any buffer lock */
    2392          70 :         CHECK_FOR_INTERRUPTS();
    2393          70 :         buf = _bt_getbuf(rel, blkno, BT_READ);
    2394          70 :         page = BufferGetPage(buf);
    2395          70 :         opaque = BTPageGetOpaque(page);
    2396             : 
    2397             :         /*
    2398             :          * If this isn't the page we want, walk right till we find what we
    2399             :          * want --- but go no more than four hops (an arbitrary limit). If we
    2400             :          * don't find the correct page by then, the most likely bet is that
    2401             :          * the original page got deleted and isn't in the sibling chain at all
    2402             :          * anymore, not that its left sibling got split more than four times.
    2403             :          *
    2404             :          * Note that it is correct to test P_ISDELETED not P_IGNORE here,
    2405             :          * because half-dead pages are still in the sibling chain.
    2406             :          */
    2407          70 :         tries = 0;
    2408             :         for (;;)
    2409             :         {
    2410          70 :             if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
    2411             :             {
    2412             :                 /* Found desired page, return it */
    2413          70 :                 return buf;
    2414             :             }
    2415           0 :             if (P_RIGHTMOST(opaque) || ++tries > 4)
    2416             :                 break;
    2417           0 :             blkno = opaque->btpo_next;
    2418           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2419           0 :             page = BufferGetPage(buf);
    2420           0 :             opaque = BTPageGetOpaque(page);
    2421             :         }
    2422             : 
    2423             :         /* Return to the original page to see what's up */
    2424           0 :         buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
    2425           0 :         page = BufferGetPage(buf);
    2426           0 :         opaque = BTPageGetOpaque(page);
    2427           0 :         if (P_ISDELETED(opaque))
    2428             :         {
    2429             :             /*
    2430             :              * It was deleted.  Move right to first nondeleted page (there
    2431             :              * must be one); that is the page that has acquired the deleted
    2432             :              * one's keyspace, so stepping left from it will take us where we
    2433             :              * want to be.
    2434             :              */
    2435             :             for (;;)
    2436             :             {
    2437           0 :                 if (P_RIGHTMOST(opaque))
    2438           0 :                     elog(ERROR, "fell off the end of index \"%s\"",
    2439             :                          RelationGetRelationName(rel));
    2440           0 :                 blkno = opaque->btpo_next;
    2441           0 :                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2442           0 :                 page = BufferGetPage(buf);
    2443           0 :                 opaque = BTPageGetOpaque(page);
    2444           0 :                 if (!P_ISDELETED(opaque))
    2445           0 :                     break;
    2446             :             }
    2447             : 
    2448             :             /*
    2449             :              * Now return to top of loop, resetting obknum to point to this
    2450             :              * nondeleted page, and try again.
    2451             :              */
    2452             :         }
    2453             :         else
    2454             :         {
    2455             :             /*
    2456             :              * It wasn't deleted; the explanation had better be that the page
    2457             :              * to the left got split or deleted. Without this check, we'd go
    2458             :              * into an infinite loop if there's anything wrong.
    2459             :              */
    2460           0 :             if (opaque->btpo_prev == lblkno)
    2461           0 :                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
    2462             :                      obknum, RelationGetRelationName(rel));
    2463             :             /* Okay to try again with new lblkno value */
    2464             :         }
    2465             :     }
    2466             : 
    2467          32 :     return InvalidBuffer;
    2468             : }
    2469             : 
    2470             : /*
    2471             :  * _bt_get_endpoint() -- Find the first or last page on a given tree level
    2472             :  *
    2473             :  * If the index is empty, we will return InvalidBuffer; any other failure
    2474             :  * condition causes ereport().  We will not return a dead page.
    2475             :  *
    2476             :  * The returned buffer is pinned and read-locked.
    2477             :  */
    2478             : Buffer
    2479       75800 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
    2480             : {
    2481             :     Buffer      buf;
    2482             :     Page        page;
    2483             :     BTPageOpaque opaque;
    2484             :     OffsetNumber offnum;
    2485             :     BlockNumber blkno;
    2486             :     IndexTuple  itup;
    2487             : 
    2488             :     /*
    2489             :      * If we are looking for a leaf page, okay to descend from fast root;
    2490             :      * otherwise better descend from true root.  (There is no point in being
    2491             :      * smarter about intermediate levels.)
    2492             :      */
    2493       75800 :     if (level == 0)
    2494       75776 :         buf = _bt_getroot(rel, NULL, BT_READ);
    2495             :     else
    2496          24 :         buf = _bt_gettrueroot(rel);
    2497             : 
    2498       75800 :     if (!BufferIsValid(buf))
    2499        6446 :         return InvalidBuffer;
    2500             : 
    2501       69354 :     page = BufferGetPage(buf);
    2502       69354 :     opaque = BTPageGetOpaque(page);
    2503             : 
    2504             :     for (;;)
    2505             :     {
    2506             :         /*
    2507             :          * If we landed on a deleted page, step right to find a live page
    2508             :          * (there must be one).  Also, if we want the rightmost page, step
    2509             :          * right if needed to get to it (this could happen if the page split
    2510             :          * since we obtained a pointer to it).
    2511             :          */
    2512       98284 :         while (P_IGNORE(opaque) ||
    2513          66 :                (rightmost && !P_RIGHTMOST(opaque)))
    2514             :         {
    2515           0 :             blkno = opaque->btpo_next;
    2516           0 :             if (blkno == P_NONE)
    2517           0 :                 elog(ERROR, "fell off the end of index \"%s\"",
    2518             :                      RelationGetRelationName(rel));
    2519           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2520           0 :             page = BufferGetPage(buf);
    2521           0 :             opaque = BTPageGetOpaque(page);
    2522             :         }
    2523             : 
    2524             :         /* Done? */
    2525       98284 :         if (opaque->btpo_level == level)
    2526       69354 :             break;
    2527       28930 :         if (opaque->btpo_level < level)
    2528           0 :             ereport(ERROR,
    2529             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2530             :                      errmsg_internal("btree level %u not found in index \"%s\"",
    2531             :                                      level, RelationGetRelationName(rel))));
    2532             : 
    2533             :         /* Descend to leftmost or rightmost child page */
    2534       28930 :         if (rightmost)
    2535           6 :             offnum = PageGetMaxOffsetNumber(page);
    2536             :         else
    2537       28924 :             offnum = P_FIRSTDATAKEY(opaque);
    2538             : 
    2539       28930 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    2540       28930 :         blkno = BTreeTupleGetDownLink(itup);
    2541             : 
    2542       28930 :         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2543       28930 :         page = BufferGetPage(buf);
    2544       28930 :         opaque = BTPageGetOpaque(page);
    2545             :     }
    2546             : 
    2547       69354 :     return buf;
    2548             : }
    2549             : 
    2550             : /*
    2551             :  *  _bt_endpoint() -- Find the first or last page in the index, and scan
    2552             :  * from there to the first key satisfying all the quals.
    2553             :  *
    2554             :  * This is used by _bt_first() to set up a scan when we've determined
    2555             :  * that the scan must start at the beginning or end of the index (for
    2556             :  * a forward or backward scan respectively).  Exit conditions are the
    2557             :  * same as for _bt_first().
    2558             :  */
    2559             : static bool
    2560       75776 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
    2561             : {
    2562       75776 :     Relation    rel = scan->indexRelation;
    2563       75776 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2564             :     Buffer      buf;
    2565             :     Page        page;
    2566             :     BTPageOpaque opaque;
    2567             :     OffsetNumber start;
    2568             :     BTScanPosItem *currItem;
    2569             : 
    2570             :     /*
    2571             :      * Scan down to the leftmost or rightmost leaf page.  This is a simplified
    2572             :      * version of _bt_search().  We don't maintain a stack since we know we
    2573             :      * won't need it.
    2574             :      */
    2575       75776 :     buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
    2576             : 
    2577       75776 :     if (!BufferIsValid(buf))
    2578             :     {
    2579             :         /*
    2580             :          * Empty index. Lock the whole relation, as nothing finer to lock
    2581             :          * exists.
    2582             :          */
    2583        6446 :         PredicateLockRelation(rel, scan->xs_snapshot);
    2584        6446 :         BTScanPosInvalidate(so->currPos);
    2585        6446 :         return false;
    2586             :     }
    2587             : 
    2588       69330 :     PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
    2589       69330 :     page = BufferGetPage(buf);
    2590       69330 :     opaque = BTPageGetOpaque(page);
    2591             :     Assert(P_ISLEAF(opaque));
    2592             : 
    2593       69330 :     if (ScanDirectionIsForward(dir))
    2594             :     {
    2595             :         /* There could be dead pages to the left, so not this: */
    2596             :         /* Assert(P_LEFTMOST(opaque)); */
    2597             : 
    2598       69270 :         start = P_FIRSTDATAKEY(opaque);
    2599             :     }
    2600          60 :     else if (ScanDirectionIsBackward(dir))
    2601             :     {
    2602             :         Assert(P_RIGHTMOST(opaque));
    2603             : 
    2604          60 :         start = PageGetMaxOffsetNumber(page);
    2605             :     }
    2606             :     else
    2607             :     {
    2608           0 :         elog(ERROR, "invalid scan direction: %d", (int) dir);
    2609             :         start = 0;              /* keep compiler quiet */
    2610             :     }
    2611             : 
    2612             :     /* remember which buffer we have pinned */
    2613       69330 :     so->currPos.buf = buf;
    2614             : 
    2615       69330 :     _bt_initialize_more_data(so, dir);
    2616             : 
    2617             :     /*
    2618             :      * Now load data from the first page of the scan.
    2619             :      */
    2620       69330 :     if (!_bt_readpage(scan, dir, start, true))
    2621             :     {
    2622             :         /*
    2623             :          * There's no actually-matching data on this page.  Try to advance to
    2624             :          * the next page.  Return false if there's no matching data at all.
    2625             :          */
    2626        1666 :         _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
    2627        1666 :         if (!_bt_steppage(scan, dir))
    2628        1580 :             return false;
    2629             :     }
    2630             :     else
    2631             :     {
    2632             :         /* We have at least one item to return as scan's first item */
    2633       67664 :         _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    2634             :     }
    2635             : 
    2636             :     /* OK, itemIndex says what to return */
    2637       67750 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    2638       67750 :     scan->xs_heaptid = currItem->heapTid;
    2639       67750 :     if (scan->xs_want_itup)
    2640       62674 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    2641             : 
    2642       67750 :     return true;
    2643             : }
    2644             : 
    2645             : /*
    2646             :  * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir
    2647             :  * from currPos
    2648             :  */
    2649             : static inline void
    2650    10894130 : _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
    2651             : {
    2652    10894130 :     so->currPos.dir = dir;
    2653    10894130 :     if (so->needPrimScan)
    2654             :     {
    2655             :         Assert(so->numArrayKeys);
    2656             : 
    2657         564 :         so->currPos.moreLeft = true;
    2658         564 :         so->currPos.moreRight = true;
    2659         564 :         so->needPrimScan = false;
    2660             :     }
    2661    10893566 :     else if (ScanDirectionIsForward(dir))
    2662             :     {
    2663    10844004 :         so->currPos.moreLeft = false;
    2664    10844004 :         so->currPos.moreRight = true;
    2665             :     }
    2666             :     else
    2667             :     {
    2668       49562 :         so->currPos.moreLeft = true;
    2669       49562 :         so->currPos.moreRight = false;
    2670             :     }
    2671    10894130 :     so->numKilled = 0;           /* just paranoia */
    2672    10894130 :     so->markItemIndex = -1;      /* ditto */
    2673    10894130 : }

Generated by: LCOV version 1.14