LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsearch.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 614 685 89.6 %
Date: 2025-04-01 16:15:31 Functions: 20 20 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsearch.c
       4             :  *    Search code for postgres btrees.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtsearch.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include "access/nbtree.h"
      19             : #include "access/relscan.h"
      20             : #include "access/xact.h"
      21             : #include "miscadmin.h"
      22             : #include "pgstat.h"
      23             : #include "storage/predicate.h"
      24             : #include "utils/lsyscache.h"
      25             : #include "utils/rel.h"
      26             : 
      27             : 
      28             : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
      29             : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
      30             :                             Buffer buf, bool forupdate, BTStack stack,
      31             :                             int access);
      32             : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
      33             : static int  _bt_binsrch_posting(BTScanInsert key, Page page,
      34             :                                 OffsetNumber offnum);
      35             : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
      36             :                          OffsetNumber offnum, bool firstpage);
      37             : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
      38             :                          OffsetNumber offnum, IndexTuple itup);
      39             : static int  _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
      40             :                                   OffsetNumber offnum, ItemPointer heapTid,
      41             :                                   IndexTuple itup);
      42             : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
      43             :                                        OffsetNumber offnum,
      44             :                                        ItemPointer heapTid, int tupleOffset);
      45             : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
      46             : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
      47             : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
      48             :                               ScanDirection dir);
      49             : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
      50             :                              BlockNumber lastcurrblkno, ScanDirection dir,
      51             :                              bool seized);
      52             : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
      53             :                                          BlockNumber lastcurrblkno);
      54             : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
      55             : 
      56             : 
      57             : /*
      58             :  *  _bt_drop_lock_and_maybe_pin()
      59             :  *
      60             :  * Unlock the buffer; and if it is safe to release the pin, do that, too.
      61             :  * This will prevent vacuum from stalling in a blocked state trying to read a
      62             :  * page when a cursor is sitting on it.
      63             :  *
      64             :  * See nbtree/README section on making concurrent TID recycling safe.
      65             :  */
      66             : static void
      67    10178352 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
      68             : {
      69    10178352 :     _bt_unlockbuf(scan->indexRelation, sp->buf);
      70             : 
      71    10178352 :     if (IsMVCCSnapshot(scan->xs_snapshot) &&
      72     9860174 :         RelationNeedsWAL(scan->indexRelation) &&
      73     9855230 :         !scan->xs_want_itup)
      74             :     {
      75     9720320 :         ReleaseBuffer(sp->buf);
      76     9720320 :         sp->buf = InvalidBuffer;
      77             :     }
      78    10178352 : }
      79             : 
      80             : /*
      81             :  *  _bt_search() -- Search the tree for a particular scankey,
      82             :  *      or more precisely for the first leaf page it could be on.
      83             :  *
      84             :  * The passed scankey is an insertion-type scankey (see nbtree/README),
      85             :  * but it can omit the rightmost column(s) of the index.
      86             :  *
      87             :  * Return value is a stack of parent-page pointers (i.e. there is no entry for
      88             :  * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
      89             :  * which is locked and pinned.  No locks are held on the parent pages,
      90             :  * however!
      91             :  *
      92             :  * The returned buffer is locked according to access parameter.  Additionally,
      93             :  * access = BT_WRITE will allow an empty root page to be created and returned.
      94             :  * When access = BT_READ, an empty index will result in *bufP being set to
      95             :  * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
      96             :  * during the search will be finished.
      97             :  *
      98             :  * heaprel must be provided by callers that pass access = BT_WRITE, since we
      99             :  * might need to allocate a new root page for caller -- see _bt_allocbuf.
     100             :  */
     101             : BTStack
     102    21678744 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
     103             :            int access)
     104             : {
     105    21678744 :     BTStack     stack_in = NULL;
     106    21678744 :     int         page_access = BT_READ;
     107             : 
     108             :     /* heaprel must be set whenever _bt_allocbuf is reachable */
     109             :     Assert(access == BT_READ || access == BT_WRITE);
     110             :     Assert(access == BT_READ || heaprel != NULL);
     111             : 
     112             :     /* Get the root page to start with */
     113    21678744 :     *bufP = _bt_getroot(rel, heaprel, access);
     114             : 
     115             :     /* If index is empty and access = BT_READ, no root page is created. */
     116    21678744 :     if (!BufferIsValid(*bufP))
     117      538490 :         return (BTStack) NULL;
     118             : 
     119             :     /* Loop iterates once per level descended in the tree */
     120             :     for (;;)
     121    17475698 :     {
     122             :         Page        page;
     123             :         BTPageOpaque opaque;
     124             :         OffsetNumber offnum;
     125             :         ItemId      itemid;
     126             :         IndexTuple  itup;
     127             :         BlockNumber child;
     128             :         BTStack     new_stack;
     129             : 
     130             :         /*
     131             :          * Race -- the page we just grabbed may have split since we read its
     132             :          * downlink in its parent page (or the metapage).  If it has, we may
     133             :          * need to move right to its new sibling.  Do that.
     134             :          *
     135             :          * In write-mode, allow _bt_moveright to finish any incomplete splits
     136             :          * along the way.  Strictly speaking, we'd only need to finish an
     137             :          * incomplete split on the leaf page we're about to insert to, not on
     138             :          * any of the upper levels (internal pages with incomplete splits are
     139             :          * also taken care of in _bt_getstackbuf).  But this is a good
     140             :          * opportunity to finish splits of internal pages too.
     141             :          */
     142    38615952 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
     143             :                               stack_in, page_access);
     144             : 
     145             :         /* if this is a leaf page, we're done */
     146    38615952 :         page = BufferGetPage(*bufP);
     147    38615952 :         opaque = BTPageGetOpaque(page);
     148    38615952 :         if (P_ISLEAF(opaque))
     149    21140254 :             break;
     150             : 
     151             :         /*
     152             :          * Find the appropriate pivot tuple on this page.  Its downlink points
     153             :          * to the child page that we're about to descend to.
     154             :          */
     155    17475698 :         offnum = _bt_binsrch(rel, key, *bufP);
     156    17475698 :         itemid = PageGetItemId(page, offnum);
     157    17475698 :         itup = (IndexTuple) PageGetItem(page, itemid);
     158             :         Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
     159    17475698 :         child = BTreeTupleGetDownLink(itup);
     160             : 
     161             :         /*
     162             :          * We need to save the location of the pivot tuple we chose in a new
     163             :          * stack entry for this page/level.  If caller ends up splitting a
     164             :          * page one level down, it usually ends up inserting a new pivot
     165             :          * tuple/downlink immediately after the location recorded here.
     166             :          */
     167    17475698 :         new_stack = (BTStack) palloc(sizeof(BTStackData));
     168    17475698 :         new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
     169    17475698 :         new_stack->bts_offset = offnum;
     170    17475698 :         new_stack->bts_parent = stack_in;
     171             : 
     172             :         /*
     173             :          * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
     174             :          * we're on the level 1 and asked to lock leaf page in write mode,
     175             :          * then lock next page in write mode, because it must be a leaf.
     176             :          */
     177    17475698 :         if (opaque->btpo_level == 1 && access == BT_WRITE)
     178     6054368 :             page_access = BT_WRITE;
     179             : 
     180             :         /* drop the read lock on the page, then acquire one on its child */
     181    17475698 :         *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
     182             : 
     183             :         /* okay, all set to move down a level */
     184    17475698 :         stack_in = new_stack;
     185             :     }
     186             : 
     187             :     /*
     188             :      * If we're asked to lock leaf in write mode, but didn't manage to, then
     189             :      * relock.  This should only happen when the root page is a leaf page (and
     190             :      * the only page in the index other than the metapage).
     191             :      */
     192    21140254 :     if (access == BT_WRITE && page_access == BT_READ)
     193             :     {
     194             :         /* trade in our read lock for a write lock */
     195      840878 :         _bt_unlockbuf(rel, *bufP);
     196      840878 :         _bt_lockbuf(rel, *bufP, BT_WRITE);
     197             : 
     198             :         /*
     199             :          * Race -- the leaf page may have split after we dropped the read lock
     200             :          * but before we acquired a write lock.  If it has, we may need to
     201             :          * move right to its new sibling.  Do that.
     202             :          */
     203      840878 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
     204             :     }
     205             : 
     206    21140254 :     return stack_in;
     207             : }
     208             : 
     209             : /*
     210             :  *  _bt_moveright() -- move right in the btree if necessary.
     211             :  *
     212             :  * When we follow a pointer to reach a page, it is possible that
     213             :  * the page has changed in the meanwhile.  If this happens, we're
     214             :  * guaranteed that the page has "split right" -- that is, that any
     215             :  * data that appeared on the page originally is either on the page
     216             :  * or strictly to the right of it.
     217             :  *
     218             :  * This routine decides whether or not we need to move right in the
     219             :  * tree by examining the high key entry on the page.  If that entry is
     220             :  * strictly less than the scankey, or <= the scankey in the
     221             :  * key.nextkey=true case, then we followed the wrong link and we need
     222             :  * to move right.
     223             :  *
     224             :  * The passed insertion-type scankey can omit the rightmost column(s) of the
     225             :  * index. (see nbtree/README)
     226             :  *
     227             :  * When key.nextkey is false (the usual case), we are looking for the first
     228             :  * item >= key.  When key.nextkey is true, we are looking for the first item
     229             :  * strictly greater than key.
     230             :  *
     231             :  * If forupdate is true, we will attempt to finish any incomplete splits
     232             :  * that we encounter.  This is required when locking a target page for an
     233             :  * insertion, because we don't allow inserting on a page before the split is
     234             :  * completed.  'heaprel' and 'stack' are only used if forupdate is true.
     235             :  *
     236             :  * On entry, we have the buffer pinned and a lock of the type specified by
     237             :  * 'access'.  If we move right, we release the buffer and lock and acquire
     238             :  * the same on the right sibling.  Return value is the buffer we stop at.
     239             :  */
     240             : static Buffer
     241    39456830 : _bt_moveright(Relation rel,
     242             :               Relation heaprel,
     243             :               BTScanInsert key,
     244             :               Buffer buf,
     245             :               bool forupdate,
     246             :               BTStack stack,
     247             :               int access)
     248             : {
     249             :     Page        page;
     250             :     BTPageOpaque opaque;
     251             :     int32       cmpval;
     252             : 
     253             :     Assert(!forupdate || heaprel != NULL);
     254             : 
     255             :     /*
     256             :      * When nextkey = false (normal case): if the scan key that brought us to
     257             :      * this page is > the high key stored on the page, then the page has split
     258             :      * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     259             :      * have some duplicates to the right as well as the left, but that's
     260             :      * something that's only ever dealt with on the leaf level, after
     261             :      * _bt_search has found an initial leaf page.)
     262             :      *
     263             :      * When nextkey = true: move right if the scan key is >= page's high key.
     264             :      * (Note that key.scantid cannot be set in this case.)
     265             :      *
     266             :      * The page could even have split more than once, so scan as far as
     267             :      * needed.
     268             :      *
     269             :      * We also have to move right if we followed a link that brought us to a
     270             :      * dead page.
     271             :      */
     272    39456830 :     cmpval = key->nextkey ? 0 : 1;
     273             : 
     274             :     for (;;)
     275             :     {
     276    39458198 :         page = BufferGetPage(buf);
     277    39458198 :         opaque = BTPageGetOpaque(page);
     278             : 
     279    39458198 :         if (P_RIGHTMOST(opaque))
     280    29986460 :             break;
     281             : 
     282             :         /*
     283             :          * Finish any incomplete splits we encounter along the way.
     284             :          */
     285     9471738 :         if (forupdate && P_INCOMPLETE_SPLIT(opaque))
     286             :         {
     287           0 :             BlockNumber blkno = BufferGetBlockNumber(buf);
     288             : 
     289             :             /* upgrade our lock if necessary */
     290           0 :             if (access == BT_READ)
     291             :             {
     292           0 :                 _bt_unlockbuf(rel, buf);
     293           0 :                 _bt_lockbuf(rel, buf, BT_WRITE);
     294             :             }
     295             : 
     296           0 :             if (P_INCOMPLETE_SPLIT(opaque))
     297           0 :                 _bt_finish_split(rel, heaprel, buf, stack);
     298             :             else
     299           0 :                 _bt_relbuf(rel, buf);
     300             : 
     301             :             /* re-acquire the lock in the right mode, and re-check */
     302           0 :             buf = _bt_getbuf(rel, blkno, access);
     303           0 :             continue;
     304             :         }
     305             : 
     306     9471738 :         if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
     307             :         {
     308             :             /* step right one page */
     309        1368 :             buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
     310        1368 :             continue;
     311             :         }
     312             :         else
     313             :             break;
     314             :     }
     315             : 
     316    39456830 :     if (P_IGNORE(opaque))
     317           0 :         elog(ERROR, "fell off the end of index \"%s\"",
     318             :              RelationGetRelationName(rel));
     319             : 
     320    39456830 :     return buf;
     321             : }
     322             : 
     323             : /*
     324             :  *  _bt_binsrch() -- Do a binary search for a key on a particular page.
     325             :  *
     326             :  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
     327             :  * of the last key < given scankey, or last key <= given scankey if nextkey
     328             :  * is true.  (Since _bt_compare treats the first data key of such a page as
     329             :  * minus infinity, there will be at least one key < scankey, so the result
     330             :  * always points at one of the keys on the page.)
     331             :  *
     332             :  * On a leaf page, _bt_binsrch() returns the final result of the initial
     333             :  * positioning process that started with _bt_first's call to _bt_search.
     334             :  * We're returning a non-pivot tuple offset, so things are a little different.
     335             :  * It is possible that we'll return an offset that's either past the last
     336             :  * non-pivot slot, or (in the case of a backward scan) before the first slot.
     337             :  *
     338             :  * This procedure is not responsible for walking right, it just examines
     339             :  * the given page.  _bt_binsrch() has no lock or refcount side effects
     340             :  * on the buffer.
     341             :  */
     342             : static OffsetNumber
     343    31313090 : _bt_binsrch(Relation rel,
     344             :             BTScanInsert key,
     345             :             Buffer buf)
     346             : {
     347             :     Page        page;
     348             :     BTPageOpaque opaque;
     349             :     OffsetNumber low,
     350             :                 high;
     351             :     int32       result,
     352             :                 cmpval;
     353             : 
     354    31313090 :     page = BufferGetPage(buf);
     355    31313090 :     opaque = BTPageGetOpaque(page);
     356             : 
     357             :     /* Requesting nextkey semantics while using scantid seems nonsensical */
     358             :     Assert(!key->nextkey || key->scantid == NULL);
     359             :     /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
     360             :     Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
     361             : 
     362    31313090 :     low = P_FIRSTDATAKEY(opaque);
     363    31313090 :     high = PageGetMaxOffsetNumber(page);
     364             : 
     365             :     /*
     366             :      * If there are no keys on the page, return the first available slot. Note
     367             :      * this covers two cases: the page is really empty (no keys), or it
     368             :      * contains only a high key.  The latter case is possible after vacuuming.
     369             :      * This can never happen on an internal page, however, since they are
     370             :      * never empty (an internal page must have at least one child).
     371             :      */
     372    31313090 :     if (unlikely(high < low))
     373        7328 :         return low;
     374             : 
     375             :     /*
     376             :      * Binary search to find the first key on the page >= scan key, or first
     377             :      * key > scankey when nextkey is true.
     378             :      *
     379             :      * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     380             :      * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     381             :      *
     382             :      * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     383             :      * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     384             :      *
     385             :      * We can fall out when high == low.
     386             :      */
     387    31305762 :     high++;                     /* establish the loop invariant for high */
     388             : 
     389    31305762 :     cmpval = key->nextkey ? 0 : 1;   /* select comparison value */
     390             : 
     391   204219336 :     while (high > low)
     392             :     {
     393   172913574 :         OffsetNumber mid = low + ((high - low) / 2);
     394             : 
     395             :         /* We have low <= mid < high, so mid points at a real slot */
     396             : 
     397   172913574 :         result = _bt_compare(rel, key, page, mid);
     398             : 
     399   172913574 :         if (result >= cmpval)
     400   108190592 :             low = mid + 1;
     401             :         else
     402    64722982 :             high = mid;
     403             :     }
     404             : 
     405             :     /*
     406             :      * At this point we have high == low.
     407             :      *
     408             :      * On a leaf page we always return the first non-pivot tuple >= scan key
     409             :      * (resp. > scan key) for forward scan callers.  For backward scans, it's
     410             :      * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
     411             :      */
     412    31305762 :     if (P_ISLEAF(opaque))
     413             :     {
     414             :         /*
     415             :          * In the backward scan case we're supposed to locate the last
     416             :          * matching tuple on the leaf level -- not the first matching tuple
     417             :          * (the last tuple will be the first one returned by the scan).
     418             :          *
     419             :          * At this point we've located the first non-pivot tuple immediately
     420             :          * after the last matching tuple (which might just be maxoff + 1).
     421             :          * Compensate by stepping back.
     422             :          */
     423    13830064 :         if (key->backward)
     424       52676 :             return OffsetNumberPrev(low);
     425             : 
     426    13777388 :         return low;
     427             :     }
     428             : 
     429             :     /*
     430             :      * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     431             :      * There must be one if _bt_compare() is playing by the rules.
     432             :      *
     433             :      * _bt_compare() will seldom see any exactly-matching pivot tuples, since
     434             :      * a truncated -inf heap TID is usually enough to prevent it altogether.
     435             :      * Even omitted scan key entries are treated as > truncated attributes.
     436             :      *
     437             :      * However, during backward scans _bt_compare() interprets omitted scan
     438             :      * key attributes as == corresponding truncated -inf attributes instead.
     439             :      * This works just like < would work here.  Under this scheme, < strategy
     440             :      * backward scans will always directly descend to the correct leaf page.
     441             :      * In particular, they will never incur an "extra" leaf page access with a
     442             :      * scan key that happens to contain the same prefix of values as some
     443             :      * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when
     444             :      * it uses a leaf page high key to "re-find" a page undergoing deletion.
     445             :      */
     446             :     Assert(low > P_FIRSTDATAKEY(opaque));
     447             : 
     448    17475698 :     return OffsetNumberPrev(low);
     449             : }
     450             : 
     451             : /*
     452             :  *
     453             :  *  _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
     454             :  *
     455             :  * Like _bt_binsrch(), but with support for caching the binary search
     456             :  * bounds.  Only used during insertion, and only on the leaf page that it
     457             :  * looks like caller will insert tuple on.  Exclusive-locked and pinned
     458             :  * leaf page is contained within insertstate.
     459             :  *
     460             :  * Caches the bounds fields in insertstate so that a subsequent call can
     461             :  * reuse the low and strict high bounds of original binary search.  Callers
     462             :  * that use these fields directly must be prepared for the case where low
     463             :  * and/or stricthigh are not on the same page (one or both exceed maxoff
     464             :  * for the page).  The case where there are no items on the page (high <
     465             :  * low) makes bounds invalid.
     466             :  *
     467             :  * Caller is responsible for invalidating bounds when it modifies the page
     468             :  * before calling here a second time, and for dealing with posting list
     469             :  * tuple matches (callers can use insertstate's postingoff field to
     470             :  * determine which existing heap TID will need to be replaced by a posting
     471             :  * list split).
     472             :  */
     473             : OffsetNumber
     474    12341298 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
     475             : {
     476    12341298 :     BTScanInsert key = insertstate->itup_key;
     477             :     Page        page;
     478             :     BTPageOpaque opaque;
     479             :     OffsetNumber low,
     480             :                 high,
     481             :                 stricthigh;
     482             :     int32       result,
     483             :                 cmpval;
     484             : 
     485    12341298 :     page = BufferGetPage(insertstate->buf);
     486    12341298 :     opaque = BTPageGetOpaque(page);
     487             : 
     488             :     Assert(P_ISLEAF(opaque));
     489             :     Assert(!key->nextkey);
     490             :     Assert(insertstate->postingoff == 0);
     491             : 
     492    12341298 :     if (!insertstate->bounds_valid)
     493             :     {
     494             :         /* Start new binary search */
     495     7383006 :         low = P_FIRSTDATAKEY(opaque);
     496     7383006 :         high = PageGetMaxOffsetNumber(page);
     497             :     }
     498             :     else
     499             :     {
     500             :         /* Restore result of previous binary search against same page */
     501     4958292 :         low = insertstate->low;
     502     4958292 :         high = insertstate->stricthigh;
     503             :     }
     504             : 
     505             :     /* If there are no keys on the page, return the first available slot */
     506    12341298 :     if (unlikely(high < low))
     507             :     {
     508             :         /* Caller can't reuse bounds */
     509       22470 :         insertstate->low = InvalidOffsetNumber;
     510       22470 :         insertstate->stricthigh = InvalidOffsetNumber;
     511       22470 :         insertstate->bounds_valid = false;
     512       22470 :         return low;
     513             :     }
     514             : 
     515             :     /*
     516             :      * Binary search to find the first key on the page >= scan key. (nextkey
     517             :      * is always false when inserting).
     518             :      *
     519             :      * The loop invariant is: all slots before 'low' are < scan key, all slots
     520             :      * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     521             :      * maintained to save additional search effort for caller.
     522             :      *
     523             :      * We can fall out when high == low.
     524             :      */
     525    12318828 :     if (!insertstate->bounds_valid)
     526     7360536 :         high++;                 /* establish the loop invariant for high */
     527    12318828 :     stricthigh = high;          /* high initially strictly higher */
     528             : 
     529    12318828 :     cmpval = 1;                 /* !nextkey comparison value */
     530             : 
     531    66248328 :     while (high > low)
     532             :     {
     533    53929500 :         OffsetNumber mid = low + ((high - low) / 2);
     534             : 
     535             :         /* We have low <= mid < high, so mid points at a real slot */
     536             : 
     537    53929500 :         result = _bt_compare(rel, key, page, mid);
     538             : 
     539    53929500 :         if (result >= cmpval)
     540    41305648 :             low = mid + 1;
     541             :         else
     542             :         {
     543    12623852 :             high = mid;
     544    12623852 :             if (result != 0)
     545    11539686 :                 stricthigh = high;
     546             :         }
     547             : 
     548             :         /*
     549             :          * If tuple at offset located by binary search is a posting list whose
     550             :          * TID range overlaps with caller's scantid, perform posting list
     551             :          * binary search to set postingoff for caller.  Caller must split the
     552             :          * posting list when postingoff is set.  This should happen
     553             :          * infrequently.
     554             :          */
     555    53929500 :         if (unlikely(result == 0 && key->scantid != NULL))
     556             :         {
     557             :             /*
     558             :              * postingoff should never be set more than once per leaf page
     559             :              * binary search.  That would mean that there are duplicate table
     560             :              * TIDs in the index, which is never okay.  Check for that here.
     561             :              */
     562      423322 :             if (insertstate->postingoff != 0)
     563           0 :                 ereport(ERROR,
     564             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
     565             :                          errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
     566             :                                          ItemPointerGetBlockNumber(key->scantid),
     567             :                                          ItemPointerGetOffsetNumber(key->scantid),
     568             :                                          low, stricthigh,
     569             :                                          BufferGetBlockNumber(insertstate->buf),
     570             :                                          RelationGetRelationName(rel))));
     571             : 
     572      423322 :             insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
     573             :         }
     574             :     }
     575             : 
     576             :     /*
     577             :      * On a leaf page, a binary search always returns the first key >= scan
     578             :      * key (at least in !nextkey case), which could be the last slot + 1. This
     579             :      * is also the lower bound of cached search.
     580             :      *
     581             :      * stricthigh may also be the last slot + 1, which prevents caller from
     582             :      * using bounds directly, but is still useful to us if we're called a
     583             :      * second time with cached bounds (cached low will be < stricthigh when
     584             :      * that happens).
     585             :      */
     586    12318828 :     insertstate->low = low;
     587    12318828 :     insertstate->stricthigh = stricthigh;
     588    12318828 :     insertstate->bounds_valid = true;
     589             : 
     590    12318828 :     return low;
     591             : }
     592             : 
     593             : /*----------
     594             :  *  _bt_binsrch_posting() -- posting list binary search.
     595             :  *
     596             :  * Helper routine for _bt_binsrch_insert().
     597             :  *
     598             :  * Returns offset into posting list where caller's scantid belongs.
     599             :  *----------
     600             :  */
     601             : static int
     602      423322 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
     603             : {
     604             :     IndexTuple  itup;
     605             :     ItemId      itemid;
     606             :     int         low,
     607             :                 high,
     608             :                 mid,
     609             :                 res;
     610             : 
     611             :     /*
     612             :      * If this isn't a posting tuple, then the index must be corrupt (if it is
     613             :      * an ordinary non-pivot tuple then there must be an existing tuple with a
     614             :      * heap TID that equals inserter's new heap TID/scantid).  Defensively
     615             :      * check that tuple is a posting list tuple whose posting list range
     616             :      * includes caller's scantid.
     617             :      *
     618             :      * (This is also needed because contrib/amcheck's rootdescend option needs
     619             :      * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     620             :      */
     621      423322 :     itemid = PageGetItemId(page, offnum);
     622      423322 :     itup = (IndexTuple) PageGetItem(page, itemid);
     623      423322 :     if (!BTreeTupleIsPosting(itup))
     624      402196 :         return 0;
     625             : 
     626             :     Assert(key->heapkeyspace && key->allequalimage);
     627             : 
     628             :     /*
     629             :      * In the event that posting list tuple has LP_DEAD bit set, indicate this
     630             :      * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     631             :      * second call to _bt_binsrch_insert() can take place when its caller has
     632             :      * removed the dead item.
     633             :      */
     634       21126 :     if (ItemIdIsDead(itemid))
     635           8 :         return -1;
     636             : 
     637             :     /* "high" is past end of posting list for loop invariant */
     638       21118 :     low = 0;
     639       21118 :     high = BTreeTupleGetNPosting(itup);
     640             :     Assert(high >= 2);
     641             : 
     642      171068 :     while (high > low)
     643             :     {
     644      149950 :         mid = low + ((high - low) / 2);
     645      149950 :         res = ItemPointerCompare(key->scantid,
     646             :                                  BTreeTupleGetPostingN(itup, mid));
     647             : 
     648      149950 :         if (res > 0)
     649       78934 :             low = mid + 1;
     650       71016 :         else if (res < 0)
     651       71016 :             high = mid;
     652             :         else
     653           0 :             return mid;
     654             :     }
     655             : 
     656             :     /* Exact match not found */
     657       21118 :     return low;
     658             : }
     659             : 
     660             : /*----------
     661             :  *  _bt_compare() -- Compare insertion-type scankey to tuple on a page.
     662             :  *
     663             :  *  page/offnum: location of btree item to be compared to.
     664             :  *
     665             :  *      This routine returns:
     666             :  *          <0 if scankey < tuple at offnum;
     667             :  *           0 if scankey == tuple at offnum;
     668             :  *          >0 if scankey > tuple at offnum.
     669             :  *
     670             :  * NULLs in the keys are treated as sortable values.  Therefore
     671             :  * "equality" does not necessarily mean that the item should be returned
     672             :  * to the caller as a matching key.  Similarly, an insertion scankey
     673             :  * with its scantid set is treated as equal to a posting tuple whose TID
     674             :  * range overlaps with their scantid.  There generally won't be a
     675             :  * matching TID in the posting tuple, which caller must handle
     676             :  * themselves (e.g., by splitting the posting list tuple).
     677             :  *
     678             :  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
     679             :  * "minus infinity": this routine will always claim it is less than the
     680             :  * scankey.  The actual key value stored is explicitly truncated to 0
     681             :  * attributes (explicitly minus infinity) with version 3+ indexes, but
     682             :  * that isn't relied upon.  This allows us to implement the Lehman and
     683             :  * Yao convention that the first down-link pointer is before the first
     684             :  * key.  See backend/access/nbtree/README for details.
     685             :  *----------
     686             :  */
     687             : int32
     688   253410830 : _bt_compare(Relation rel,
     689             :             BTScanInsert key,
     690             :             Page page,
     691             :             OffsetNumber offnum)
     692             : {
     693   253410830 :     TupleDesc   itupdesc = RelationGetDescr(rel);
     694   253410830 :     BTPageOpaque opaque = BTPageGetOpaque(page);
     695             :     IndexTuple  itup;
     696             :     ItemPointer heapTid;
     697             :     ScanKey     scankey;
     698             :     int         ncmpkey;
     699             :     int         ntupatts;
     700             :     int32       result;
     701             : 
     702             :     Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
     703             :     Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
     704             :     Assert(key->heapkeyspace || key->scantid == NULL);
     705             : 
     706             :     /*
     707             :      * Force result ">" if target item is first data item on an internal page
     708             :      * --- see NOTE above.
     709             :      */
     710   253410830 :     if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
     711     3235508 :         return 1;
     712             : 
     713   250175322 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     714   250175322 :     ntupatts = BTreeTupleGetNAtts(itup, rel);
     715             : 
     716             :     /*
     717             :      * The scan key is set up with the attribute number associated with each
     718             :      * term in the key.  It is important that, if the index is multi-key, the
     719             :      * scan contain the first k key attributes, and that they be in order.  If
     720             :      * you think about how multi-key ordering works, you'll understand why
     721             :      * this is.
     722             :      *
     723             :      * We don't test for violation of this condition here, however.  The
     724             :      * initial setup for the index scan had better have gotten it right (see
     725             :      * _bt_first).
     726             :      */
     727             : 
     728   250175322 :     ncmpkey = Min(ntupatts, key->keysz);
     729             :     Assert(key->heapkeyspace || ncmpkey == key->keysz);
     730             :     Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
     731   250175322 :     scankey = key->scankeys;
     732   314858636 :     for (int i = 1; i <= ncmpkey; i++)
     733             :     {
     734             :         Datum       datum;
     735             :         bool        isNull;
     736             : 
     737   292652038 :         datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
     738             : 
     739   292652038 :         if (scankey->sk_flags & SK_ISNULL)   /* key is NULL */
     740             :         {
     741      527054 :             if (isNull)
     742      157024 :                 result = 0;     /* NULL "=" NULL */
     743      370030 :             else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     744         264 :                 result = -1;    /* NULL "<" NOT_NULL */
     745             :             else
     746      369766 :                 result = 1;     /* NULL ">" NOT_NULL */
     747             :         }
     748   292124984 :         else if (isNull)        /* key is NOT_NULL and item is NULL */
     749             :         {
     750         222 :             if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     751           0 :                 result = 1;     /* NOT_NULL ">" NULL */
     752             :             else
     753         222 :                 result = -1;    /* NOT_NULL "<" NULL */
     754             :         }
     755             :         else
     756             :         {
     757             :             /*
     758             :              * The sk_func needs to be passed the index value as left arg and
     759             :              * the sk_argument as right arg (they might be of different
     760             :              * types).  Since it is convenient for callers to think of
     761             :              * _bt_compare as comparing the scankey to the index item, we have
     762             :              * to flip the sign of the comparison result.  (Unless it's a DESC
     763             :              * column, in which case we *don't* flip the sign.)
     764             :              */
     765   292124762 :             result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
     766             :                                                      scankey->sk_collation,
     767             :                                                      datum,
     768             :                                                      scankey->sk_argument));
     769             : 
     770   292124762 :             if (!(scankey->sk_flags & SK_BT_DESC))
     771   292124696 :                 INVERT_COMPARE_RESULT(result);
     772             :         }
     773             : 
     774             :         /* if the keys are unequal, return the difference */
     775   292652038 :         if (result != 0)
     776   227968724 :             return result;
     777             : 
     778    64683314 :         scankey++;
     779             :     }
     780             : 
     781             :     /*
     782             :      * All non-truncated attributes (other than heap TID) were found to be
     783             :      * equal.  Treat truncated attributes as minus infinity when scankey has a
     784             :      * key attribute value that would otherwise be compared directly.
     785             :      *
     786             :      * Note: it doesn't matter if ntupatts includes non-key attributes;
     787             :      * scankey won't, so explicitly excluding non-key attributes isn't
     788             :      * necessary.
     789             :      */
     790    22206598 :     if (key->keysz > ntupatts)
     791      200520 :         return 1;
     792             : 
     793             :     /*
     794             :      * Use the heap TID attribute and scantid to try to break the tie.  The
     795             :      * rules are the same as any other key attribute -- only the
     796             :      * representation differs.
     797             :      */
     798    22006078 :     heapTid = BTreeTupleGetHeapTID(itup);
     799    22006078 :     if (key->scantid == NULL)
     800             :     {
     801             :         /*
     802             :          * Forward scans have a scankey that is considered greater than a
     803             :          * truncated pivot tuple if and when the scankey has equal values for
     804             :          * attributes up to and including the least significant untruncated
     805             :          * attribute in tuple.  Even attributes that were omitted from the
     806             :          * scan key are considered greater than -inf truncated attributes.
     807             :          * (See _bt_binsrch for an explanation of our backward scan behavior.)
     808             :          *
     809             :          * For example, if an index has the minimum two attributes (single
     810             :          * user key attribute, plus heap TID attribute), and a page's high key
     811             :          * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
     812             :          * will not descend to the page to the left.  The search will descend
     813             :          * right instead.  The truncated attribute in pivot tuple means that
     814             :          * all non-pivot tuples on the page to the left are strictly < 'foo',
     815             :          * so it isn't necessary to descend left.  In other words, search
     816             :          * doesn't have to descend left because it isn't interested in a match
     817             :          * that has a heap TID value of -inf.
     818             :          *
     819             :          * Note: the heap TID part of the test ensures that scankey is being
     820             :          * compared to a pivot tuple with one or more truncated -inf key
     821             :          * attributes.  The heap TID attribute is the last key attribute in
     822             :          * every index, of course, but other than that it isn't special.
     823             :          */
     824    17688578 :         if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
     825        9116 :             key->heapkeyspace)
     826        9116 :             return 1;
     827             : 
     828             :         /* All provided scankey arguments found to be equal */
     829    17679462 :         return 0;
     830             :     }
     831             : 
     832             :     /*
     833             :      * Treat truncated heap TID as minus infinity, since scankey has a key
     834             :      * attribute value (scantid) that would otherwise be compared directly
     835             :      */
     836             :     Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
     837     4317500 :     if (heapTid == NULL)
     838        3962 :         return 1;
     839             : 
     840             :     /*
     841             :      * Scankey must be treated as equal to a posting list tuple if its scantid
     842             :      * value falls within the range of the posting list.  In all other cases
     843             :      * there can only be a single heap TID value, which is compared directly
     844             :      * with scantid.
     845             :      */
     846             :     Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
     847     4313538 :     result = ItemPointerCompare(key->scantid, heapTid);
     848     4313538 :     if (result <= 0 || !BTreeTupleIsPosting(itup))
     849     4154394 :         return result;
     850             :     else
     851             :     {
     852      159144 :         result = ItemPointerCompare(key->scantid,
     853             :                                     BTreeTupleGetMaxHeapTID(itup));
     854      159144 :         if (result > 0)
     855      138018 :             return 1;
     856             :     }
     857             : 
     858       21126 :     return 0;
     859             : }
     860             : 
     861             : /*
     862             :  *  _bt_first() -- Find the first item in a scan.
     863             :  *
     864             :  *      We need to be clever about the direction of scan, the search
     865             :  *      conditions, and the tree ordering.  We find the first item (or,
     866             :  *      if backwards scan, the last item) in the tree that satisfies the
     867             :  *      qualifications in the scan key.  On success exit, data about the
     868             :  *      matching tuple(s) on the page has been loaded into so->currPos.  We'll
     869             :  *      drop all locks and hold onto a pin on page's buffer, except when
     870             :  *      _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM.
     871             :  *      _bt_returnitem sets the next item to return to scan on success exit.
     872             :  *
     873             :  * If there are no matching items in the index, we return false, with no
     874             :  * pins or locks held.  so->currPos will remain invalid.
     875             :  *
     876             :  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
     877             :  * are both search-type scankeys (see nbtree/README for more about this).
     878             :  * Within this routine, we build a temporary insertion-type scankey to use
     879             :  * in locating the scan start position.
     880             :  */
     881             : bool
     882    14457598 : _bt_first(IndexScanDesc scan, ScanDirection dir)
     883             : {
     884    14457598 :     Relation    rel = scan->indexRelation;
     885    14457598 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     886             :     BTStack     stack;
     887             :     OffsetNumber offnum;
     888             :     BTScanInsertData inskey;
     889             :     ScanKey     startKeys[INDEX_MAX_KEYS];
     890             :     ScanKeyData notnullkeys[INDEX_MAX_KEYS];
     891    14457598 :     int         keysz = 0;
     892             :     StrategyNumber strat_total;
     893    14457598 :     BlockNumber blkno = InvalidBlockNumber,
     894             :                 lastcurrblkno;
     895             : 
     896             :     Assert(!BTScanPosIsValid(so->currPos));
     897             : 
     898             :     /*
     899             :      * Examine the scan keys and eliminate any redundant keys; also mark the
     900             :      * keys that must be matched to continue the scan.
     901             :      */
     902    14457598 :     _bt_preprocess_keys(scan);
     903             : 
     904             :     /*
     905             :      * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     906             :      * never be satisfied (eg, x == 1 AND x > 2).
     907             :      */
     908    14457598 :     if (!so->qual_ok)
     909             :     {
     910             :         Assert(!so->needPrimScan);
     911        1120 :         _bt_parallel_done(scan);
     912        1120 :         return false;
     913             :     }
     914             : 
     915             :     /*
     916             :      * If this is a parallel scan, we must seize the scan.  _bt_readfirstpage
     917             :      * will likely release the parallel scan later on.
     918             :      */
     919    14456478 :     if (scan->parallel_scan != NULL &&
     920         446 :         !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
     921         314 :         return false;
     922             : 
     923             :     /*
     924             :      * Initialize the scan's arrays (if any) for the current scan direction
     925             :      * (except when they were already set to later values as part of
     926             :      * scheduling the primitive index scan that is now underway)
     927             :      */
     928    14456164 :     if (so->numArrayKeys && !so->needPrimScan)
     929       67102 :         _bt_start_array_keys(scan, dir);
     930             : 
     931    14456164 :     if (blkno != InvalidBlockNumber)
     932             :     {
     933             :         /*
     934             :          * We anticipated calling _bt_search, but another worker bet us to it.
     935             :          * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
     936             :          */
     937             :         Assert(scan->parallel_scan != NULL);
     938             :         Assert(!so->needPrimScan);
     939             :         Assert(blkno != P_NONE);
     940             : 
     941           8 :         if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
     942           0 :             return false;
     943             : 
     944           8 :         _bt_returnitem(scan, so);
     945           8 :         return true;
     946             :     }
     947             : 
     948             :     /*
     949             :      * Count an indexscan for stats, now that we know that we'll call
     950             :      * _bt_search/_bt_endpoint below
     951             :      */
     952    14456156 :     pgstat_count_index_scan(rel);
     953    14456156 :     if (scan->instrument)
     954      848928 :         scan->instrument->nsearches++;
     955             : 
     956             :     /*----------
     957             :      * Examine the scan keys to discover where we need to start the scan.
     958             :      *
     959             :      * We want to identify the keys that can be used as starting boundaries;
     960             :      * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     961             :      * a backwards scan.  We can use keys for multiple attributes so long as
     962             :      * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     963             :      * a > or < boundary or find an attribute with no boundary (which can be
     964             :      * thought of as the same as "> -infinity"), we can't use keys for any
     965             :      * attributes to its right, because it would break our simplistic notion
     966             :      * of what initial positioning strategy to use.
     967             :      *
     968             :      * When the scan keys include cross-type operators, _bt_preprocess_keys
     969             :      * may not be able to eliminate redundant keys; in such cases we will
     970             :      * arbitrarily pick a usable one for each attribute.  This is correct
     971             :      * but possibly not optimal behavior.  (For example, with keys like
     972             :      * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     973             :      * x=5 would be more efficient.)  Since the situation only arises given
     974             :      * a poorly-worded query plus an incomplete opfamily, live with it.
     975             :      *
     976             :      * When both equality and inequality keys appear for a single attribute
     977             :      * (again, only possible when cross-type operators appear), we *must*
     978             :      * select one of the equality keys for the starting point, because
     979             :      * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     980             :      * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     981             :      * start at x=4, we will fail and stop before reaching x=10.  If multiple
     982             :      * equality quals survive preprocessing, however, it doesn't matter which
     983             :      * one we use --- by definition, they are either redundant or
     984             :      * contradictory.
     985             :      *
     986             :      * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     987             :      * If the index stores nulls at the end of the index we'll be starting
     988             :      * from, and we have no boundary key for the column (which means the key
     989             :      * we deduced NOT NULL from is an inequality key that constrains the other
     990             :      * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     991             :      * use as a boundary key.  If we didn't do this, we might find ourselves
     992             :      * traversing a lot of null entries at the start of the scan.
     993             :      *
     994             :      * In this loop, row-comparison keys are treated the same as keys on their
     995             :      * first (leftmost) columns.  We'll add on lower-order columns of the row
     996             :      * comparison below, if possible.
     997             :      *
     998             :      * The selected scan keys (at most one per index column) are remembered by
     999             :      * storing their addresses into the local startKeys[] array.
    1000             :      *
    1001             :      * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
    1002             :      * the next primitive index scan (for scans with array keys) based in part
    1003             :      * on an understanding of how it'll enable us to reposition the scan.
    1004             :      * They're directly aware of how we'll sometimes cons up an explicit
    1005             :      * SK_SEARCHNOTNULL key.  They'll even end primitive scans by applying a
    1006             :      * symmetric "deduce NOT NULL" rule of their own.  This allows top-level
    1007             :      * scans to skip large groups of NULLs through repeated deductions about
    1008             :      * key strictness (for a required inequality key) and whether NULLs in the
    1009             :      * key's index column are stored last or first (relative to non-NULLs).
    1010             :      * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
    1011             :      * need to be kept in sync.
    1012             :      *----------
    1013             :      */
    1014    14456156 :     strat_total = BTEqualStrategyNumber;
    1015    14456156 :     if (so->numberOfKeys > 0)
    1016             :     {
    1017             :         AttrNumber  curattr;
    1018             :         ScanKey     chosen;
    1019             :         ScanKey     impliesNN;
    1020             :         ScanKey     cur;
    1021             : 
    1022             :         /*
    1023             :          * chosen is the so-far-chosen key for the current attribute, if any.
    1024             :          * We don't cast the decision in stone until we reach keys for the
    1025             :          * next attribute.
    1026             :          */
    1027    14443244 :         cur = so->keyData;
    1028    14443244 :         curattr = 1;
    1029    14443244 :         chosen = NULL;
    1030             :         /* Also remember any scankey that implies a NOT NULL constraint */
    1031    14443244 :         impliesNN = NULL;
    1032             : 
    1033             :         /*
    1034             :          * Loop iterates from 0 to numberOfKeys inclusive; we use the last
    1035             :          * pass to handle after-last-key processing.  Actual exit from the
    1036             :          * loop is at one of the "break" statements below.
    1037             :          */
    1038    14443244 :         for (int i = 0;; cur++, i++)
    1039             :         {
    1040    37447678 :             if (i >= so->numberOfKeys || cur->sk_attno != curattr)
    1041             :             {
    1042             :                 /*
    1043             :                  * Done looking at keys for curattr.  If we didn't find a
    1044             :                  * usable boundary key, see if we can deduce a NOT NULL key.
    1045             :                  */
    1046    23075596 :                 if (chosen == NULL && impliesNN != NULL &&
    1047       70096 :                     ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1048             :                      ScanDirectionIsForward(dir) :
    1049             :                      ScanDirectionIsBackward(dir)))
    1050             :                 {
    1051             :                     /* Yes, so build the key in notnullkeys[keysz] */
    1052           6 :                     chosen = &notnullkeys[keysz];
    1053           6 :                     ScanKeyEntryInitialize(chosen,
    1054             :                                            (SK_SEARCHNOTNULL | SK_ISNULL |
    1055           6 :                                             (impliesNN->sk_flags &
    1056             :                                              (SK_BT_DESC | SK_BT_NULLS_FIRST))),
    1057             :                                            curattr,
    1058           6 :                                            ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1059             :                                             BTGreaterStrategyNumber :
    1060             :                                             BTLessStrategyNumber),
    1061             :                                            InvalidOid,
    1062             :                                            InvalidOid,
    1063             :                                            InvalidOid,
    1064             :                                            (Datum) 0);
    1065             :                 }
    1066             : 
    1067             :                 /*
    1068             :                  * If we still didn't find a usable boundary key, quit; else
    1069             :                  * save the boundary key pointer in startKeys.
    1070             :                  */
    1071    23005500 :                 if (chosen == NULL)
    1072       73022 :                     break;
    1073    22932478 :                 startKeys[keysz++] = chosen;
    1074             : 
    1075             :                 /*
    1076             :                  * We can only consider adding more boundary keys when the one
    1077             :                  * that we just chose to add uses either the = or >= strategy
    1078             :                  * (during backwards scans we can only do so when the key that
    1079             :                  * we just added to startKeys[] uses the = or <= strategy)
    1080             :                  */
    1081    22932478 :                 strat_total = chosen->sk_strategy;
    1082    22932478 :                 if (strat_total == BTGreaterStrategyNumber ||
    1083             :                     strat_total == BTLessStrategyNumber)
    1084             :                     break;
    1085             : 
    1086             :                 /*
    1087             :                  * Done if that was the last attribute, or if next key is not
    1088             :                  * in sequence (implying no boundary key is available for the
    1089             :                  * next attribute).
    1090             :                  */
    1091    21447650 :                 if (i >= so->numberOfKeys ||
    1092     8562938 :                     cur->sk_attno != curattr + 1)
    1093             :                     break;
    1094             : 
    1095             :                 /*
    1096             :                  * Reset for next attr.
    1097             :                  */
    1098     8562256 :                 curattr = cur->sk_attno;
    1099     8562256 :                 chosen = NULL;
    1100     8562256 :                 impliesNN = NULL;
    1101             :             }
    1102             : 
    1103             :             /*
    1104             :              * Can we use this key as a starting boundary for this attr?
    1105             :              *
    1106             :              * If not, does it imply a NOT NULL constraint?  (Because
    1107             :              * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
    1108             :              * *any* inequality key works for that; we need not test.)
    1109             :              */
    1110    23004434 :             switch (cur->sk_strategy)
    1111             :             {
    1112      124466 :                 case BTLessStrategyNumber:
    1113             :                 case BTLessEqualStrategyNumber:
    1114      124466 :                     if (chosen == NULL)
    1115             :                     {
    1116      122624 :                         if (ScanDirectionIsBackward(dir))
    1117       52540 :                             chosen = cur;
    1118             :                         else
    1119       70084 :                             impliesNN = cur;
    1120             :                     }
    1121      124466 :                     break;
    1122    21443158 :                 case BTEqualStrategyNumber:
    1123             :                     /* override any non-equality choice */
    1124    21443158 :                     chosen = cur;
    1125    21443158 :                     break;
    1126     1436810 :                 case BTGreaterEqualStrategyNumber:
    1127             :                 case BTGreaterStrategyNumber:
    1128     1436810 :                     if (chosen == NULL)
    1129             :                     {
    1130     1436810 :                         if (ScanDirectionIsForward(dir))
    1131     1436774 :                             chosen = cur;
    1132             :                         else
    1133          36 :                             impliesNN = cur;
    1134             :                     }
    1135     1436810 :                     break;
    1136             :             }
    1137    23004434 :         }
    1138             :     }
    1139             : 
    1140             :     /*
    1141             :      * If we found no usable boundary keys, we have to start from one end of
    1142             :      * the tree.  Walk down that edge to the first or last key, and scan from
    1143             :      * there.
    1144             :      *
    1145             :      * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
    1146             :      */
    1147    14456156 :     if (keysz == 0)
    1148       85782 :         return _bt_endpoint(scan, dir);
    1149             : 
    1150             :     /*
    1151             :      * We want to start the scan somewhere within the index.  Set up an
    1152             :      * insertion scankey we can use to search for the boundary point we
    1153             :      * identified above.  The insertion scankey is built using the keys
    1154             :      * identified by startKeys[].  (Remaining insertion scankey fields are
    1155             :      * initialized after initial-positioning scan keys are finalized.)
    1156             :      */
    1157             :     Assert(keysz <= INDEX_MAX_KEYS);
    1158    37302816 :     for (int i = 0; i < keysz; i++)
    1159             :     {
    1160    22932478 :         ScanKey     cur = startKeys[i];
    1161             : 
    1162             :         Assert(cur->sk_attno == i + 1);
    1163             : 
    1164    22932478 :         if (cur->sk_flags & SK_ROW_HEADER)
    1165             :         {
    1166             :             /*
    1167             :              * Row comparison header: look to the first row member instead
    1168             :              */
    1169          36 :             ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
    1170             : 
    1171             :             /*
    1172             :              * Cannot be a NULL in the first row member: _bt_preprocess_keys
    1173             :              * would've marked the qual as unsatisfiable, preventing us from
    1174             :              * ever getting this far
    1175             :              */
    1176             :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1177             :             Assert(subkey->sk_attno == cur->sk_attno);
    1178             :             Assert(!(subkey->sk_flags & SK_ISNULL));
    1179             : 
    1180             :             /*
    1181             :              * The member scankeys are already in insertion format (ie, they
    1182             :              * have sk_func = 3-way-comparison function)
    1183             :              */
    1184          36 :             memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
    1185             : 
    1186             :             /*
    1187             :              * If the row comparison is the last positioning key we accepted,
    1188             :              * try to add additional keys from the lower-order row members.
    1189             :              * (If we accepted independent conditions on additional index
    1190             :              * columns, we use those instead --- doesn't seem worth trying to
    1191             :              * determine which is more restrictive.)  Note that this is OK
    1192             :              * even if the row comparison is of ">" or "<" type, because the
    1193             :              * condition applied to all but the last row member is effectively
    1194             :              * ">=" or "<=", and so the extra keys don't break the positioning
    1195             :              * scheme.  But, by the same token, if we aren't able to use all
    1196             :              * the row members, then the part of the row comparison that we
    1197             :              * did use has to be treated as just a ">=" or "<=" condition, and
    1198             :              * so we'd better adjust strat_total accordingly.
    1199             :              */
    1200          36 :             if (i == keysz - 1)
    1201             :             {
    1202          36 :                 bool        used_all_subkeys = false;
    1203             : 
    1204             :                 Assert(!(subkey->sk_flags & SK_ROW_END));
    1205             :                 for (;;)
    1206             :                 {
    1207          36 :                     subkey++;
    1208             :                     Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1209          36 :                     if (subkey->sk_attno != keysz + 1)
    1210          12 :                         break;  /* out-of-sequence, can't use it */
    1211          24 :                     if (subkey->sk_strategy != cur->sk_strategy)
    1212           0 :                         break;  /* wrong direction, can't use it */
    1213          24 :                     if (subkey->sk_flags & SK_ISNULL)
    1214           0 :                         break;  /* can't use null keys */
    1215             :                     Assert(keysz < INDEX_MAX_KEYS);
    1216          24 :                     memcpy(inskey.scankeys + keysz, subkey,
    1217             :                            sizeof(ScanKeyData));
    1218          24 :                     keysz++;
    1219          24 :                     if (subkey->sk_flags & SK_ROW_END)
    1220             :                     {
    1221          24 :                         used_all_subkeys = true;
    1222          24 :                         break;
    1223             :                     }
    1224             :                 }
    1225          36 :                 if (!used_all_subkeys)
    1226             :                 {
    1227          12 :                     switch (strat_total)
    1228             :                     {
    1229           6 :                         case BTLessStrategyNumber:
    1230           6 :                             strat_total = BTLessEqualStrategyNumber;
    1231           6 :                             break;
    1232           6 :                         case BTGreaterStrategyNumber:
    1233           6 :                             strat_total = BTGreaterEqualStrategyNumber;
    1234           6 :                             break;
    1235             :                     }
    1236          24 :                 }
    1237          36 :                 break;          /* done with outer loop */
    1238             :             }
    1239             :         }
    1240             :         else
    1241             :         {
    1242             :             /*
    1243             :              * Ordinary comparison key.  Transform the search-style scan key
    1244             :              * to an insertion scan key by replacing the sk_func with the
    1245             :              * appropriate btree comparison function.
    1246             :              *
    1247             :              * If scankey operator is not a cross-type comparison, we can use
    1248             :              * the cached comparison function; otherwise gotta look it up in
    1249             :              * the catalogs.  (That can't lead to infinite recursion, since no
    1250             :              * indexscan initiated by syscache lookup will use cross-data-type
    1251             :              * operators.)
    1252             :              *
    1253             :              * We support the convention that sk_subtype == InvalidOid means
    1254             :              * the opclass input type; this is a hack to simplify life for
    1255             :              * ScanKeyInit().
    1256             :              */
    1257    22932442 :             if (cur->sk_subtype == rel->rd_opcintype[i] ||
    1258    21989942 :                 cur->sk_subtype == InvalidOid)
    1259    22921724 :             {
    1260             :                 FmgrInfo   *procinfo;
    1261             : 
    1262    22921724 :                 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
    1263    22921724 :                 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
    1264             :                                                cur->sk_flags,
    1265    22921724 :                                                cur->sk_attno,
    1266             :                                                InvalidStrategy,
    1267             :                                                cur->sk_subtype,
    1268             :                                                cur->sk_collation,
    1269             :                                                procinfo,
    1270             :                                                cur->sk_argument);
    1271             :             }
    1272             :             else
    1273             :             {
    1274             :                 RegProcedure cmp_proc;
    1275             : 
    1276       10718 :                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
    1277       10718 :                                              rel->rd_opcintype[i],
    1278             :                                              cur->sk_subtype,
    1279             :                                              BTORDER_PROC);
    1280       10718 :                 if (!RegProcedureIsValid(cmp_proc))
    1281           0 :                     elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
    1282             :                          BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
    1283             :                          cur->sk_attno, RelationGetRelationName(rel));
    1284       10718 :                 ScanKeyEntryInitialize(inskey.scankeys + i,
    1285             :                                        cur->sk_flags,
    1286       10718 :                                        cur->sk_attno,
    1287             :                                        InvalidStrategy,
    1288             :                                        cur->sk_subtype,
    1289             :                                        cur->sk_collation,
    1290             :                                        cmp_proc,
    1291             :                                        cur->sk_argument);
    1292             :             }
    1293             :         }
    1294             :     }
    1295             : 
    1296             :     /*----------
    1297             :      * Examine the selected initial-positioning strategy to determine exactly
    1298             :      * where we need to start the scan, and set flag variables to control the
    1299             :      * initial descent by _bt_search (and our _bt_binsrch call for the leaf
    1300             :      * page _bt_search returns).
    1301             :      *----------
    1302             :      */
    1303    14370374 :     _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
    1304    14370374 :     inskey.anynullkeys = false; /* unused */
    1305    14370374 :     inskey.scantid = NULL;
    1306    14370374 :     inskey.keysz = keysz;
    1307    14370374 :     switch (strat_total)
    1308             :     {
    1309       52534 :         case BTLessStrategyNumber:
    1310             : 
    1311       52534 :             inskey.nextkey = false;
    1312       52534 :             inskey.backward = true;
    1313       52534 :             break;
    1314             : 
    1315          12 :         case BTLessEqualStrategyNumber:
    1316             : 
    1317          12 :             inskey.nextkey = true;
    1318          12 :             inskey.backward = true;
    1319          12 :             break;
    1320             : 
    1321    12881160 :         case BTEqualStrategyNumber:
    1322             : 
    1323             :             /*
    1324             :              * If a backward scan was specified, need to start with last equal
    1325             :              * item not first one.
    1326             :              */
    1327    12881160 :             if (ScanDirectionIsBackward(dir))
    1328             :             {
    1329             :                 /*
    1330             :                  * This is the same as the <= strategy
    1331             :                  */
    1332         158 :                 inskey.nextkey = true;
    1333         158 :                 inskey.backward = true;
    1334             :             }
    1335             :             else
    1336             :             {
    1337             :                 /*
    1338             :                  * This is the same as the >= strategy
    1339             :                  */
    1340    12881002 :                 inskey.nextkey = false;
    1341    12881002 :                 inskey.backward = false;
    1342             :             }
    1343    12881160 :             break;
    1344             : 
    1345        4386 :         case BTGreaterEqualStrategyNumber:
    1346             : 
    1347             :             /*
    1348             :              * Find first item >= scankey
    1349             :              */
    1350        4386 :             inskey.nextkey = false;
    1351        4386 :             inskey.backward = false;
    1352        4386 :             break;
    1353             : 
    1354     1432282 :         case BTGreaterStrategyNumber:
    1355             : 
    1356             :             /*
    1357             :              * Find first item > scankey
    1358             :              */
    1359     1432282 :             inskey.nextkey = true;
    1360     1432282 :             inskey.backward = false;
    1361     1432282 :             break;
    1362             : 
    1363           0 :         default:
    1364             :             /* can't get here, but keep compiler quiet */
    1365           0 :             elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
    1366             :             return false;
    1367             :     }
    1368             : 
    1369             :     /*
    1370             :      * Use the manufactured insertion scan key to descend the tree and
    1371             :      * position ourselves on the target leaf page.
    1372             :      */
    1373             :     Assert(ScanDirectionIsBackward(dir) == inskey.backward);
    1374    14370374 :     stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
    1375             : 
    1376             :     /* don't need to keep the stack around... */
    1377    14370374 :     _bt_freestack(stack);
    1378             : 
    1379    14370374 :     if (!BufferIsValid(so->currPos.buf))
    1380             :     {
    1381             :         /*
    1382             :          * We only get here if the index is completely empty. Lock relation
    1383             :          * because nothing finer to lock exists.  Without a buffer lock, it's
    1384             :          * possible for another transaction to insert data between
    1385             :          * _bt_search() and PredicateLockRelation().  We have to try again
    1386             :          * after taking the relation-level predicate lock, to close a narrow
    1387             :          * window where we wouldn't scan concurrently inserted tuples, but the
    1388             :          * writer wouldn't see our predicate lock.
    1389             :          */
    1390      532982 :         if (IsolationIsSerializable())
    1391             :         {
    1392        5508 :             PredicateLockRelation(rel, scan->xs_snapshot);
    1393        5508 :             stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
    1394        5508 :             _bt_freestack(stack);
    1395             :         }
    1396             : 
    1397      532982 :         if (!BufferIsValid(so->currPos.buf))
    1398             :         {
    1399             :             Assert(!so->needPrimScan);
    1400      532982 :             _bt_parallel_done(scan);
    1401      532982 :             return false;
    1402             :         }
    1403             :     }
    1404             : 
    1405             :     /* position to the precise item on the page */
    1406    13837392 :     offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
    1407             : 
    1408             :     /*
    1409             :      * Now load data from the first page of the scan (usually the page
    1410             :      * currently in so->currPos.buf).
    1411             :      *
    1412             :      * If inskey.nextkey = false and inskey.backward = false, offnum is
    1413             :      * positioned at the first non-pivot tuple >= inskey.scankeys.
    1414             :      *
    1415             :      * If inskey.nextkey = false and inskey.backward = true, offnum is
    1416             :      * positioned at the last non-pivot tuple < inskey.scankeys.
    1417             :      *
    1418             :      * If inskey.nextkey = true and inskey.backward = false, offnum is
    1419             :      * positioned at the first non-pivot tuple > inskey.scankeys.
    1420             :      *
    1421             :      * If inskey.nextkey = true and inskey.backward = true, offnum is
    1422             :      * positioned at the last non-pivot tuple <= inskey.scankeys.
    1423             :      *
    1424             :      * It's possible that _bt_binsrch returned an offnum that is out of bounds
    1425             :      * for the page.  For example, when inskey is both < the leaf page's high
    1426             :      * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
    1427             :      */
    1428    13837392 :     if (!_bt_readfirstpage(scan, offnum, dir))
    1429     3765496 :         return false;
    1430             : 
    1431    10071896 :     _bt_returnitem(scan, so);
    1432    10071896 :     return true;
    1433             : }
    1434             : 
    1435             : /*
    1436             :  *  _bt_next() -- Get the next item in a scan.
    1437             :  *
    1438             :  *      On entry, so->currPos describes the current page, which may be pinned
    1439             :  *      but is not locked, and so->currPos.itemIndex identifies which item was
    1440             :  *      previously returned.
    1441             :  *
    1442             :  *      On success exit, so->currPos is updated as needed, and _bt_returnitem
    1443             :  *      sets the next item to return to the scan.  so->currPos remains valid.
    1444             :  *
    1445             :  *      On failure exit (no more tuples), we invalidate so->currPos.  It'll
    1446             :  *      still be possible for the scan to return tuples by changing direction,
    1447             :  *      though we'll need to call _bt_first anew in that other direction.
    1448             :  */
    1449             : bool
    1450    18146416 : _bt_next(IndexScanDesc scan, ScanDirection dir)
    1451             : {
    1452    18146416 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1453             : 
    1454             :     Assert(BTScanPosIsValid(so->currPos));
    1455             : 
    1456             :     /*
    1457             :      * Advance to next tuple on current page; or if there's no more, try to
    1458             :      * step to the next page with data.
    1459             :      */
    1460    18146416 :     if (ScanDirectionIsForward(dir))
    1461             :     {
    1462    18108058 :         if (++so->currPos.itemIndex > so->currPos.lastItem)
    1463             :         {
    1464     2257838 :             if (!_bt_steppage(scan, dir))
    1465     2228340 :                 return false;
    1466             :         }
    1467             :     }
    1468             :     else
    1469             :     {
    1470       38358 :         if (--so->currPos.itemIndex < so->currPos.firstItem)
    1471             :         {
    1472         126 :             if (!_bt_steppage(scan, dir))
    1473          74 :                 return false;
    1474             :         }
    1475             :     }
    1476             : 
    1477    15918002 :     _bt_returnitem(scan, so);
    1478    15918002 :     return true;
    1479             : }
    1480             : 
    1481             : /*
    1482             :  *  _bt_readpage() -- Load data from current index page into so->currPos
    1483             :  *
    1484             :  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
    1485             :  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
    1486             :  * they are updated as appropriate.  All other fields of so->currPos are
    1487             :  * initialized from scratch here.
    1488             :  *
    1489             :  * We scan the current page starting at offnum and moving in the indicated
    1490             :  * direction.  All items matching the scan keys are loaded into currPos.items.
    1491             :  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
    1492             :  * that there can be no more matching tuples in the current scan direction
    1493             :  * (could just be for the current primitive index scan when scan has arrays).
    1494             :  *
    1495             :  * In the case of a parallel scan, caller must have called _bt_parallel_seize
    1496             :  * prior to calling this function; this function will invoke
    1497             :  * _bt_parallel_release before returning.
    1498             :  *
    1499             :  * Returns true if any matching items found on the page, false if none.
    1500             :  */
    1501             : static bool
    1502    13947848 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
    1503             :              bool firstpage)
    1504             : {
    1505    13947848 :     Relation    rel = scan->indexRelation;
    1506    13947848 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1507             :     Page        page;
    1508             :     BTPageOpaque opaque;
    1509             :     OffsetNumber minoff;
    1510             :     OffsetNumber maxoff;
    1511             :     BTReadPageState pstate;
    1512             :     bool        arrayKeys;
    1513             :     int         itemIndex,
    1514             :                 indnatts;
    1515             : 
    1516             :     /* save the page/buffer block number, along with its sibling links */
    1517    13947848 :     page = BufferGetPage(so->currPos.buf);
    1518    13947848 :     opaque = BTPageGetOpaque(page);
    1519    13947848 :     so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
    1520    13947848 :     so->currPos.prevPage = opaque->btpo_prev;
    1521    13947848 :     so->currPos.nextPage = opaque->btpo_next;
    1522             : 
    1523             :     Assert(!P_IGNORE(opaque));
    1524             :     Assert(BTScanPosIsPinned(so->currPos));
    1525             :     Assert(!so->needPrimScan);
    1526             : 
    1527    13947848 :     if (scan->parallel_scan)
    1528             :     {
    1529             :         /* allow next/prev page to be read by other worker without delay */
    1530        1336 :         if (ScanDirectionIsForward(dir))
    1531        1336 :             _bt_parallel_release(scan, so->currPos.nextPage,
    1532             :                                  so->currPos.currPage);
    1533             :         else
    1534           0 :             _bt_parallel_release(scan, so->currPos.prevPage,
    1535             :                                  so->currPos.currPage);
    1536             :     }
    1537             : 
    1538             :     /* initialize remaining currPos fields related to current page */
    1539    13947848 :     so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
    1540    13947848 :     so->currPos.dir = dir;
    1541    13947848 :     so->currPos.nextTupleOffset = 0;
    1542             :     /* either moreLeft or moreRight should be set now (may be unset later) */
    1543             :     Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
    1544             :            so->currPos.moreLeft);
    1545             : 
    1546    13947848 :     PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
    1547             : 
    1548             :     /* initialize local variables */
    1549    13947848 :     indnatts = IndexRelationGetNumberOfAttributes(rel);
    1550    13947848 :     arrayKeys = so->numArrayKeys != 0;
    1551    13947848 :     minoff = P_FIRSTDATAKEY(opaque);
    1552    13947848 :     maxoff = PageGetMaxOffsetNumber(page);
    1553             : 
    1554             :     /* initialize page-level state that we'll pass to _bt_checkkeys */
    1555    13947848 :     pstate.minoff = minoff;
    1556    13947848 :     pstate.maxoff = maxoff;
    1557    13947848 :     pstate.finaltup = NULL;
    1558    13947848 :     pstate.page = page;
    1559    13947848 :     pstate.firstpage = firstpage;
    1560    13947848 :     pstate.offnum = InvalidOffsetNumber;
    1561    13947848 :     pstate.skip = InvalidOffsetNumber;
    1562    13947848 :     pstate.continuescan = true; /* default assumption */
    1563    13947848 :     pstate.prechecked = false;
    1564    13947848 :     pstate.firstmatch = false;
    1565    13947848 :     pstate.rechecks = 0;
    1566    13947848 :     pstate.targetdistance = 0;
    1567             : 
    1568             :     /*
    1569             :      * Prechecking the value of the continuescan flag for the last item on the
    1570             :      * page (for backwards scan it will be the first item on a page).  If we
    1571             :      * observe it to be true, then it should be true for all other items. This
    1572             :      * allows us to do significant optimizations in the _bt_checkkeys()
    1573             :      * function for all the items on the page.
    1574             :      *
    1575             :      * With the forward scan, we do this check for the last item on the page
    1576             :      * instead of the high key.  It's relatively likely that the most
    1577             :      * significant column in the high key will be different from the
    1578             :      * corresponding value from the last item on the page.  So checking with
    1579             :      * the last item on the page would give a more precise answer.
    1580             :      *
    1581             :      * We skip this for the first page read by each (primitive) scan, to avoid
    1582             :      * slowing down point queries.  They typically don't stand to gain much
    1583             :      * when the optimization can be applied, and are more likely to notice the
    1584             :      * overhead of the precheck.
    1585             :      *
    1586             :      * The optimization is unsafe and must be avoided whenever _bt_checkkeys
    1587             :      * just set a low-order required array's key to the best available match
    1588             :      * for a truncated -inf attribute value from the prior page's high key
    1589             :      * (array element 0 is always the best available match in this scenario).
    1590             :      * It's quite likely that matches for array element 0 begin on this page,
    1591             :      * but the start of matches won't necessarily align with page boundaries.
    1592             :      * When the start of matches is somewhere in the middle of this page, it
    1593             :      * would be wrong to treat page's final non-pivot tuple as representative.
    1594             :      * Doing so might lead us to treat some of the page's earlier tuples as
    1595             :      * being part of a group of tuples thought to satisfy the required keys.
    1596             :      *
    1597             :      * Note: Conversely, in the case where the scan's arrays just advanced
    1598             :      * using the prior page's HIKEY _without_ advancement setting scanBehind,
    1599             :      * the start of matches must be aligned with page boundaries, which makes
    1600             :      * it safe to attempt the optimization here now.  It's also safe when the
    1601             :      * prior page's HIKEY simply didn't need to advance any required array. In
    1602             :      * both cases we can safely assume that the _first_ tuple from this page
    1603             :      * must be >= the current set of array keys/equality constraints. And so
    1604             :      * if the final tuple is == those same keys (and also satisfies any
    1605             :      * required < or <= strategy scan keys) during the precheck, we can safely
    1606             :      * assume that this must also be true of all earlier tuples from the page.
    1607             :      */
    1608    13947848 :     if (!pstate.firstpage && !so->scanBehind && minoff < maxoff)
    1609             :     {
    1610             :         ItemId      iid;
    1611             :         IndexTuple  itup;
    1612             : 
    1613       31444 :         iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
    1614       31444 :         itup = (IndexTuple) PageGetItem(page, iid);
    1615             : 
    1616             :         /* Call with arrayKeys=false to avoid undesirable side-effects */
    1617       31444 :         _bt_checkkeys(scan, &pstate, false, itup, indnatts);
    1618       31444 :         pstate.prechecked = pstate.continuescan;
    1619       31444 :         pstate.continuescan = true; /* reset */
    1620             :     }
    1621             : 
    1622    13947848 :     if (ScanDirectionIsForward(dir))
    1623             :     {
    1624             :         /* SK_SEARCHARRAY forward scans must provide high key up front */
    1625    13894938 :         if (arrayKeys)
    1626             :         {
    1627       85218 :             if (!P_RIGHTMOST(opaque))
    1628             :             {
    1629       25284 :                 ItemId      iid = PageGetItemId(page, P_HIKEY);
    1630             : 
    1631       25284 :                 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
    1632             : 
    1633       25284 :                 if (so->scanBehind &&
    1634         360 :                     !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
    1635             :                 {
    1636             :                     /* Schedule another primitive index scan after all */
    1637         342 :                     so->currPos.moreRight = false;
    1638         342 :                     so->needPrimScan = true;
    1639         342 :                     if (scan->parallel_scan)
    1640           0 :                         _bt_parallel_primscan_schedule(scan,
    1641             :                                                        so->currPos.currPage);
    1642         342 :                     return false;
    1643             :                 }
    1644             :             }
    1645             : 
    1646       84876 :             so->scanBehind = so->oppositeDirCheck = false;    /* reset */
    1647             :         }
    1648             : 
    1649             :         /* load items[] in ascending order */
    1650    13894596 :         itemIndex = 0;
    1651             : 
    1652    13894596 :         offnum = Max(offnum, minoff);
    1653             : 
    1654    55319290 :         while (offnum <= maxoff)
    1655             :         {
    1656    52217748 :             ItemId      iid = PageGetItemId(page, offnum);
    1657             :             IndexTuple  itup;
    1658             :             bool        passes_quals;
    1659             : 
    1660             :             /*
    1661             :              * If the scan specifies not to return killed tuples, then we
    1662             :              * treat a killed tuple as not passing the qual
    1663             :              */
    1664    52217748 :             if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    1665             :             {
    1666     4323184 :                 offnum = OffsetNumberNext(offnum);
    1667     4323184 :                 continue;
    1668             :             }
    1669             : 
    1670    47894564 :             itup = (IndexTuple) PageGetItem(page, iid);
    1671             :             Assert(!BTreeTupleIsPivot(itup));
    1672             : 
    1673    47894564 :             pstate.offnum = offnum;
    1674    47894564 :             passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
    1675             :                                          itup, indnatts);
    1676             : 
    1677             :             /*
    1678             :              * Check if we need to skip ahead to a later tuple (only possible
    1679             :              * when the scan uses array keys)
    1680             :              */
    1681    47894564 :             if (arrayKeys && OffsetNumberIsValid(pstate.skip))
    1682             :             {
    1683             :                 Assert(!passes_quals && pstate.continuescan);
    1684             :                 Assert(offnum < pstate.skip);
    1685             : 
    1686        1642 :                 offnum = pstate.skip;
    1687        1642 :                 pstate.skip = InvalidOffsetNumber;
    1688        1642 :                 continue;
    1689             :             }
    1690             : 
    1691    47892922 :             if (passes_quals)
    1692             :             {
    1693             :                 /* tuple passes all scan key conditions */
    1694    36613278 :                 pstate.firstmatch = true;
    1695    36613278 :                 if (!BTreeTupleIsPosting(itup))
    1696             :                 {
    1697             :                     /* Remember it */
    1698    36171166 :                     _bt_saveitem(so, itemIndex, offnum, itup);
    1699    36171166 :                     itemIndex++;
    1700             :                 }
    1701             :                 else
    1702             :                 {
    1703             :                     int         tupleOffset;
    1704             : 
    1705             :                     /*
    1706             :                      * Set up state to return posting list, and remember first
    1707             :                      * TID
    1708             :                      */
    1709             :                     tupleOffset =
    1710      442112 :                         _bt_setuppostingitems(so, itemIndex, offnum,
    1711             :                                               BTreeTupleGetPostingN(itup, 0),
    1712             :                                               itup);
    1713      442112 :                     itemIndex++;
    1714             :                     /* Remember additional TIDs */
    1715     2757304 :                     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
    1716             :                     {
    1717     2315192 :                         _bt_savepostingitem(so, itemIndex, offnum,
    1718             :                                             BTreeTupleGetPostingN(itup, i),
    1719             :                                             tupleOffset);
    1720     2315192 :                         itemIndex++;
    1721             :                     }
    1722             :                 }
    1723             :             }
    1724             :             /* When !continuescan, there can't be any more matches, so stop */
    1725    47892922 :             if (!pstate.continuescan)
    1726    10793054 :                 break;
    1727             : 
    1728    37099868 :             offnum = OffsetNumberNext(offnum);
    1729             :         }
    1730             : 
    1731             :         /*
    1732             :          * We don't need to visit page to the right when the high key
    1733             :          * indicates that no more matches will be found there.
    1734             :          *
    1735             :          * Checking the high key like this works out more often than you might
    1736             :          * think.  Leaf page splits pick a split point between the two most
    1737             :          * dissimilar tuples (this is weighed against the need to evenly share
    1738             :          * free space).  Leaf pages with high key attribute values that can
    1739             :          * only appear on non-pivot tuples on the right sibling page are
    1740             :          * common.
    1741             :          */
    1742    13894596 :         if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
    1743             :         {
    1744      145094 :             ItemId      iid = PageGetItemId(page, P_HIKEY);
    1745      145094 :             IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);
    1746             :             int         truncatt;
    1747             : 
    1748      145094 :             truncatt = BTreeTupleGetNAtts(itup, rel);
    1749      145094 :             pstate.prechecked = false;  /* precheck didn't cover HIKEY */
    1750      145094 :             _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
    1751             :         }
    1752             : 
    1753    13894596 :         if (!pstate.continuescan)
    1754    10886202 :             so->currPos.moreRight = false;
    1755             : 
    1756             :         Assert(itemIndex <= MaxTIDsPerBTreePage);
    1757    13894596 :         so->currPos.firstItem = 0;
    1758    13894596 :         so->currPos.lastItem = itemIndex - 1;
    1759    13894596 :         so->currPos.itemIndex = 0;
    1760             :     }
    1761             :     else
    1762             :     {
    1763             :         /* SK_SEARCHARRAY backward scans must provide final tuple up front */
    1764       52910 :         if (arrayKeys)
    1765             :         {
    1766          30 :             if (minoff <= maxoff && !P_LEFTMOST(opaque))
    1767             :             {
    1768          24 :                 ItemId      iid = PageGetItemId(page, minoff);
    1769             : 
    1770          24 :                 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
    1771             : 
    1772          24 :                 if (so->scanBehind &&
    1773           6 :                     !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
    1774             :                 {
    1775             :                     /* Schedule another primitive index scan after all */
    1776           0 :                     so->currPos.moreLeft = false;
    1777           0 :                     so->needPrimScan = true;
    1778           0 :                     if (scan->parallel_scan)
    1779           0 :                         _bt_parallel_primscan_schedule(scan,
    1780             :                                                        so->currPos.currPage);
    1781           0 :                     return false;
    1782             :                 }
    1783             :             }
    1784             : 
    1785          30 :             so->scanBehind = so->oppositeDirCheck = false;    /* reset */
    1786             :         }
    1787             : 
    1788             :         /* load items[] in descending order */
    1789       52910 :         itemIndex = MaxTIDsPerBTreePage;
    1790             : 
    1791       52910 :         offnum = Min(offnum, maxoff);
    1792             : 
    1793     8770246 :         while (offnum >= minoff)
    1794             :         {
    1795     8717432 :             ItemId      iid = PageGetItemId(page, offnum);
    1796             :             IndexTuple  itup;
    1797             :             bool        tuple_alive;
    1798             :             bool        passes_quals;
    1799             : 
    1800             :             /*
    1801             :              * If the scan specifies not to return killed tuples, then we
    1802             :              * treat a killed tuple as not passing the qual.  Most of the
    1803             :              * time, it's a win to not bother examining the tuple's index
    1804             :              * keys, but just skip to the next tuple (previous, actually,
    1805             :              * since we're scanning backwards).  However, if this is the first
    1806             :              * tuple on the page, we do check the index keys, to prevent
    1807             :              * uselessly advancing to the page to the left.  This is similar
    1808             :              * to the high key optimization used by forward scans.
    1809             :              */
    1810     8717432 :             if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    1811             :             {
    1812      392268 :                 if (offnum > minoff)
    1813             :                 {
    1814      391468 :                     offnum = OffsetNumberPrev(offnum);
    1815      391468 :                     continue;
    1816             :                 }
    1817             : 
    1818         800 :                 tuple_alive = false;
    1819             :             }
    1820             :             else
    1821     8325164 :                 tuple_alive = true;
    1822             : 
    1823     8325964 :             itup = (IndexTuple) PageGetItem(page, iid);
    1824             :             Assert(!BTreeTupleIsPivot(itup));
    1825             : 
    1826     8325964 :             pstate.offnum = offnum;
    1827     8325964 :             passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
    1828             :                                          itup, indnatts);
    1829             : 
    1830             :             /*
    1831             :              * Check if we need to skip ahead to a later tuple (only possible
    1832             :              * when the scan uses array keys)
    1833             :              */
    1834     8325964 :             if (arrayKeys && OffsetNumberIsValid(pstate.skip))
    1835             :             {
    1836             :                 Assert(!passes_quals && pstate.continuescan);
    1837             :                 Assert(offnum > pstate.skip);
    1838             : 
    1839          18 :                 offnum = pstate.skip;
    1840          18 :                 pstate.skip = InvalidOffsetNumber;
    1841          18 :                 continue;
    1842             :             }
    1843             : 
    1844     8325946 :             if (passes_quals && tuple_alive)
    1845             :             {
    1846             :                 /* tuple passes all scan key conditions */
    1847     8324894 :                 pstate.firstmatch = true;
    1848     8324894 :                 if (!BTreeTupleIsPosting(itup))
    1849             :                 {
    1850             :                     /* Remember it */
    1851     8282568 :                     itemIndex--;
    1852     8282568 :                     _bt_saveitem(so, itemIndex, offnum, itup);
    1853             :                 }
    1854             :                 else
    1855             :                 {
    1856             :                     int         tupleOffset;
    1857             : 
    1858             :                     /*
    1859             :                      * Set up state to return posting list, and remember first
    1860             :                      * TID.
    1861             :                      *
    1862             :                      * Note that we deliberately save/return items from
    1863             :                      * posting lists in ascending heap TID order for backwards
    1864             :                      * scans.  This allows _bt_killitems() to make a
    1865             :                      * consistent assumption about the order of items
    1866             :                      * associated with the same posting list tuple.
    1867             :                      */
    1868       42326 :                     itemIndex--;
    1869             :                     tupleOffset =
    1870       42326 :                         _bt_setuppostingitems(so, itemIndex, offnum,
    1871             :                                               BTreeTupleGetPostingN(itup, 0),
    1872             :                                               itup);
    1873             :                     /* Remember additional TIDs */
    1874      169984 :                     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
    1875             :                     {
    1876      127658 :                         itemIndex--;
    1877      127658 :                         _bt_savepostingitem(so, itemIndex, offnum,
    1878             :                                             BTreeTupleGetPostingN(itup, i),
    1879             :                                             tupleOffset);
    1880             :                     }
    1881             :                 }
    1882             :             }
    1883             :             /* When !continuescan, there can't be any more matches, so stop */
    1884     8325946 :             if (!pstate.continuescan)
    1885          96 :                 break;
    1886             : 
    1887     8325850 :             offnum = OffsetNumberPrev(offnum);
    1888             :         }
    1889             : 
    1890             :         /*
    1891             :          * We don't need to visit page to the left when no more matches will
    1892             :          * be found there
    1893             :          */
    1894       52910 :         if (!pstate.continuescan)
    1895          96 :             so->currPos.moreLeft = false;
    1896             : 
    1897             :         Assert(itemIndex >= 0);
    1898       52910 :         so->currPos.firstItem = itemIndex;
    1899       52910 :         so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
    1900       52910 :         so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
    1901             :     }
    1902             : 
    1903    13947506 :     return (so->currPos.firstItem <= so->currPos.lastItem);
    1904             : }
    1905             : 
    1906             : /* Save an index item into so->currPos.items[itemIndex] */
    1907             : static void
    1908    44453734 : _bt_saveitem(BTScanOpaque so, int itemIndex,
    1909             :              OffsetNumber offnum, IndexTuple itup)
    1910             : {
    1911    44453734 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1912             : 
    1913             :     Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
    1914             : 
    1915    44453734 :     currItem->heapTid = itup->t_tid;
    1916    44453734 :     currItem->indexOffset = offnum;
    1917    44453734 :     if (so->currTuples)
    1918             :     {
    1919    21912366 :         Size        itupsz = IndexTupleSize(itup);
    1920             : 
    1921    21912366 :         currItem->tupleOffset = so->currPos.nextTupleOffset;
    1922    21912366 :         memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
    1923    21912366 :         so->currPos.nextTupleOffset += MAXALIGN(itupsz);
    1924             :     }
    1925    44453734 : }
    1926             : 
    1927             : /*
    1928             :  * Setup state to save TIDs/items from a single posting list tuple.
    1929             :  *
    1930             :  * Saves an index item into so->currPos.items[itemIndex] for TID that is
    1931             :  * returned to scan first.  Second or subsequent TIDs for posting list should
    1932             :  * be saved by calling _bt_savepostingitem().
    1933             :  *
    1934             :  * Returns an offset into tuple storage space that main tuple is stored at if
    1935             :  * needed.
    1936             :  */
    1937             : static int
    1938      484438 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
    1939             :                       ItemPointer heapTid, IndexTuple itup)
    1940             : {
    1941      484438 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1942             : 
    1943             :     Assert(BTreeTupleIsPosting(itup));
    1944             : 
    1945      484438 :     currItem->heapTid = *heapTid;
    1946      484438 :     currItem->indexOffset = offnum;
    1947      484438 :     if (so->currTuples)
    1948             :     {
    1949             :         /* Save base IndexTuple (truncate posting list) */
    1950             :         IndexTuple  base;
    1951      152656 :         Size        itupsz = BTreeTupleGetPostingOffset(itup);
    1952             : 
    1953      152656 :         itupsz = MAXALIGN(itupsz);
    1954      152656 :         currItem->tupleOffset = so->currPos.nextTupleOffset;
    1955      152656 :         base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
    1956      152656 :         memcpy(base, itup, itupsz);
    1957             :         /* Defensively reduce work area index tuple header size */
    1958      152656 :         base->t_info &= ~INDEX_SIZE_MASK;
    1959      152656 :         base->t_info |= itupsz;
    1960      152656 :         so->currPos.nextTupleOffset += itupsz;
    1961             : 
    1962      152656 :         return currItem->tupleOffset;
    1963             :     }
    1964             : 
    1965      331782 :     return 0;
    1966             : }
    1967             : 
    1968             : /*
    1969             :  * Save an index item into so->currPos.items[itemIndex] for current posting
    1970             :  * tuple.
    1971             :  *
    1972             :  * Assumes that _bt_setuppostingitems() has already been called for current
    1973             :  * posting list tuple.  Caller passes its return value as tupleOffset.
    1974             :  */
    1975             : static inline void
    1976     2442850 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
    1977             :                     ItemPointer heapTid, int tupleOffset)
    1978             : {
    1979     2442850 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1980             : 
    1981     2442850 :     currItem->heapTid = *heapTid;
    1982     2442850 :     currItem->indexOffset = offnum;
    1983             : 
    1984             :     /*
    1985             :      * Have index-only scans return the same base IndexTuple for every TID
    1986             :      * that originates from the same posting list
    1987             :      */
    1988     2442850 :     if (so->currTuples)
    1989      930244 :         currItem->tupleOffset = tupleOffset;
    1990     2442850 : }
    1991             : 
    1992             : /*
    1993             :  * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
    1994             :  * index scan by setting the relevant fields in caller's index scan descriptor
    1995             :  */
    1996             : static inline void
    1997    26066804 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
    1998             : {
    1999    26066804 :     BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
    2000             : 
    2001             :     /* Most recent _bt_readpage must have succeeded */
    2002             :     Assert(BTScanPosIsValid(so->currPos));
    2003             :     Assert(so->currPos.itemIndex >= so->currPos.firstItem);
    2004             :     Assert(so->currPos.itemIndex <= so->currPos.lastItem);
    2005             : 
    2006             :     /* Return next item, per amgettuple contract */
    2007    26066804 :     scan->xs_heaptid = currItem->heapTid;
    2008    26066804 :     if (so->currTuples)
    2009     4386228 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    2010    26066804 : }
    2011             : 
    2012             : /*
    2013             :  *  _bt_steppage() -- Step to next page containing valid data for scan
    2014             :  *
    2015             :  * Wrapper on _bt_readnextpage that performs final steps for the current page.
    2016             :  *
    2017             :  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
    2018             :  * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
    2019             :  * the pin eagerly earlier on.  The scan must have so->currPos.currPage set to
    2020             :  * a valid block, in any case.
    2021             :  */
    2022             : static bool
    2023     6025384 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
    2024             : {
    2025     6025384 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2026             :     BlockNumber blkno,
    2027             :                 lastcurrblkno;
    2028             : 
    2029             :     Assert(BTScanPosIsValid(so->currPos));
    2030             : 
    2031             :     /* Before leaving current page, deal with any killed items */
    2032     6025384 :     if (so->numKilled > 0)
    2033       79212 :         _bt_killitems(scan);
    2034             : 
    2035             :     /*
    2036             :      * Before we modify currPos, make a copy of the page data if there was a
    2037             :      * mark position that needs it.
    2038             :      */
    2039     6025384 :     if (so->markItemIndex >= 0)
    2040             :     {
    2041             :         /* bump pin on current buffer for assignment to mark buffer */
    2042         362 :         if (BTScanPosIsPinned(so->currPos))
    2043         348 :             IncrBufferRefCount(so->currPos.buf);
    2044         362 :         memcpy(&so->markPos, &so->currPos,
    2045             :                offsetof(BTScanPosData, items[1]) +
    2046         362 :                so->currPos.lastItem * sizeof(BTScanPosItem));
    2047         362 :         if (so->markTuples)
    2048         348 :             memcpy(so->markTuples, so->currTuples,
    2049         348 :                    so->currPos.nextTupleOffset);
    2050         362 :         so->markPos.itemIndex = so->markItemIndex;
    2051         362 :         so->markItemIndex = -1;
    2052             : 
    2053             :         /*
    2054             :          * If we're just about to start the next primitive index scan
    2055             :          * (possible with a scan that has arrays keys, and needs to skip to
    2056             :          * continue in the current scan direction), moreLeft/moreRight only
    2057             :          * indicate the end of the current primitive index scan.  They must
    2058             :          * never be taken to indicate that the top-level index scan has ended
    2059             :          * (that would be wrong).
    2060             :          *
    2061             :          * We could handle this case by treating the current array keys as
    2062             :          * markPos state.  But depending on the current array state like this
    2063             :          * would add complexity.  Instead, we just unset markPos's copy of
    2064             :          * moreRight or moreLeft (whichever might be affected), while making
    2065             :          * btrestrpos reset the scan's arrays to their initial scan positions.
    2066             :          * In effect, btrestrpos leaves advancing the arrays up to the first
    2067             :          * _bt_readpage call (that takes place after it has restored markPos).
    2068             :          */
    2069         362 :         if (so->needPrimScan)
    2070             :         {
    2071           0 :             if (ScanDirectionIsForward(so->currPos.dir))
    2072           0 :                 so->markPos.moreRight = true;
    2073             :             else
    2074           0 :                 so->markPos.moreLeft = true;
    2075             :         }
    2076             : 
    2077             :         /* mark/restore not supported by parallel scans */
    2078             :         Assert(!scan->parallel_scan);
    2079             :     }
    2080             : 
    2081     6025384 :     BTScanPosUnpinIfPinned(so->currPos);
    2082             : 
    2083             :     /* Walk to the next page with data */
    2084     6025384 :     if (ScanDirectionIsForward(dir))
    2085     6025146 :         blkno = so->currPos.nextPage;
    2086             :     else
    2087         238 :         blkno = so->currPos.prevPage;
    2088     6025384 :     lastcurrblkno = so->currPos.currPage;
    2089             : 
    2090             :     /*
    2091             :      * Cancel primitive index scans that were scheduled when the call to
    2092             :      * _bt_readpage for currPos happened to use the opposite direction to the
    2093             :      * one that we're stepping in now.  (It's okay to leave the scan's array
    2094             :      * keys as-is, since the next _bt_readpage will advance them.)
    2095             :      */
    2096     6025384 :     if (so->currPos.dir != dir)
    2097          36 :         so->needPrimScan = false;
    2098             : 
    2099     6025384 :     return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
    2100             : }
    2101             : 
    2102             : /*
    2103             :  *  _bt_readfirstpage() -- Read first page containing valid data for _bt_first
    2104             :  *
    2105             :  * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
    2106             :  * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
    2107             :  * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
    2108             :  * its scan key was marked required), so _bt_first _must_ pass us an offnum
    2109             :  * exactly at the beginning of where equal tuples are to be found.  When we're
    2110             :  * passed an offnum past the end of the page, we might still manage to stop
    2111             :  * the scan on this page by calling _bt_checkkeys against the high key.  See
    2112             :  * _bt_readpage for full details.
    2113             :  *
    2114             :  * On entry, so->currPos must be pinned and locked (so offnum stays valid).
    2115             :  * Parallel scan callers must have seized the scan before calling here.
    2116             :  *
    2117             :  * On exit, we'll have updated so->currPos and retained locks and pins
    2118             :  * according to the same rules as those laid out for _bt_readnextpage exit.
    2119             :  * Like _bt_readnextpage, our return value indicates if there are any matching
    2120             :  * records in the given direction.
    2121             :  *
    2122             :  * We always release the scan for a parallel scan caller, regardless of
    2123             :  * success or failure; we'll call _bt_parallel_release as soon as possible.
    2124             :  */
    2125             : static bool
    2126    13916012 : _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
    2127             : {
    2128    13916012 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2129             : 
    2130    13916012 :     so->numKilled = 0;           /* just paranoia */
    2131    13916012 :     so->markItemIndex = -1;      /* ditto */
    2132             : 
    2133             :     /* Initialize so->currPos for the first page (page in so->currPos.buf) */
    2134    13916012 :     if (so->needPrimScan)
    2135             :     {
    2136             :         Assert(so->numArrayKeys);
    2137             : 
    2138       17240 :         so->currPos.moreLeft = true;
    2139       17240 :         so->currPos.moreRight = true;
    2140       17240 :         so->needPrimScan = false;
    2141             :     }
    2142    13898772 :     else if (ScanDirectionIsForward(dir))
    2143             :     {
    2144    13846026 :         so->currPos.moreLeft = false;
    2145    13846026 :         so->currPos.moreRight = true;
    2146             :     }
    2147             :     else
    2148             :     {
    2149       52746 :         so->currPos.moreLeft = true;
    2150       52746 :         so->currPos.moreRight = false;
    2151             :     }
    2152             : 
    2153             :     /*
    2154             :      * Attempt to load matching tuples from the first page.
    2155             :      *
    2156             :      * Note that _bt_readpage will finish initializing the so->currPos fields.
    2157             :      * _bt_readpage also releases parallel scan (even when it returns false).
    2158             :      */
    2159    13916012 :     if (_bt_readpage(scan, dir, offnum, true))
    2160             :     {
    2161             :         /*
    2162             :          * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
    2163             :          * so->currPos.buf in preparation for btgettuple returning tuples.
    2164             :          */
    2165             :         Assert(BTScanPosIsPinned(so->currPos));
    2166    10148592 :         _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    2167    10148592 :         return true;
    2168             :     }
    2169             : 
    2170             :     /* There's no actually-matching data on the page in so->currPos.buf */
    2171     3767420 :     _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
    2172             : 
    2173             :     /* Call _bt_readnextpage using its _bt_steppage wrapper function */
    2174     3767420 :     if (!_bt_steppage(scan, dir))
    2175     3767218 :         return false;
    2176             : 
    2177             :     /* _bt_readpage for a later page (now in so->currPos) succeeded */
    2178         202 :     return true;
    2179             : }
    2180             : 
    2181             : /*
    2182             :  *  _bt_readnextpage() -- Read next page containing valid data for _bt_next
    2183             :  *
    2184             :  * Caller's blkno is the next interesting page's link, taken from either the
    2185             :  * previously-saved right link or left link.  lastcurrblkno is the page that
    2186             :  * was current at the point where the blkno link was saved, which we use to
    2187             :  * reason about concurrent page splits/page deletions during backwards scans.
    2188             :  *
    2189             :  * On entry, caller shouldn't hold any locks or pins on any page (we work
    2190             :  * directly off of blkno and lastcurrblkno instead).  Parallel scan callers
    2191             :  * that seized the scan before calling here should pass seized=true; such a
    2192             :  * caller's blkno and lastcurrblkno arguments come from the seized scan.
    2193             :  * seized=false callers just pass us the blkno/lastcurrblkno taken from their
    2194             :  * so->currPos, which (along with so->currPos itself) can be used to end the
    2195             :  * scan.  A seized=false caller's blkno can never be assumed to be the page
    2196             :  * that must be read next during a parallel scan, though.  We must figure that
    2197             :  * part out for ourselves by seizing the scan (the correct page to read might
    2198             :  * already be beyond the seized=false caller's blkno during a parallel scan,
    2199             :  * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
    2200             :  * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
    2201             :  *
    2202             :  * On success exit, so->currPos is updated to contain data from the next
    2203             :  * interesting page, and we return true.  We hold a pin on the buffer on
    2204             :  * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
    2205             :  * to eagerly drop the pin (to avoid blocking VACUUM).
    2206             :  *
    2207             :  * If there are no more matching records in the given direction, we drop all
    2208             :  * locks and pins, invalidate so->currPos, and return false.
    2209             :  *
    2210             :  * We always release the scan for a parallel scan caller, regardless of
    2211             :  * success or failure; we'll call _bt_parallel_release as soon as possible.
    2212             :  */
    2213             : static bool
    2214     6025392 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
    2215             :                  BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
    2216             : {
    2217     6025392 :     Relation    rel = scan->indexRelation;
    2218     6025392 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2219             : 
    2220             :     Assert(so->currPos.currPage == lastcurrblkno || seized);
    2221             :     Assert(!(blkno == P_NONE && seized));
    2222             :     Assert(!BTScanPosIsPinned(so->currPos));
    2223             : 
    2224             :     /*
    2225             :      * Remember that the scan already read lastcurrblkno, a page to the left
    2226             :      * of blkno (or remember reading a page to the right, for backwards scans)
    2227             :      */
    2228     6025392 :     if (ScanDirectionIsForward(dir))
    2229     6025154 :         so->currPos.moreLeft = true;
    2230             :     else
    2231         238 :         so->currPos.moreRight = true;
    2232             : 
    2233             :     for (;;)
    2234        2076 :     {
    2235             :         Page        page;
    2236             :         BTPageOpaque opaque;
    2237             : 
    2238     6027468 :         if (blkno == P_NONE ||
    2239             :             (ScanDirectionIsForward(dir) ?
    2240     1982558 :              !so->currPos.moreRight : !so->currPos.moreLeft))
    2241             :         {
    2242             :             /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
    2243             :             Assert(so->currPos.currPage == lastcurrblkno && !seized);
    2244     5995624 :             BTScanPosInvalidate(so->currPos);
    2245     5995624 :             _bt_parallel_done(scan);    /* iff !so->needPrimScan */
    2246     5995624 :             return false;
    2247             :         }
    2248             : 
    2249             :         Assert(!so->needPrimScan);
    2250             : 
    2251             :         /* parallel scan must never actually visit so->currPos blkno */
    2252       31844 :         if (!seized && scan->parallel_scan != NULL &&
    2253        1212 :             !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
    2254             :         {
    2255             :             /* whole scan is now done (or another primitive scan required) */
    2256           8 :             BTScanPosInvalidate(so->currPos);
    2257           8 :             return false;
    2258             :         }
    2259             : 
    2260       31836 :         if (ScanDirectionIsForward(dir))
    2261             :         {
    2262             :             /* read blkno, but check for interrupts first */
    2263       31672 :             CHECK_FOR_INTERRUPTS();
    2264       31672 :             so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    2265             :         }
    2266             :         else
    2267             :         {
    2268             :             /* read blkno, avoiding race (also checks for interrupts) */
    2269         164 :             so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
    2270             :                                                          lastcurrblkno);
    2271         164 :             if (so->currPos.buf == InvalidBuffer)
    2272             :             {
    2273             :                 /* must have been a concurrent deletion of leftmost page */
    2274           0 :                 BTScanPosInvalidate(so->currPos);
    2275           0 :                 _bt_parallel_done(scan);
    2276           0 :                 return false;
    2277             :             }
    2278             :         }
    2279             : 
    2280       31836 :         page = BufferGetPage(so->currPos.buf);
    2281       31836 :         opaque = BTPageGetOpaque(page);
    2282       31836 :         lastcurrblkno = blkno;
    2283       31836 :         if (likely(!P_IGNORE(opaque)))
    2284             :         {
    2285             :             /* see if there are any matches on this page */
    2286       31836 :             if (ScanDirectionIsForward(dir))
    2287             :             {
    2288             :                 /* note that this will clear moreRight if we can stop */
    2289       31672 :                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
    2290       29608 :                     break;
    2291        2064 :                 blkno = so->currPos.nextPage;
    2292             :             }
    2293             :             else
    2294             :             {
    2295             :                 /* note that this will clear moreLeft if we can stop */
    2296         164 :                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
    2297         152 :                     break;
    2298          12 :                 blkno = so->currPos.prevPage;
    2299             :             }
    2300             :         }
    2301             :         else
    2302             :         {
    2303             :             /* _bt_readpage not called, so do all this for ourselves */
    2304           0 :             if (ScanDirectionIsForward(dir))
    2305           0 :                 blkno = opaque->btpo_next;
    2306             :             else
    2307           0 :                 blkno = opaque->btpo_prev;
    2308           0 :             if (scan->parallel_scan != NULL)
    2309           0 :                 _bt_parallel_release(scan, blkno, lastcurrblkno);
    2310             :         }
    2311             : 
    2312             :         /* no matching tuples on this page */
    2313        2076 :         _bt_relbuf(rel, so->currPos.buf);
    2314        2076 :         seized = false;         /* released by _bt_readpage (or by us) */
    2315             :     }
    2316             : 
    2317             :     /*
    2318             :      * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
    2319             :      * so->currPos.buf in preparation for btgettuple returning tuples.
    2320             :      */
    2321             :     Assert(so->currPos.currPage == blkno);
    2322             :     Assert(BTScanPosIsPinned(so->currPos));
    2323       29760 :     _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    2324             : 
    2325       29760 :     return true;
    2326             : }
    2327             : 
    2328             : /*
    2329             :  * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
    2330             :  * recovering from concurrent page splits/page deletions when necessary
    2331             :  *
    2332             :  * Called during backwards scans, to deal with their unique concurrency rules.
    2333             :  *
    2334             :  * blkno points to the block number of the page that we expect to move the
    2335             :  * scan to.  We'll successfully move the scan there when we find that its
    2336             :  * right sibling link still points to lastcurrblkno (the page we just read).
    2337             :  * Otherwise, we have to figure out which page is the correct one for the scan
    2338             :  * to now read the hard way, reasoning about concurrent splits and deletions.
    2339             :  * See nbtree/README.
    2340             :  *
    2341             :  * On return, we have both a pin and a read lock on the returned page, whose
    2342             :  * block number will be set in *blkno.  Returns InvalidBuffer if there is no
    2343             :  * page to the left (no lock or pin is held in that case).
    2344             :  *
    2345             :  * It is possible for the returned leaf page to be half-dead; caller must
    2346             :  * check that condition and step left again when required.
    2347             :  */
    2348             : static Buffer
    2349         164 : _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
    2350             :                            BlockNumber lastcurrblkno)
    2351             : {
    2352         164 :     BlockNumber origblkno = *blkno; /* detects circular links */
    2353             : 
    2354             :     for (;;)
    2355           0 :     {
    2356             :         Buffer      buf;
    2357             :         Page        page;
    2358             :         BTPageOpaque opaque;
    2359             :         int         tries;
    2360             : 
    2361             :         /* check for interrupts while we're not holding any buffer lock */
    2362         164 :         CHECK_FOR_INTERRUPTS();
    2363         164 :         buf = _bt_getbuf(rel, *blkno, BT_READ);
    2364         164 :         page = BufferGetPage(buf);
    2365         164 :         opaque = BTPageGetOpaque(page);
    2366             : 
    2367             :         /*
    2368             :          * If this isn't the page we want, walk right till we find what we
    2369             :          * want --- but go no more than four hops (an arbitrary limit). If we
    2370             :          * don't find the correct page by then, the most likely bet is that
    2371             :          * lastcurrblkno got deleted and isn't in the sibling chain at all
    2372             :          * anymore, not that its left sibling got split more than four times.
    2373             :          *
    2374             :          * Note that it is correct to test P_ISDELETED not P_IGNORE here,
    2375             :          * because half-dead pages are still in the sibling chain.
    2376             :          */
    2377         164 :         tries = 0;
    2378             :         for (;;)
    2379             :         {
    2380         164 :             if (likely(!P_ISDELETED(opaque) &&
    2381             :                        opaque->btpo_next == lastcurrblkno))
    2382             :             {
    2383             :                 /* Found desired page, return it */
    2384         164 :                 return buf;
    2385             :             }
    2386           0 :             if (P_RIGHTMOST(opaque) || ++tries > 4)
    2387             :                 break;
    2388             :             /* step right */
    2389           0 :             *blkno = opaque->btpo_next;
    2390           0 :             buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
    2391           0 :             page = BufferGetPage(buf);
    2392           0 :             opaque = BTPageGetOpaque(page);
    2393             :         }
    2394             : 
    2395             :         /*
    2396             :          * Return to the original page (usually the page most recently read by
    2397             :          * _bt_readpage, which is passed by caller as lastcurrblkno) to see
    2398             :          * what's up with its prev sibling link
    2399             :          */
    2400           0 :         buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
    2401           0 :         page = BufferGetPage(buf);
    2402           0 :         opaque = BTPageGetOpaque(page);
    2403           0 :         if (P_ISDELETED(opaque))
    2404             :         {
    2405             :             /*
    2406             :              * It was deleted.  Move right to first nondeleted page (there
    2407             :              * must be one); that is the page that has acquired the deleted
    2408             :              * one's keyspace, so stepping left from it will take us where we
    2409             :              * want to be.
    2410             :              */
    2411             :             for (;;)
    2412             :             {
    2413           0 :                 if (P_RIGHTMOST(opaque))
    2414           0 :                     elog(ERROR, "fell off the end of index \"%s\"",
    2415             :                          RelationGetRelationName(rel));
    2416           0 :                 lastcurrblkno = opaque->btpo_next;
    2417           0 :                 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
    2418           0 :                 page = BufferGetPage(buf);
    2419           0 :                 opaque = BTPageGetOpaque(page);
    2420           0 :                 if (!P_ISDELETED(opaque))
    2421           0 :                     break;
    2422             :             }
    2423             :         }
    2424             :         else
    2425             :         {
    2426             :             /*
    2427             :              * Original lastcurrblkno wasn't deleted; the explanation had
    2428             :              * better be that the page to the left got split or deleted.
    2429             :              * Without this check, we risk going into an infinite loop.
    2430             :              */
    2431           0 :             if (opaque->btpo_prev == origblkno)
    2432           0 :                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
    2433             :                      lastcurrblkno, RelationGetRelationName(rel));
    2434             :             /* Okay to try again, since left sibling link changed */
    2435             :         }
    2436             : 
    2437             :         /*
    2438             :          * Original lastcurrblkno from caller was concurrently deleted (could
    2439             :          * also have been a great many concurrent left sibling page splits).
    2440             :          * Found a non-deleted page that should now act as our lastcurrblkno.
    2441             :          */
    2442           0 :         if (P_LEFTMOST(opaque))
    2443             :         {
    2444             :             /* New lastcurrblkno has no left sibling (concurrently deleted) */
    2445           0 :             _bt_relbuf(rel, buf);
    2446           0 :             break;
    2447             :         }
    2448             : 
    2449             :         /* Start from scratch with new lastcurrblkno's blkno/prev link */
    2450           0 :         *blkno = origblkno = opaque->btpo_prev;
    2451           0 :         _bt_relbuf(rel, buf);
    2452             :     }
    2453             : 
    2454           0 :     return InvalidBuffer;
    2455             : }
    2456             : 
    2457             : /*
    2458             :  * _bt_get_endpoint() -- Find the first or last page on a given tree level
    2459             :  *
    2460             :  * If the index is empty, we will return InvalidBuffer; any other failure
    2461             :  * condition causes ereport().  We will not return a dead page.
    2462             :  *
    2463             :  * The returned buffer is pinned and read-locked.
    2464             :  */
    2465             : Buffer
    2466       85806 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
    2467             : {
    2468             :     Buffer      buf;
    2469             :     Page        page;
    2470             :     BTPageOpaque opaque;
    2471             :     OffsetNumber offnum;
    2472             :     BlockNumber blkno;
    2473             :     IndexTuple  itup;
    2474             : 
    2475             :     /*
    2476             :      * If we are looking for a leaf page, okay to descend from fast root;
    2477             :      * otherwise better descend from true root.  (There is no point in being
    2478             :      * smarter about intermediate levels.)
    2479             :      */
    2480       85806 :     if (level == 0)
    2481       85782 :         buf = _bt_getroot(rel, NULL, BT_READ);
    2482             :     else
    2483          24 :         buf = _bt_gettrueroot(rel);
    2484             : 
    2485       85806 :     if (!BufferIsValid(buf))
    2486        7162 :         return InvalidBuffer;
    2487             : 
    2488       78644 :     page = BufferGetPage(buf);
    2489       78644 :     opaque = BTPageGetOpaque(page);
    2490             : 
    2491             :     for (;;)
    2492             :     {
    2493             :         /*
    2494             :          * If we landed on a deleted page, step right to find a live page
    2495             :          * (there must be one).  Also, if we want the rightmost page, step
    2496             :          * right if needed to get to it (this could happen if the page split
    2497             :          * since we obtained a pointer to it).
    2498             :          */
    2499      100768 :         while (P_IGNORE(opaque) ||
    2500          66 :                (rightmost && !P_RIGHTMOST(opaque)))
    2501             :         {
    2502           0 :             blkno = opaque->btpo_next;
    2503           0 :             if (blkno == P_NONE)
    2504           0 :                 elog(ERROR, "fell off the end of index \"%s\"",
    2505             :                      RelationGetRelationName(rel));
    2506           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2507           0 :             page = BufferGetPage(buf);
    2508           0 :             opaque = BTPageGetOpaque(page);
    2509             :         }
    2510             : 
    2511             :         /* Done? */
    2512      100768 :         if (opaque->btpo_level == level)
    2513       78644 :             break;
    2514       22124 :         if (opaque->btpo_level < level)
    2515           0 :             ereport(ERROR,
    2516             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2517             :                      errmsg_internal("btree level %u not found in index \"%s\"",
    2518             :                                      level, RelationGetRelationName(rel))));
    2519             : 
    2520             :         /* Descend to leftmost or rightmost child page */
    2521       22124 :         if (rightmost)
    2522           6 :             offnum = PageGetMaxOffsetNumber(page);
    2523             :         else
    2524       22118 :             offnum = P_FIRSTDATAKEY(opaque);
    2525             : 
    2526       22124 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    2527       22124 :         blkno = BTreeTupleGetDownLink(itup);
    2528             : 
    2529       22124 :         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2530       22124 :         page = BufferGetPage(buf);
    2531       22124 :         opaque = BTPageGetOpaque(page);
    2532             :     }
    2533             : 
    2534       78644 :     return buf;
    2535             : }
    2536             : 
    2537             : /*
    2538             :  *  _bt_endpoint() -- Find the first or last page in the index, and scan
    2539             :  * from there to the first key satisfying all the quals.
    2540             :  *
    2541             :  * This is used by _bt_first() to set up a scan when we've determined
    2542             :  * that the scan must start at the beginning or end of the index (for
    2543             :  * a forward or backward scan respectively).
    2544             :  *
    2545             :  * Parallel scan callers must have seized the scan before calling here.
    2546             :  * Exit conditions are the same as for _bt_first().
    2547             :  */
    2548             : static bool
    2549       85782 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
    2550             : {
    2551       85782 :     Relation    rel = scan->indexRelation;
    2552       85782 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2553             :     Page        page;
    2554             :     BTPageOpaque opaque;
    2555             :     OffsetNumber start;
    2556             : 
    2557             :     Assert(!BTScanPosIsValid(so->currPos));
    2558             :     Assert(!so->needPrimScan);
    2559             : 
    2560             :     /*
    2561             :      * Scan down to the leftmost or rightmost leaf page.  This is a simplified
    2562             :      * version of _bt_search().
    2563             :      */
    2564       85782 :     so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
    2565             : 
    2566       85782 :     if (!BufferIsValid(so->currPos.buf))
    2567             :     {
    2568             :         /*
    2569             :          * Empty index. Lock the whole relation, as nothing finer to lock
    2570             :          * exists.
    2571             :          */
    2572        7162 :         PredicateLockRelation(rel, scan->xs_snapshot);
    2573        7162 :         _bt_parallel_done(scan);
    2574        7162 :         return false;
    2575             :     }
    2576             : 
    2577       78620 :     page = BufferGetPage(so->currPos.buf);
    2578       78620 :     opaque = BTPageGetOpaque(page);
    2579             :     Assert(P_ISLEAF(opaque));
    2580             : 
    2581       78620 :     if (ScanDirectionIsForward(dir))
    2582             :     {
    2583             :         /* There could be dead pages to the left, so not this: */
    2584             :         /* Assert(P_LEFTMOST(opaque)); */
    2585             : 
    2586       78560 :         start = P_FIRSTDATAKEY(opaque);
    2587             :     }
    2588          60 :     else if (ScanDirectionIsBackward(dir))
    2589             :     {
    2590             :         Assert(P_RIGHTMOST(opaque));
    2591             : 
    2592          60 :         start = PageGetMaxOffsetNumber(page);
    2593             :     }
    2594             :     else
    2595             :     {
    2596           0 :         elog(ERROR, "invalid scan direction: %d", (int) dir);
    2597             :         start = 0;              /* keep compiler quiet */
    2598             :     }
    2599             : 
    2600             :     /*
    2601             :      * Now load data from the first page of the scan.
    2602             :      */
    2603       78620 :     if (!_bt_readfirstpage(scan, start, dir))
    2604        1722 :         return false;
    2605             : 
    2606       76898 :     _bt_returnitem(scan, so);
    2607       76898 :     return true;
    2608             : }

Generated by: LCOV version 1.14