LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsearch.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 482 546 88.3 %
Date: 2026-01-14 02:17:24 Functions: 16 16 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsearch.c
       4             :  *    Search code for postgres btrees.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtsearch.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include "access/nbtree.h"
      19             : #include "access/relscan.h"
      20             : #include "access/xact.h"
      21             : #include "executor/instrument_node.h"
      22             : #include "miscadmin.h"
      23             : #include "pgstat.h"
      24             : #include "storage/predicate.h"
      25             : #include "utils/lsyscache.h"
      26             : #include "utils/rel.h"
      27             : 
      28             : 
      29             : static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
      30             : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
      31             :                             Buffer buf, bool forupdate, BTStack stack,
      32             :                             int access);
      33             : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
      34             : static int  _bt_binsrch_posting(BTScanInsert key, Page page,
      35             :                                 OffsetNumber offnum);
      36             : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
      37             : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
      38             : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
      39             :                               ScanDirection dir);
      40             : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
      41             :                              BlockNumber lastcurrblkno, ScanDirection dir,
      42             :                              bool seized);
      43             : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
      44             :                                          BlockNumber lastcurrblkno);
      45             : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
      46             : 
      47             : 
      48             : /*
      49             :  *  _bt_drop_lock_and_maybe_pin()
      50             :  *
      51             :  * Unlock so->currPos.buf.  If scan is so->dropPin, drop the pin, too.
      52             :  * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
      53             :  */
      54             : static inline void
      55    11831248 : _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
      56             : {
      57    11831248 :     if (!so->dropPin)
      58             :     {
      59             :         /* Just drop the lock (not the pin) */
      60      506252 :         _bt_unlockbuf(rel, so->currPos.buf);
      61      506252 :         return;
      62             :     }
      63             : 
      64             :     /*
      65             :      * Drop both the lock and the pin.
      66             :      *
      67             :      * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
      68             :      * when concurrent heap TID recycling by VACUUM might have taken place.
      69             :      */
      70             :     Assert(RelationNeedsWAL(rel));
      71    11324996 :     so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
      72    11324996 :     _bt_relbuf(rel, so->currPos.buf);
      73    11324996 :     so->currPos.buf = InvalidBuffer;
      74             : }
      75             : 
      76             : /*
      77             :  *  _bt_search() -- Search the tree for a particular scankey,
      78             :  *      or more precisely for the first leaf page it could be on.
      79             :  *
      80             :  * The passed scankey is an insertion-type scankey (see nbtree/README),
      81             :  * but it can omit the rightmost column(s) of the index.
      82             :  *
      83             :  * Return value is a stack of parent-page pointers (i.e. there is no entry for
      84             :  * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
      85             :  * which is locked and pinned.  No locks are held on the parent pages,
      86             :  * however!
      87             :  *
      88             :  * The returned buffer is locked according to access parameter.  Additionally,
      89             :  * access = BT_WRITE will allow an empty root page to be created and returned.
      90             :  * When access = BT_READ, an empty index will result in *bufP being set to
      91             :  * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
      92             :  * during the search will be finished.
      93             :  *
      94             :  * heaprel must be provided by callers that pass access = BT_WRITE, since we
      95             :  * might need to allocate a new root page for caller -- see _bt_allocbuf.
      96             :  */
      97             : BTStack
      98    24142652 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
      99             :            int access)
     100             : {
     101    24142652 :     BTStack     stack_in = NULL;
     102    24142652 :     int         page_access = BT_READ;
     103             : 
     104             :     /* heaprel must be set whenever _bt_allocbuf is reachable */
     105             :     Assert(access == BT_READ || access == BT_WRITE);
     106             :     Assert(access == BT_READ || heaprel != NULL);
     107             : 
     108             :     /* Get the root page to start with */
     109    24142652 :     *bufP = _bt_getroot(rel, heaprel, access);
     110             : 
     111             :     /* If index is empty and access = BT_READ, no root page is created. */
     112    24142652 :     if (!BufferIsValid(*bufP))
     113      583600 :         return (BTStack) NULL;
     114             : 
     115             :     /* Loop iterates once per level descended in the tree */
     116             :     for (;;)
     117    19392906 :     {
     118             :         Page        page;
     119             :         BTPageOpaque opaque;
     120             :         OffsetNumber offnum;
     121             :         ItemId      itemid;
     122             :         IndexTuple  itup;
     123             :         BlockNumber child;
     124             :         BTStack     new_stack;
     125             : 
     126             :         /*
     127             :          * Race -- the page we just grabbed may have split since we read its
     128             :          * downlink in its parent page (or the metapage).  If it has, we may
     129             :          * need to move right to its new sibling.  Do that.
     130             :          *
     131             :          * In write-mode, allow _bt_moveright to finish any incomplete splits
     132             :          * along the way.  Strictly speaking, we'd only need to finish an
     133             :          * incomplete split on the leaf page we're about to insert to, not on
     134             :          * any of the upper levels (internal pages with incomplete splits are
     135             :          * also taken care of in _bt_getstackbuf).  But this is a good
     136             :          * opportunity to finish splits of internal pages too.
     137             :          */
     138    42951958 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
     139             :                               stack_in, page_access);
     140             : 
     141             :         /* if this is a leaf page, we're done */
     142    42951958 :         page = BufferGetPage(*bufP);
     143    42951958 :         opaque = BTPageGetOpaque(page);
     144    42951958 :         if (P_ISLEAF(opaque))
     145    23559052 :             break;
     146             : 
     147             :         /*
     148             :          * Find the appropriate pivot tuple on this page.  Its downlink points
     149             :          * to the child page that we're about to descend to.
     150             :          */
     151    19392906 :         offnum = _bt_binsrch(rel, key, *bufP);
     152    19392906 :         itemid = PageGetItemId(page, offnum);
     153    19392906 :         itup = (IndexTuple) PageGetItem(page, itemid);
     154             :         Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
     155    19392906 :         child = BTreeTupleGetDownLink(itup);
     156             : 
     157             :         /*
     158             :          * We need to save the location of the pivot tuple we chose in a new
     159             :          * stack entry for this page/level.  If caller ends up splitting a
     160             :          * page one level down, it usually ends up inserting a new pivot
     161             :          * tuple/downlink immediately after the location recorded here.
     162             :          */
     163    19392906 :         new_stack = (BTStack) palloc_object(BTStackData);
     164    19392906 :         new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
     165    19392906 :         new_stack->bts_offset = offnum;
     166    19392906 :         new_stack->bts_parent = stack_in;
     167             : 
     168             :         /*
     169             :          * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
     170             :          * we're on the level 1 and asked to lock leaf page in write mode,
     171             :          * then lock next page in write mode, because it must be a leaf.
     172             :          */
     173    19392906 :         if (opaque->btpo_level == 1 && access == BT_WRITE)
     174     6449646 :             page_access = BT_WRITE;
     175             : 
     176             :         /* drop the read lock on the page, then acquire one on its child */
     177    19392906 :         *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
     178             : 
     179             :         /* okay, all set to move down a level */
     180    19392906 :         stack_in = new_stack;
     181             :     }
     182             : 
     183             :     /*
     184             :      * If we're asked to lock leaf in write mode, but didn't manage to, then
     185             :      * relock.  This should only happen when the root page is a leaf page (and
     186             :      * the only page in the index other than the metapage).
     187             :      */
     188    23559052 :     if (access == BT_WRITE && page_access == BT_READ)
     189             :     {
     190             :         /* trade in our read lock for a write lock */
     191      909782 :         _bt_unlockbuf(rel, *bufP);
     192      909782 :         _bt_lockbuf(rel, *bufP, BT_WRITE);
     193             : 
     194             :         /*
     195             :          * Race -- the leaf page may have split after we dropped the read lock
     196             :          * but before we acquired a write lock.  If it has, we may need to
     197             :          * move right to its new sibling.  Do that.
     198             :          */
     199      909782 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
     200             :     }
     201             : 
     202    23559052 :     return stack_in;
     203             : }
     204             : 
     205             : /*
     206             :  *  _bt_moveright() -- move right in the btree if necessary.
     207             :  *
     208             :  * When we follow a pointer to reach a page, it is possible that
     209             :  * the page has changed in the meanwhile.  If this happens, we're
     210             :  * guaranteed that the page has "split right" -- that is, that any
     211             :  * data that appeared on the page originally is either on the page
     212             :  * or strictly to the right of it.
     213             :  *
     214             :  * This routine decides whether or not we need to move right in the
     215             :  * tree by examining the high key entry on the page.  If that entry is
     216             :  * strictly less than the scankey, or <= the scankey in the
     217             :  * key.nextkey=true case, then we followed the wrong link and we need
     218             :  * to move right.
     219             :  *
     220             :  * The passed insertion-type scankey can omit the rightmost column(s) of the
     221             :  * index. (see nbtree/README)
     222             :  *
     223             :  * When key.nextkey is false (the usual case), we are looking for the first
     224             :  * item >= key.  When key.nextkey is true, we are looking for the first item
     225             :  * strictly greater than key.
     226             :  *
     227             :  * If forupdate is true, we will attempt to finish any incomplete splits
     228             :  * that we encounter.  This is required when locking a target page for an
     229             :  * insertion, because we don't allow inserting on a page before the split is
     230             :  * completed.  'heaprel' and 'stack' are only used if forupdate is true.
     231             :  *
     232             :  * On entry, we have the buffer pinned and a lock of the type specified by
     233             :  * 'access'.  If we move right, we release the buffer and lock and acquire
     234             :  * the same on the right sibling.  Return value is the buffer we stop at.
     235             :  */
     236             : static Buffer
     237    43861740 : _bt_moveright(Relation rel,
     238             :               Relation heaprel,
     239             :               BTScanInsert key,
     240             :               Buffer buf,
     241             :               bool forupdate,
     242             :               BTStack stack,
     243             :               int access)
     244             : {
     245             :     Page        page;
     246             :     BTPageOpaque opaque;
     247             :     int32       cmpval;
     248             : 
     249             :     Assert(!forupdate || heaprel != NULL);
     250             : 
     251             :     /*
     252             :      * When nextkey = false (normal case): if the scan key that brought us to
     253             :      * this page is > the high key stored on the page, then the page has split
     254             :      * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     255             :      * have some duplicates to the right as well as the left, but that's
     256             :      * something that's only ever dealt with on the leaf level, after
     257             :      * _bt_search has found an initial leaf page.)
     258             :      *
     259             :      * When nextkey = true: move right if the scan key is >= page's high key.
     260             :      * (Note that key.scantid cannot be set in this case.)
     261             :      *
     262             :      * The page could even have split more than once, so scan as far as
     263             :      * needed.
     264             :      *
     265             :      * We also have to move right if we followed a link that brought us to a
     266             :      * dead page.
     267             :      */
     268    43861740 :     cmpval = key->nextkey ? 0 : 1;
     269             : 
     270             :     for (;;)
     271             :     {
     272    43863482 :         page = BufferGetPage(buf);
     273    43863482 :         opaque = BTPageGetOpaque(page);
     274             : 
     275    43863482 :         if (P_RIGHTMOST(opaque))
     276    33217786 :             break;
     277             : 
     278             :         /*
     279             :          * Finish any incomplete splits we encounter along the way.
     280             :          */
     281    10645696 :         if (forupdate && P_INCOMPLETE_SPLIT(opaque))
     282           0 :         {
     283           0 :             BlockNumber blkno = BufferGetBlockNumber(buf);
     284             : 
     285             :             /* upgrade our lock if necessary */
     286           0 :             if (access == BT_READ)
     287             :             {
     288           0 :                 _bt_unlockbuf(rel, buf);
     289           0 :                 _bt_lockbuf(rel, buf, BT_WRITE);
     290             :             }
     291             : 
     292           0 :             if (P_INCOMPLETE_SPLIT(opaque))
     293           0 :                 _bt_finish_split(rel, heaprel, buf, stack);
     294             :             else
     295           0 :                 _bt_relbuf(rel, buf);
     296             : 
     297             :             /* re-acquire the lock in the right mode, and re-check */
     298           0 :             buf = _bt_getbuf(rel, blkno, access);
     299           0 :             continue;
     300             :         }
     301             : 
     302    10645696 :         if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
     303             :         {
     304             :             /* step right one page */
     305        1742 :             buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
     306        1742 :             continue;
     307             :         }
     308             :         else
     309             :             break;
     310             :     }
     311             : 
     312    43861740 :     if (P_IGNORE(opaque))
     313           0 :         elog(ERROR, "fell off the end of index \"%s\"",
     314             :              RelationGetRelationName(rel));
     315             : 
     316    43861740 :     return buf;
     317             : }
     318             : 
     319             : /*
     320             :  *  _bt_binsrch() -- Do a binary search for a key on a particular page.
     321             :  *
     322             :  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
     323             :  * of the last key < given scankey, or last key <= given scankey if nextkey
     324             :  * is true.  (Since _bt_compare treats the first data key of such a page as
     325             :  * minus infinity, there will be at least one key < scankey, so the result
     326             :  * always points at one of the keys on the page.)
     327             :  *
     328             :  * On a leaf page, _bt_binsrch() returns the final result of the initial
     329             :  * positioning process that started with _bt_first's call to _bt_search.
     330             :  * We're returning a non-pivot tuple offset, so things are a little different.
     331             :  * It is possible that we'll return an offset that's either past the last
     332             :  * non-pivot slot, or (in the case of a backward scan) before the first slot.
     333             :  *
     334             :  * This procedure is not responsible for walking right, it just examines
     335             :  * the given page.  _bt_binsrch() has no lock or refcount side effects
     336             :  * on the buffer.
     337             :  */
     338             : static OffsetNumber
     339    35184640 : _bt_binsrch(Relation rel,
     340             :             BTScanInsert key,
     341             :             Buffer buf)
     342             : {
     343             :     Page        page;
     344             :     BTPageOpaque opaque;
     345             :     OffsetNumber low,
     346             :                 high;
     347             :     int32       result,
     348             :                 cmpval;
     349             : 
     350    35184640 :     page = BufferGetPage(buf);
     351    35184640 :     opaque = BTPageGetOpaque(page);
     352             : 
     353             :     /* Requesting nextkey semantics while using scantid seems nonsensical */
     354             :     Assert(!key->nextkey || key->scantid == NULL);
     355             :     /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
     356             :     Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
     357             : 
     358    35184640 :     low = P_FIRSTDATAKEY(opaque);
     359    35184640 :     high = PageGetMaxOffsetNumber(page);
     360             : 
     361             :     /*
     362             :      * If there are no keys on the page, return the first available slot. Note
     363             :      * this covers two cases: the page is really empty (no keys), or it
     364             :      * contains only a high key.  The latter case is possible after vacuuming.
     365             :      * This can never happen on an internal page, however, since they are
     366             :      * never empty (an internal page must have at least one child).
     367             :      */
     368    35184640 :     if (unlikely(high < low))
     369       10494 :         return low;
     370             : 
     371             :     /*
     372             :      * Binary search to find the first key on the page >= scan key, or first
     373             :      * key > scankey when nextkey is true.
     374             :      *
     375             :      * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     376             :      * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     377             :      *
     378             :      * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     379             :      * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     380             :      *
     381             :      * We can fall out when high == low.
     382             :      */
     383    35174146 :     high++;                     /* establish the loop invariant for high */
     384             : 
     385    35174146 :     cmpval = key->nextkey ? 0 : 1;   /* select comparison value */
     386             : 
     387   230248538 :     while (high > low)
     388             :     {
     389   195074392 :         OffsetNumber mid = low + ((high - low) / 2);
     390             : 
     391             :         /* We have low <= mid < high, so mid points at a real slot */
     392             : 
     393   195074392 :         result = _bt_compare(rel, key, page, mid);
     394             : 
     395   195074392 :         if (result >= cmpval)
     396   120841154 :             low = mid + 1;
     397             :         else
     398    74233238 :             high = mid;
     399             :     }
     400             : 
     401             :     /*
     402             :      * At this point we have high == low.
     403             :      *
     404             :      * On a leaf page we always return the first non-pivot tuple >= scan key
     405             :      * (resp. > scan key) for forward scan callers.  For backward scans, it's
     406             :      * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
     407             :      */
     408    35174146 :     if (P_ISLEAF(opaque))
     409             :     {
     410             :         /*
     411             :          * In the backward scan case we're supposed to locate the last
     412             :          * matching tuple on the leaf level -- not the first matching tuple
     413             :          * (the last tuple will be the first one returned by the scan).
     414             :          *
     415             :          * At this point we've located the first non-pivot tuple immediately
     416             :          * after the last matching tuple (which might just be maxoff + 1).
     417             :          * Compensate by stepping back.
     418             :          */
     419    15781240 :         if (key->backward)
     420       56636 :             return OffsetNumberPrev(low);
     421             : 
     422    15724604 :         return low;
     423             :     }
     424             : 
     425             :     /*
     426             :      * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     427             :      * There must be one if _bt_compare() is playing by the rules.
     428             :      *
     429             :      * _bt_compare() will seldom see any exactly-matching pivot tuples, since
     430             :      * a truncated -inf heap TID is usually enough to prevent it altogether.
     431             :      * Even omitted scan key entries are treated as > truncated attributes.
     432             :      *
     433             :      * However, during backward scans _bt_compare() interprets omitted scan
     434             :      * key attributes as == corresponding truncated -inf attributes instead.
     435             :      * This works just like < would work here.  Under this scheme, < strategy
     436             :      * backward scans will always directly descend to the correct leaf page.
     437             :      * In particular, they will never incur an "extra" leaf page access with a
     438             :      * scan key that happens to contain the same prefix of values as some
     439             :      * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when
     440             :      * it uses a leaf page high key to "re-find" a page undergoing deletion.
     441             :      */
     442             :     Assert(low > P_FIRSTDATAKEY(opaque));
     443             : 
     444    19392906 :     return OffsetNumberPrev(low);
     445             : }
     446             : 
     447             : /*
     448             :  *
     449             :  *  _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
     450             :  *
     451             :  * Like _bt_binsrch(), but with support for caching the binary search
     452             :  * bounds.  Only used during insertion, and only on the leaf page that it
     453             :  * looks like caller will insert tuple on.  Exclusive-locked and pinned
     454             :  * leaf page is contained within insertstate.
     455             :  *
     456             :  * Caches the bounds fields in insertstate so that a subsequent call can
     457             :  * reuse the low and strict high bounds of original binary search.  Callers
     458             :  * that use these fields directly must be prepared for the case where low
     459             :  * and/or stricthigh are not on the same page (one or both exceed maxoff
     460             :  * for the page).  The case where there are no items on the page (high <
     461             :  * low) makes bounds invalid.
     462             :  *
     463             :  * Caller is responsible for invalidating bounds when it modifies the page
     464             :  * before calling here a second time, and for dealing with posting list
     465             :  * tuple matches (callers can use insertstate's postingoff field to
     466             :  * determine which existing heap TID will need to be replaced by a posting
     467             :  * list split).
     468             :  */
     469             : OffsetNumber
     470    13118006 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
     471             : {
     472    13118006 :     BTScanInsert key = insertstate->itup_key;
     473             :     Page        page;
     474             :     BTPageOpaque opaque;
     475             :     OffsetNumber low,
     476             :                 high,
     477             :                 stricthigh;
     478             :     int32       result,
     479             :                 cmpval;
     480             : 
     481    13118006 :     page = BufferGetPage(insertstate->buf);
     482    13118006 :     opaque = BTPageGetOpaque(page);
     483             : 
     484             :     Assert(P_ISLEAF(opaque));
     485             :     Assert(!key->nextkey);
     486             :     Assert(insertstate->postingoff == 0);
     487             : 
     488    13118006 :     if (!insertstate->bounds_valid)
     489             :     {
     490             :         /* Start new binary search */
     491     7853772 :         low = P_FIRSTDATAKEY(opaque);
     492     7853772 :         high = PageGetMaxOffsetNumber(page);
     493             :     }
     494             :     else
     495             :     {
     496             :         /* Restore result of previous binary search against same page */
     497     5264234 :         low = insertstate->low;
     498     5264234 :         high = insertstate->stricthigh;
     499             :     }
     500             : 
     501             :     /* If there are no keys on the page, return the first available slot */
     502    13118006 :     if (unlikely(high < low))
     503             :     {
     504             :         /* Caller can't reuse bounds */
     505       24260 :         insertstate->low = InvalidOffsetNumber;
     506       24260 :         insertstate->stricthigh = InvalidOffsetNumber;
     507       24260 :         insertstate->bounds_valid = false;
     508       24260 :         return low;
     509             :     }
     510             : 
     511             :     /*
     512             :      * Binary search to find the first key on the page >= scan key. (nextkey
     513             :      * is always false when inserting).
     514             :      *
     515             :      * The loop invariant is: all slots before 'low' are < scan key, all slots
     516             :      * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     517             :      * maintained to save additional search effort for caller.
     518             :      *
     519             :      * We can fall out when high == low.
     520             :      */
     521    13093746 :     if (!insertstate->bounds_valid)
     522     7829512 :         high++;                 /* establish the loop invariant for high */
     523    13093746 :     stricthigh = high;          /* high initially strictly higher */
     524             : 
     525    13093746 :     cmpval = 1;                 /* !nextkey comparison value */
     526             : 
     527    70451108 :     while (high > low)
     528             :     {
     529    57357362 :         OffsetNumber mid = low + ((high - low) / 2);
     530             : 
     531             :         /* We have low <= mid < high, so mid points at a real slot */
     532             : 
     533    57357362 :         result = _bt_compare(rel, key, page, mid);
     534             : 
     535    57357362 :         if (result >= cmpval)
     536    43644618 :             low = mid + 1;
     537             :         else
     538             :         {
     539    13712744 :             high = mid;
     540    13712744 :             if (result != 0)
     541    12590070 :                 stricthigh = high;
     542             :         }
     543             : 
     544             :         /*
     545             :          * If tuple at offset located by binary search is a posting list whose
     546             :          * TID range overlaps with caller's scantid, perform posting list
     547             :          * binary search to set postingoff for caller.  Caller must split the
     548             :          * posting list when postingoff is set.  This should happen
     549             :          * infrequently.
     550             :          */
     551    57357362 :         if (unlikely(result == 0 && key->scantid != NULL))
     552             :         {
     553             :             /*
     554             :              * postingoff should never be set more than once per leaf page
     555             :              * binary search.  That would mean that there are duplicate table
     556             :              * TIDs in the index, which is never okay.  Check for that here.
     557             :              */
     558      427166 :             if (insertstate->postingoff != 0)
     559           0 :                 ereport(ERROR,
     560             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
     561             :                          errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
     562             :                                          ItemPointerGetBlockNumber(key->scantid),
     563             :                                          ItemPointerGetOffsetNumber(key->scantid),
     564             :                                          low, stricthigh,
     565             :                                          BufferGetBlockNumber(insertstate->buf),
     566             :                                          RelationGetRelationName(rel))));
     567             : 
     568      427166 :             insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
     569             :         }
     570             :     }
     571             : 
     572             :     /*
     573             :      * On a leaf page, a binary search always returns the first key >= scan
     574             :      * key (at least in !nextkey case), which could be the last slot + 1. This
     575             :      * is also the lower bound of cached search.
     576             :      *
     577             :      * stricthigh may also be the last slot + 1, which prevents caller from
     578             :      * using bounds directly, but is still useful to us if we're called a
     579             :      * second time with cached bounds (cached low will be < stricthigh when
     580             :      * that happens).
     581             :      */
     582    13093746 :     insertstate->low = low;
     583    13093746 :     insertstate->stricthigh = stricthigh;
     584    13093746 :     insertstate->bounds_valid = true;
     585             : 
     586    13093746 :     return low;
     587             : }
     588             : 
     589             : /*----------
     590             :  *  _bt_binsrch_posting() -- posting list binary search.
     591             :  *
     592             :  * Helper routine for _bt_binsrch_insert().
     593             :  *
     594             :  * Returns offset into posting list where caller's scantid belongs.
     595             :  *----------
     596             :  */
     597             : static int
     598      427166 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
     599             : {
     600             :     IndexTuple  itup;
     601             :     ItemId      itemid;
     602             :     int         low,
     603             :                 high,
     604             :                 mid,
     605             :                 res;
     606             : 
     607             :     /*
     608             :      * If this isn't a posting tuple, then the index must be corrupt (if it is
     609             :      * an ordinary non-pivot tuple then there must be an existing tuple with a
     610             :      * heap TID that equals inserter's new heap TID/scantid).  Defensively
     611             :      * check that tuple is a posting list tuple whose posting list range
     612             :      * includes caller's scantid.
     613             :      *
     614             :      * (This is also needed because contrib/amcheck's rootdescend option needs
     615             :      * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     616             :      */
     617      427166 :     itemid = PageGetItemId(page, offnum);
     618      427166 :     itup = (IndexTuple) PageGetItem(page, itemid);
     619      427166 :     if (!BTreeTupleIsPosting(itup))
     620      402196 :         return 0;
     621             : 
     622             :     Assert(key->heapkeyspace && key->allequalimage);
     623             : 
     624             :     /*
     625             :      * In the event that posting list tuple has LP_DEAD bit set, indicate this
     626             :      * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     627             :      * second call to _bt_binsrch_insert() can take place when its caller has
     628             :      * removed the dead item.
     629             :      */
     630       24970 :     if (ItemIdIsDead(itemid))
     631           6 :         return -1;
     632             : 
     633             :     /* "high" is past end of posting list for loop invariant */
     634       24964 :     low = 0;
     635       24964 :     high = BTreeTupleGetNPosting(itup);
     636             :     Assert(high >= 2);
     637             : 
     638      201428 :     while (high > low)
     639             :     {
     640      176464 :         mid = low + ((high - low) / 2);
     641      176464 :         res = ItemPointerCompare(key->scantid,
     642      176464 :                                  BTreeTupleGetPostingN(itup, mid));
     643             : 
     644      176464 :         if (res > 0)
     645       92678 :             low = mid + 1;
     646       83786 :         else if (res < 0)
     647       83786 :             high = mid;
     648             :         else
     649           0 :             return mid;
     650             :     }
     651             : 
     652             :     /* Exact match not found */
     653       24964 :     return low;
     654             : }
     655             : 
     656             : /*----------
     657             :  *  _bt_compare() -- Compare insertion-type scankey to tuple on a page.
     658             :  *
     659             :  *  page/offnum: location of btree item to be compared to.
     660             :  *
     661             :  *      This routine returns:
     662             :  *          <0 if scankey < tuple at offnum;
     663             :  *           0 if scankey == tuple at offnum;
     664             :  *          >0 if scankey > tuple at offnum.
     665             :  *
     666             :  * NULLs in the keys are treated as sortable values.  Therefore
     667             :  * "equality" does not necessarily mean that the item should be returned
     668             :  * to the caller as a matching key.  Similarly, an insertion scankey
     669             :  * with its scantid set is treated as equal to a posting tuple whose TID
     670             :  * range overlaps with their scantid.  There generally won't be a
     671             :  * matching TID in the posting tuple, which caller must handle
     672             :  * themselves (e.g., by splitting the posting list tuple).
     673             :  *
     674             :  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
     675             :  * "minus infinity": this routine will always claim it is less than the
     676             :  * scankey.  The actual key value stored is explicitly truncated to 0
     677             :  * attributes (explicitly minus infinity) with version 3+ indexes, but
     678             :  * that isn't relied upon.  This allows us to implement the Lehman and
     679             :  * Yao convention that the first down-link pointer is before the first
     680             :  * key.  See backend/access/nbtree/README for details.
     681             :  *----------
     682             :  */
     683             : int32
     684   280264950 : _bt_compare(Relation rel,
     685             :             BTScanInsert key,
     686             :             Page page,
     687             :             OffsetNumber offnum)
     688             : {
     689   280264950 :     TupleDesc   itupdesc = RelationGetDescr(rel);
     690   280264950 :     BTPageOpaque opaque = BTPageGetOpaque(page);
     691             :     IndexTuple  itup;
     692             :     ItemPointer heapTid;
     693             :     ScanKey     scankey;
     694             :     int         ncmpkey;
     695             :     int         ntupatts;
     696             :     int32       result;
     697             : 
     698             :     Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
     699             :     Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
     700             :     Assert(key->heapkeyspace || key->scantid == NULL);
     701             : 
     702             :     /*
     703             :      * Force result ">" if target item is first data item on an internal page
     704             :      * --- see NOTE above.
     705             :      */
     706   280264950 :     if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
     707     3796344 :         return 1;
     708             : 
     709   276468606 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     710   276468606 :     ntupatts = BTreeTupleGetNAtts(itup, rel);
     711             : 
     712             :     /*
     713             :      * The scan key is set up with the attribute number associated with each
     714             :      * term in the key.  It is important that, if the index is multi-key, the
     715             :      * scan contain the first k key attributes, and that they be in order.  If
     716             :      * you think about how multi-key ordering works, you'll understand why
     717             :      * this is.
     718             :      *
     719             :      * We don't test for violation of this condition here, however.  The
     720             :      * initial setup for the index scan had better have gotten it right (see
     721             :      * _bt_first).
     722             :      */
     723             : 
     724   276468606 :     ncmpkey = Min(ntupatts, key->keysz);
     725             :     Assert(key->heapkeyspace || ncmpkey == key->keysz);
     726             :     Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
     727   276468606 :     scankey = key->scankeys;
     728   348025892 :     for (int i = 1; i <= ncmpkey; i++)
     729             :     {
     730             :         Datum       datum;
     731             :         bool        isNull;
     732             : 
     733   323758488 :         datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
     734             : 
     735   323758488 :         if (scankey->sk_flags & SK_ISNULL)   /* key is NULL */
     736             :         {
     737      555480 :             if (isNull)
     738      157384 :                 result = 0;     /* NULL "=" NULL */
     739      398096 :             else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     740         624 :                 result = -1;    /* NULL "<" NOT_NULL */
     741             :             else
     742      397472 :                 result = 1;     /* NULL ">" NOT_NULL */
     743             :         }
     744   323203008 :         else if (isNull)        /* key is NOT_NULL and item is NULL */
     745             :         {
     746         264 :             if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     747           0 :                 result = 1;     /* NOT_NULL ">" NULL */
     748             :             else
     749         264 :                 result = -1;    /* NOT_NULL "<" NULL */
     750             :         }
     751             :         else
     752             :         {
     753             :             /*
     754             :              * The sk_func needs to be passed the index value as left arg and
     755             :              * the sk_argument as right arg (they might be of different
     756             :              * types).  Since it is convenient for callers to think of
     757             :              * _bt_compare as comparing the scankey to the index item, we have
     758             :              * to flip the sign of the comparison result.  (Unless it's a DESC
     759             :              * column, in which case we *don't* flip the sign.)
     760             :              */
     761   323202744 :             result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
     762             :                                                      scankey->sk_collation,
     763             :                                                      datum,
     764             :                                                      scankey->sk_argument));
     765             : 
     766   323202744 :             if (!(scankey->sk_flags & SK_BT_DESC))
     767   323202678 :                 INVERT_COMPARE_RESULT(result);
     768             :         }
     769             : 
     770             :         /* if the keys are unequal, return the difference */
     771   323758488 :         if (result != 0)
     772   252201202 :             return result;
     773             : 
     774    71557286 :         scankey++;
     775             :     }
     776             : 
     777             :     /*
     778             :      * All non-truncated attributes (other than heap TID) were found to be
     779             :      * equal.  Treat truncated attributes as minus infinity when scankey has a
     780             :      * key attribute value that would otherwise be compared directly.
     781             :      *
     782             :      * Note: it doesn't matter if ntupatts includes non-key attributes;
     783             :      * scankey won't, so explicitly excluding non-key attributes isn't
     784             :      * necessary.
     785             :      */
     786    24267404 :     if (key->keysz > ntupatts)
     787      201104 :         return 1;
     788             : 
     789             :     /*
     790             :      * Use the heap TID attribute and scantid to try to break the tie.  The
     791             :      * rules are the same as any other key attribute -- only the
     792             :      * representation differs.
     793             :      */
     794    24066300 :     heapTid = BTreeTupleGetHeapTID(itup);
     795    24066300 :     if (key->scantid == NULL)
     796             :     {
     797             :         /*
     798             :          * Forward scans have a scankey that is considered greater than a
     799             :          * truncated pivot tuple if and when the scankey has equal values for
     800             :          * attributes up to and including the least significant untruncated
     801             :          * attribute in tuple.  Even attributes that were omitted from the
     802             :          * scan key are considered greater than -inf truncated attributes.
     803             :          * (See _bt_binsrch for an explanation of our backward scan behavior.)
     804             :          *
     805             :          * For example, if an index has the minimum two attributes (single
     806             :          * user key attribute, plus heap TID attribute), and a page's high key
     807             :          * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
     808             :          * will not descend to the page to the left.  The search will descend
     809             :          * right instead.  The truncated attribute in pivot tuple means that
     810             :          * all non-pivot tuples on the page to the left are strictly < 'foo',
     811             :          * so it isn't necessary to descend left.  In other words, search
     812             :          * doesn't have to descend left because it isn't interested in a match
     813             :          * that has a heap TID value of -inf.
     814             :          *
     815             :          * Note: the heap TID part of the test ensures that scankey is being
     816             :          * compared to a pivot tuple with one or more truncated -inf key
     817             :          * attributes.  The heap TID attribute is the last key attribute in
     818             :          * every index, of course, but other than that it isn't special.
     819             :          */
     820    19480198 :         if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
     821       10754 :             key->heapkeyspace)
     822       10754 :             return 1;
     823             : 
     824             :         /* All provided scankey arguments found to be equal */
     825    19469444 :         return 0;
     826             :     }
     827             : 
     828             :     /*
     829             :      * Treat truncated heap TID as minus infinity, since scankey has a key
     830             :      * attribute value (scantid) that would otherwise be compared directly
     831             :      */
     832             :     Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
     833     4586102 :     if (heapTid == NULL)
     834        3972 :         return 1;
     835             : 
     836             :     /*
     837             :      * Scankey must be treated as equal to a posting list tuple if its scantid
     838             :      * value falls within the range of the posting list.  In all other cases
     839             :      * there can only be a single heap TID value, which is compared directly
     840             :      * with scantid.
     841             :      */
     842             :     Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
     843     4582130 :     result = ItemPointerCompare(key->scantid, heapTid);
     844     4582130 :     if (result <= 0 || !BTreeTupleIsPosting(itup))
     845     4409138 :         return result;
     846             :     else
     847             :     {
     848      172992 :         result = ItemPointerCompare(key->scantid,
     849      172992 :                                     BTreeTupleGetMaxHeapTID(itup));
     850      172992 :         if (result > 0)
     851      148022 :             return 1;
     852             :     }
     853             : 
     854       24970 :     return 0;
     855             : }
     856             : 
     857             : /*
     858             :  *  _bt_first() -- Find the first item in a scan.
     859             :  *
     860             :  *      We need to be clever about the direction of scan, the search
     861             :  *      conditions, and the tree ordering.  We find the first item (or,
     862             :  *      if backwards scan, the last item) in the tree that satisfies the
     863             :  *      qualifications in the scan key.  On success exit, data about the
     864             :  *      matching tuple(s) on the page has been loaded into so->currPos.  We'll
     865             :  *      drop all locks and hold onto a pin on page's buffer, except during
     866             :  *      so->dropPin scans, when we drop both the lock and the pin.
     867             :  *      _bt_returnitem sets the next item to return to scan on success exit.
     868             :  *
     869             :  * If there are no matching items in the index, we return false, with no
     870             :  * pins or locks held.  so->currPos will remain invalid.
     871             :  *
     872             :  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
     873             :  * are both search-type scankeys (see nbtree/README for more about this).
     874             :  * Within this routine, we build a temporary insertion-type scankey to use
     875             :  * in locating the scan start position.
     876             :  */
     877             : bool
     878    16461622 : _bt_first(IndexScanDesc scan, ScanDirection dir)
     879             : {
     880    16461622 :     Relation    rel = scan->indexRelation;
     881    16461622 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     882             :     BTStack     stack;
     883             :     OffsetNumber offnum;
     884             :     BTScanInsertData inskey;
     885             :     ScanKey     startKeys[INDEX_MAX_KEYS];
     886             :     ScanKeyData notnullkey;
     887    16461622 :     int         keysz = 0;
     888    16461622 :     StrategyNumber strat_total = InvalidStrategy;
     889    16461622 :     BlockNumber blkno = InvalidBlockNumber,
     890             :                 lastcurrblkno;
     891             : 
     892             :     Assert(!BTScanPosIsValid(so->currPos));
     893             : 
     894             :     /*
     895             :      * Examine the scan keys and eliminate any redundant keys; also mark the
     896             :      * keys that must be matched to continue the scan.
     897             :      */
     898    16461622 :     _bt_preprocess_keys(scan);
     899             : 
     900             :     /*
     901             :      * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     902             :      * never be satisfied (eg, x == 1 AND x > 2).
     903             :      */
     904    16461622 :     if (!so->qual_ok)
     905             :     {
     906             :         Assert(!so->needPrimScan);
     907        2076 :         _bt_parallel_done(scan);
     908        2076 :         return false;
     909             :     }
     910             : 
     911             :     /*
     912             :      * If this is a parallel scan, we must seize the scan.  _bt_readfirstpage
     913             :      * will likely release the parallel scan later on.
     914             :      */
     915    16459546 :     if (scan->parallel_scan != NULL &&
     916         446 :         !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
     917         294 :         return false;
     918             : 
     919             :     /*
     920             :      * Initialize the scan's arrays (if any) for the current scan direction
     921             :      * (except when they were already set to later values as part of
     922             :      * scheduling the primitive index scan that is now underway)
     923             :      */
     924    16459252 :     if (so->numArrayKeys && !so->needPrimScan)
     925       71330 :         _bt_start_array_keys(scan, dir);
     926             : 
     927    16459252 :     if (blkno != InvalidBlockNumber)
     928             :     {
     929             :         /*
     930             :          * We anticipated calling _bt_search, but another worker bet us to it.
     931             :          * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
     932             :          */
     933             :         Assert(scan->parallel_scan != NULL);
     934             :         Assert(!so->needPrimScan);
     935             :         Assert(blkno != P_NONE);
     936             : 
     937          28 :         if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
     938           0 :             return false;
     939             : 
     940          28 :         _bt_returnitem(scan, so);
     941          28 :         return true;
     942             :     }
     943             : 
     944             :     /*
     945             :      * Count an indexscan for stats, now that we know that we'll call
     946             :      * _bt_search/_bt_endpoint below
     947             :      */
     948    16459224 :     pgstat_count_index_scan(rel);
     949    16459224 :     if (scan->instrument)
     950      806934 :         scan->instrument->nsearches++;
     951             : 
     952             :     /*----------
     953             :      * Examine the scan keys to discover where we need to start the scan.
     954             :      * The selected scan keys (at most one per index column) are remembered by
     955             :      * storing their addresses into the local startKeys[] array.  The final
     956             :      * startKeys[] entry's strategy is set in strat_total. (Actually, there
     957             :      * are a couple of cases where we force a less/more restrictive strategy.)
     958             :      *
     959             :      * We must use the key that was marked required (in the direction opposite
     960             :      * our own scan's) during preprocessing.  Each index attribute can only
     961             :      * have one such required key.  In general, the keys that we use to find
     962             :      * an initial position when scanning forwards are the same keys that end
     963             :      * the scan on the leaf level when scanning backwards (and vice-versa).
     964             :      *
     965             :      * When the scan keys include cross-type operators, _bt_preprocess_keys
     966             :      * may not be able to eliminate redundant keys; in such cases it will
     967             :      * arbitrarily pick a usable key for each attribute (and scan direction),
     968             :      * ensuring that there is no more than one key required in each direction.
     969             :      * We stop considering further keys once we reach the first nonrequired
     970             :      * key (which must come after all required keys), so this can't affect us.
     971             :      *
     972             :      * The required keys that we use as starting boundaries have to be =, >,
     973             :      * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
     974             :      * We can use keys for multiple attributes so long as the prior attributes
     975             :      * had only =, >= (resp. =, <=) keys.  These rules are very similar to the
     976             :      * rules that preprocessing used to determine which keys to mark required.
     977             :      * We cannot always use every required key as a positioning key, though.
     978             :      * Skip arrays necessitate independently applying our own rules here.
     979             :      * Skip arrays are always generally considered = array keys, but we'll
     980             :      * nevertheless treat them as inequalities at certain points of the scan.
     981             :      * When that happens, it _might_ have implications for the number of
     982             :      * required keys that we can safely use for initial positioning purposes.
     983             :      *
     984             :      * For example, a forward scan with a skip array on its leading attribute
     985             :      * (with no low_compare/high_compare) will have at least two required scan
     986             :      * keys, but we won't use any of them as boundary keys during the scan's
     987             :      * initial call here.  Our positioning key during the first call here can
     988             :      * be thought of as representing "> -infinity".  Similarly, if such a skip
     989             :      * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
     990             :      * during the scan's initial call here; a lower-order key such as "b = 42"
     991             :      * can't be used until the "a" array advances beyond MINVAL/low_compare.
     992             :      *
     993             :      * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
     994             :      * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
     995             :      * A subsequent call here might have us use "a = 'fop' AND b = 42".  Note
     996             :      * that we treat = and >= as equivalent when scanning forwards (just as we
     997             :      * treat = and <= as equivalent when scanning backwards).  We effectively
     998             :      * do the same thing (though with a distinct "a" element/value) each time.
     999             :      *
    1000             :      * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
    1001             :      * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
    1002             :      * If the index stores nulls at the end of the index we'll be starting
    1003             :      * from, and we have no boundary key for the column (which means the key
    1004             :      * we deduced NOT NULL from is an inequality key that constrains the other
    1005             :      * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
    1006             :      * use as a boundary key.  If we didn't do this, we might find ourselves
    1007             :      * traversing a lot of null entries at the start of the scan.
    1008             :      *
    1009             :      * In this loop, row-comparison keys are treated the same as keys on their
    1010             :      * first (leftmost) columns.  We'll add all lower-order columns of the row
    1011             :      * comparison that were marked required during preprocessing below.
    1012             :      *
    1013             :      * _bt_advance_array_keys needs to know exactly how we'll reposition the
    1014             :      * scan (should it opt to schedule another primitive index scan).  It is
    1015             :      * critical that primscans only be scheduled when they'll definitely make
    1016             :      * some useful progress.  _bt_advance_array_keys does this by calling
    1017             :      * _bt_checkkeys routines that report whether a tuple is past the end of
    1018             :      * matches for the scan's keys (given the scan's current array elements).
    1019             :      * If the page's final tuple is "after the end of matches" for a scan that
    1020             :      * uses the *opposite* scan direction, then it must follow that it's also
    1021             :      * "before the start of matches" for the actual current scan direction.
    1022             :      * It is therefore essential that all of our initial positioning rules are
    1023             :      * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
    1024             :      * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
    1025             :      * need to be kept in sync.
    1026             :      *----------
    1027             :      */
    1028    16459224 :     if (so->numberOfKeys > 0)
    1029             :     {
    1030             :         AttrNumber  curattr;
    1031             :         ScanKey     bkey;
    1032             :         ScanKey     impliesNN;
    1033             :         ScanKey     cur;
    1034             : 
    1035             :         /*
    1036             :          * bkey will be set to the key that preprocessing left behind as the
    1037             :          * boundary key for this attribute, in this scan direction (if any)
    1038             :          */
    1039    16445730 :         cur = so->keyData;
    1040    16445730 :         curattr = 1;
    1041    16445730 :         bkey = NULL;
    1042             :         /* Also remember any scankey that implies a NOT NULL constraint */
    1043    16445730 :         impliesNN = NULL;
    1044             : 
    1045             :         /*
    1046             :          * Loop iterates from 0 to numberOfKeys inclusive; we use the last
    1047             :          * pass to handle after-last-key processing.  Actual exit from the
    1048             :          * loop is at one of the "break" statements below.
    1049             :          */
    1050    42522676 :         for (int i = 0;; cur++, i++)
    1051             :         {
    1052    42522676 :             if (i >= so->numberOfKeys || cur->sk_attno != curattr)
    1053             :             {
    1054             :                 /* Done looking for the curattr boundary key */
    1055             :                 Assert(bkey == NULL ||
    1056             :                        (bkey->sk_attno == curattr &&
    1057             :                         (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
    1058             :                 Assert(impliesNN == NULL ||
    1059             :                        (impliesNN->sk_attno == curattr &&
    1060             :                         (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
    1061             : 
    1062             :                 /*
    1063             :                  * If this is a scan key for a skip array whose current
    1064             :                  * element is MINVAL, choose low_compare (when scanning
    1065             :                  * backwards it'll be MAXVAL, and we'll choose high_compare).
    1066             :                  *
    1067             :                  * Note: if the array's low_compare key makes 'bkey' NULL,
    1068             :                  * then we behave as if the array's first element is -inf,
    1069             :                  * except when !array->null_elem implies a usable NOT NULL
    1070             :                  * constraint.
    1071             :                  */
    1072    26075126 :                 if (bkey != NULL &&
    1073    26001958 :                     (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
    1074             :                 {
    1075        3730 :                     int         ikey = bkey - so->keyData;
    1076        3730 :                     ScanKey     skipequalitykey = bkey;
    1077        3730 :                     BTArrayKeyInfo *array = NULL;
    1078             : 
    1079        3836 :                     for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
    1080             :                     {
    1081        3836 :                         array = &so->arrayKeys[arridx];
    1082        3836 :                         if (array->scan_key == ikey)
    1083        3730 :                             break;
    1084             :                     }
    1085             : 
    1086        3730 :                     if (ScanDirectionIsForward(dir))
    1087             :                     {
    1088             :                         Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
    1089        3712 :                         bkey = array->low_compare;
    1090             :                     }
    1091             :                     else
    1092             :                     {
    1093             :                         Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
    1094          18 :                         bkey = array->high_compare;
    1095             :                     }
    1096             : 
    1097             :                     Assert(bkey == NULL ||
    1098             :                            bkey->sk_attno == skipequalitykey->sk_attno);
    1099             : 
    1100        3730 :                     if (!array->null_elem)
    1101         136 :                         impliesNN = skipequalitykey;
    1102             :                     else
    1103             :                         Assert(bkey == NULL && impliesNN == NULL);
    1104             :                 }
    1105             : 
    1106             :                 /*
    1107             :                  * If we didn't find a usable boundary key, see if we can
    1108             :                  * deduce a NOT NULL key
    1109             :                  */
    1110    26148354 :                 if (bkey == NULL && impliesNN != NULL &&
    1111       73228 :                     ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1112             :                      ScanDirectionIsForward(dir) :
    1113             :                      ScanDirectionIsBackward(dir)))
    1114             :                 {
    1115             :                     /* Final startKeys[] entry will be deduced NOT NULL key */
    1116          30 :                     bkey = &notnullkey;
    1117          30 :                     ScanKeyEntryInitialize(bkey,
    1118             :                                            (SK_SEARCHNOTNULL | SK_ISNULL |
    1119          30 :                                             (impliesNN->sk_flags &
    1120             :                                              (SK_BT_DESC | SK_BT_NULLS_FIRST))),
    1121             :                                            curattr,
    1122             :                                            ScanDirectionIsForward(dir) ?
    1123             :                                            BTGreaterStrategyNumber : BTLessStrategyNumber,
    1124             :                                            InvalidOid,
    1125             :                                            InvalidOid,
    1126             :                                            InvalidOid,
    1127             :                                            (Datum) 0);
    1128             :                 }
    1129             : 
    1130             :                 /*
    1131             :                  * If preprocessing didn't leave a usable boundary key, quit;
    1132             :                  * else save the boundary key pointer in startKeys[]
    1133             :                  */
    1134    26075126 :                 if (bkey == NULL)
    1135       76792 :                     break;
    1136    25998334 :                 startKeys[keysz++] = bkey;
    1137             : 
    1138             :                 /*
    1139             :                  * We can only consider adding more boundary keys when the one
    1140             :                  * that we just chose to add uses either the = or >= strategy
    1141             :                  * (during backwards scans we can only do so when the key that
    1142             :                  * we just added to startKeys[] uses the = or <= strategy)
    1143             :                  */
    1144    25998334 :                 strat_total = bkey->sk_strategy;
    1145    25998334 :                 if (strat_total == BTGreaterStrategyNumber ||
    1146             :                     strat_total == BTLessStrategyNumber)
    1147             :                     break;
    1148             : 
    1149             :                 /*
    1150             :                  * If the key that we just added to startKeys[] is a skip
    1151             :                  * array = key whose current element is marked NEXT or PRIOR,
    1152             :                  * make strat_total > or < (and stop adding boundary keys).
    1153             :                  * This can only happen with opclasses that lack skip support.
    1154             :                  */
    1155    24268384 :                 if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
    1156             :                 {
    1157             :                     Assert(bkey->sk_flags & SK_BT_SKIP);
    1158             :                     Assert(strat_total == BTEqualStrategyNumber);
    1159             : 
    1160          12 :                     if (ScanDirectionIsForward(dir))
    1161             :                     {
    1162             :                         Assert(!(bkey->sk_flags & SK_BT_PRIOR));
    1163           6 :                         strat_total = BTGreaterStrategyNumber;
    1164             :                     }
    1165             :                     else
    1166             :                     {
    1167             :                         Assert(!(bkey->sk_flags & SK_BT_NEXT));
    1168           6 :                         strat_total = BTLessStrategyNumber;
    1169             :                     }
    1170             : 
    1171             :                     /*
    1172             :                      * We're done.  We'll never find an exact = match for a
    1173             :                      * NEXT or PRIOR sentinel sk_argument value.  There's no
    1174             :                      * sense in trying to add more keys to startKeys[].
    1175             :                      */
    1176          12 :                     break;
    1177             :                 }
    1178             : 
    1179             :                 /*
    1180             :                  * Done if that was the last scan key output by preprocessing.
    1181             :                  * Also done if we've now examined all keys marked required.
    1182             :                  */
    1183    24268372 :                 if (i >= so->numberOfKeys ||
    1184     9629402 :                     !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
    1185             :                     break;
    1186             : 
    1187             :                 /*
    1188             :                  * Reset for next attr.
    1189             :                  */
    1190             :                 Assert(cur->sk_attno == curattr + 1);
    1191     9629396 :                 curattr = cur->sk_attno;
    1192     9629396 :                 bkey = NULL;
    1193     9629396 :                 impliesNN = NULL;
    1194             :             }
    1195             : 
    1196             :             /*
    1197             :              * If we've located the starting boundary key for curattr, we have
    1198             :              * no interest in curattr's other required key
    1199             :              */
    1200    26076946 :             if (bkey != NULL)
    1201        1796 :                 continue;
    1202             : 
    1203             :             /*
    1204             :              * Is this key the starting boundary key for curattr?
    1205             :              *
    1206             :              * If not, does it imply a NOT NULL constraint?  (Because
    1207             :              * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
    1208             :              * *any* inequality key works for that; we need not test.)
    1209             :              */
    1210    26075150 :             switch (cur->sk_strategy)
    1211             :             {
    1212      129586 :                 case BTLessStrategyNumber:
    1213             :                 case BTLessEqualStrategyNumber:
    1214      129586 :                     if (ScanDirectionIsBackward(dir))
    1215       56436 :                         bkey = cur;
    1216       73150 :                     else if (impliesNN == NULL)
    1217       73150 :                         impliesNN = cur;
    1218      129586 :                     break;
    1219    24261926 :                 case BTEqualStrategyNumber:
    1220    24261926 :                     bkey = cur;
    1221    24261926 :                     break;
    1222     1683638 :                 case BTGreaterEqualStrategyNumber:
    1223             :                 case BTGreaterStrategyNumber:
    1224     1683638 :                     if (ScanDirectionIsForward(dir))
    1225     1683596 :                         bkey = cur;
    1226          42 :                     else if (impliesNN == NULL)
    1227          42 :                         impliesNN = cur;
    1228     1683638 :                     break;
    1229             :             }
    1230             :         }
    1231             :     }
    1232             : 
    1233             :     /*
    1234             :      * If we found no usable boundary keys, we have to start from one end of
    1235             :      * the tree.  Walk down that edge to the first or last key, and scan from
    1236             :      * there.
    1237             :      *
    1238             :      * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
    1239             :      */
    1240    16459224 :     if (keysz == 0)
    1241       89552 :         return _bt_endpoint(scan, dir);
    1242             : 
    1243             :     /*
    1244             :      * We want to start the scan somewhere within the index.  Set up an
    1245             :      * insertion scankey we can use to search for the boundary point we
    1246             :      * identified above.  The insertion scankey is built using the keys
    1247             :      * identified by startKeys[].  (Remaining insertion scankey fields are
    1248             :      * initialized after initial-positioning scan keys are finalized.)
    1249             :      */
    1250             :     Assert(keysz <= INDEX_MAX_KEYS);
    1251    42367958 :     for (int i = 0; i < keysz; i++)
    1252             :     {
    1253    25998334 :         ScanKey     bkey = startKeys[i];
    1254             : 
    1255             :         Assert(bkey->sk_attno == i + 1);
    1256             : 
    1257    25998334 :         if (bkey->sk_flags & SK_ROW_HEADER)
    1258             :         {
    1259             :             /*
    1260             :              * Row comparison header: look to the first row member instead
    1261             :              */
    1262          48 :             ScanKey     subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
    1263          48 :             bool        loosen_strat = false,
    1264          48 :                         tighten_strat = false;
    1265             : 
    1266             :             /*
    1267             :              * Cannot be a NULL in the first row member: _bt_preprocess_keys
    1268             :              * would've marked the qual as unsatisfiable, preventing us from
    1269             :              * ever getting this far
    1270             :              */
    1271             :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1272             :             Assert(subkey->sk_attno == bkey->sk_attno);
    1273             :             Assert(!(subkey->sk_flags & SK_ISNULL));
    1274             : 
    1275             :             /*
    1276             :              * This is either a > or >= key (during backwards scans it is
    1277             :              * either < or <=) that was marked required during preprocessing.
    1278             :              * Later so->keyData[] keys can't have been marked required, so
    1279             :              * our row compare header key must be the final startKeys[] entry.
    1280             :              */
    1281             :             Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
    1282             :             Assert(subkey->sk_strategy == bkey->sk_strategy);
    1283             :             Assert(subkey->sk_strategy == strat_total);
    1284             :             Assert(i == keysz - 1);
    1285             : 
    1286             :             /*
    1287             :              * The member scankeys are already in insertion format (ie, they
    1288             :              * have sk_func = 3-way-comparison function)
    1289             :              */
    1290          48 :             memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
    1291             : 
    1292             :             /*
    1293             :              * Now look to later row compare members.
    1294             :              *
    1295             :              * If there's an "index attribute gap" between two row compare
    1296             :              * members, the second member won't have been marked required, and
    1297             :              * so can't be used as a starting boundary key here.  The part of
    1298             :              * the row comparison that we do still use has to be treated as a
    1299             :              * ">=" or "<=" condition.  For example, a qual "(a, c) > (1, 42)"
    1300             :              * with an omitted intervening index attribute "b" will use an
    1301             :              * insertion scan key "a >= 1".  Even the first "a = 1" tuple on
    1302             :              * the leaf level might satisfy the row compare qual.
    1303             :              *
    1304             :              * We're able to use a _more_ restrictive strategy when we reach a
    1305             :              * NULL row compare member, since they're always unsatisfiable.
    1306             :              * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
    1307             :              * insertion scan key "a > 1".  All tuples where "a = 1" cannot
    1308             :              * possibly satisfy the row compare qual, so this is safe.
    1309             :              */
    1310             :             Assert(!(subkey->sk_flags & SK_ROW_END));
    1311             :             for (;;)
    1312             :             {
    1313          48 :                 subkey++;
    1314             :                 Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1315             : 
    1316          48 :                 if (subkey->sk_flags & SK_ISNULL)
    1317             :                 {
    1318             :                     /*
    1319             :                      * NULL member key, can only use earlier keys.
    1320             :                      *
    1321             :                      * We deliberately avoid checking if this key is marked
    1322             :                      * required.  All earlier keys are required, and this key
    1323             :                      * is unsatisfiable either way, so we can't miss anything.
    1324             :                      */
    1325          12 :                     tighten_strat = true;
    1326          12 :                     break;
    1327             :                 }
    1328             : 
    1329          36 :                 if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
    1330             :                 {
    1331             :                     /* nonrequired member key, can only use earlier keys */
    1332          12 :                     loosen_strat = true;
    1333          12 :                     break;
    1334             :                 }
    1335             : 
    1336             :                 Assert(subkey->sk_attno == keysz + 1);
    1337             :                 Assert(subkey->sk_strategy == bkey->sk_strategy);
    1338             :                 Assert(keysz < INDEX_MAX_KEYS);
    1339             : 
    1340          24 :                 memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
    1341          24 :                 keysz++;
    1342             : 
    1343          24 :                 if (subkey->sk_flags & SK_ROW_END)
    1344          24 :                     break;
    1345             :             }
    1346             :             Assert(!(loosen_strat && tighten_strat));
    1347          48 :             if (loosen_strat)
    1348             :             {
    1349             :                 /* Use less restrictive strategy (and fewer member keys) */
    1350          12 :                 switch (strat_total)
    1351             :                 {
    1352           6 :                     case BTLessStrategyNumber:
    1353           6 :                         strat_total = BTLessEqualStrategyNumber;
    1354           6 :                         break;
    1355           6 :                     case BTGreaterStrategyNumber:
    1356           6 :                         strat_total = BTGreaterEqualStrategyNumber;
    1357           6 :                         break;
    1358             :                 }
    1359             :             }
    1360          48 :             if (tighten_strat)
    1361             :             {
    1362             :                 /* Use more restrictive strategy (and fewer member keys) */
    1363          12 :                 switch (strat_total)
    1364             :                 {
    1365           6 :                     case BTLessEqualStrategyNumber:
    1366           6 :                         strat_total = BTLessStrategyNumber;
    1367           6 :                         break;
    1368           6 :                     case BTGreaterEqualStrategyNumber:
    1369           6 :                         strat_total = BTGreaterStrategyNumber;
    1370           6 :                         break;
    1371             :                 }
    1372             :             }
    1373             : 
    1374             :             /* Done (row compare header key is always last startKeys[] key) */
    1375          48 :             break;
    1376             :         }
    1377             : 
    1378             :         /*
    1379             :          * Ordinary comparison key/search-style key.
    1380             :          *
    1381             :          * Transform the search-style scan key to an insertion scan key by
    1382             :          * replacing the sk_func with the appropriate btree 3-way-comparison
    1383             :          * function.
    1384             :          *
    1385             :          * If scankey operator is not a cross-type comparison, we can use the
    1386             :          * cached comparison function; otherwise gotta look it up in the
    1387             :          * catalogs.  (That can't lead to infinite recursion, since no
    1388             :          * indexscan initiated by syscache lookup will use cross-data-type
    1389             :          * operators.)
    1390             :          *
    1391             :          * We support the convention that sk_subtype == InvalidOid means the
    1392             :          * opclass input type; this hack simplifies life for ScanKeyInit().
    1393             :          */
    1394    25998286 :         if (bkey->sk_subtype == rel->rd_opcintype[i] ||
    1395    25144290 :             bkey->sk_subtype == InvalidOid)
    1396    25987666 :         {
    1397             :             FmgrInfo   *procinfo;
    1398             : 
    1399    25987666 :             procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
    1400    25987666 :             ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
    1401             :                                            bkey->sk_flags,
    1402    25987666 :                                            bkey->sk_attno,
    1403             :                                            InvalidStrategy,
    1404             :                                            bkey->sk_subtype,
    1405             :                                            bkey->sk_collation,
    1406             :                                            procinfo,
    1407             :                                            bkey->sk_argument);
    1408             :         }
    1409             :         else
    1410             :         {
    1411             :             RegProcedure cmp_proc;
    1412             : 
    1413       10620 :             cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
    1414       10620 :                                          rel->rd_opcintype[i],
    1415             :                                          bkey->sk_subtype, BTORDER_PROC);
    1416       10620 :             if (!RegProcedureIsValid(cmp_proc))
    1417           0 :                 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
    1418             :                      BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
    1419             :                      bkey->sk_attno, RelationGetRelationName(rel));
    1420       10620 :             ScanKeyEntryInitialize(inskey.scankeys + i,
    1421             :                                    bkey->sk_flags,
    1422       10620 :                                    bkey->sk_attno,
    1423             :                                    InvalidStrategy,
    1424             :                                    bkey->sk_subtype,
    1425             :                                    bkey->sk_collation,
    1426             :                                    cmp_proc,
    1427             :                                    bkey->sk_argument);
    1428             :         }
    1429             :     }
    1430             : 
    1431             :     /*----------
    1432             :      * Examine the selected initial-positioning strategy to determine exactly
    1433             :      * where we need to start the scan, and set flag variables to control the
    1434             :      * initial descent by _bt_search (and our _bt_binsrch call for the leaf
    1435             :      * page _bt_search returns).
    1436             :      *----------
    1437             :      */
    1438    16369672 :     _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
    1439    16369672 :     inskey.anynullkeys = false; /* unused */
    1440    16369672 :     inskey.scantid = NULL;
    1441    16369672 :     inskey.keysz = keysz;
    1442    16369672 :     switch (strat_total)
    1443             :     {
    1444       56442 :         case BTLessStrategyNumber:
    1445             : 
    1446       56442 :             inskey.nextkey = false;
    1447       56442 :             inskey.backward = true;
    1448       56442 :             break;
    1449             : 
    1450          18 :         case BTLessEqualStrategyNumber:
    1451             : 
    1452          18 :             inskey.nextkey = true;
    1453          18 :             inskey.backward = true;
    1454          18 :             break;
    1455             : 
    1456    14629592 :         case BTEqualStrategyNumber:
    1457             : 
    1458             :             /*
    1459             :              * If a backward scan was specified, need to start with last equal
    1460             :              * item not first one.
    1461             :              */
    1462    14629592 :             if (ScanDirectionIsBackward(dir))
    1463             :             {
    1464             :                 /*
    1465             :                  * This is the same as the <= strategy
    1466             :                  */
    1467         200 :                 inskey.nextkey = true;
    1468         200 :                 inskey.backward = true;
    1469             :             }
    1470             :             else
    1471             :             {
    1472             :                 /*
    1473             :                  * This is the same as the >= strategy
    1474             :                  */
    1475    14629392 :                 inskey.nextkey = false;
    1476    14629392 :                 inskey.backward = false;
    1477             :             }
    1478    14629592 :             break;
    1479             : 
    1480       10100 :         case BTGreaterEqualStrategyNumber:
    1481             : 
    1482             :             /*
    1483             :              * Find first item >= scankey
    1484             :              */
    1485       10100 :             inskey.nextkey = false;
    1486       10100 :             inskey.backward = false;
    1487       10100 :             break;
    1488             : 
    1489     1673520 :         case BTGreaterStrategyNumber:
    1490             : 
    1491             :             /*
    1492             :              * Find first item > scankey
    1493             :              */
    1494     1673520 :             inskey.nextkey = true;
    1495     1673520 :             inskey.backward = false;
    1496     1673520 :             break;
    1497             : 
    1498           0 :         default:
    1499             :             /* can't get here, but keep compiler quiet */
    1500           0 :             elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
    1501             :             return false;
    1502             :     }
    1503             : 
    1504             :     /*
    1505             :      * Use the manufactured insertion scan key to descend the tree and
    1506             :      * position ourselves on the target leaf page.
    1507             :      */
    1508             :     Assert(ScanDirectionIsBackward(dir) == inskey.backward);
    1509    16369672 :     stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
    1510             : 
    1511             :     /* don't need to keep the stack around... */
    1512    16369672 :     _bt_freestack(stack);
    1513             : 
    1514    16369672 :     if (!BufferIsValid(so->currPos.buf))
    1515             :     {
    1516             :         Assert(!so->needPrimScan);
    1517             : 
    1518             :         /*
    1519             :          * We only get here if the index is completely empty. Lock relation
    1520             :          * because nothing finer to lock exists.  Without a buffer lock, it's
    1521             :          * possible for another transaction to insert data between
    1522             :          * _bt_search() and PredicateLockRelation().  We have to try again
    1523             :          * after taking the relation-level predicate lock, to close a narrow
    1524             :          * window where we wouldn't scan concurrently inserted tuples, but the
    1525             :          * writer wouldn't see our predicate lock.
    1526             :          */
    1527      577938 :         if (IsolationIsSerializable())
    1528             :         {
    1529        5662 :             PredicateLockRelation(rel, scan->xs_snapshot);
    1530        5662 :             stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
    1531        5662 :             _bt_freestack(stack);
    1532             :         }
    1533             : 
    1534      577938 :         if (!BufferIsValid(so->currPos.buf))
    1535             :         {
    1536      577938 :             _bt_parallel_done(scan);
    1537      577938 :             return false;
    1538             :         }
    1539             :     }
    1540             : 
    1541             :     /* position to the precise item on the page */
    1542    15791734 :     offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
    1543             : 
    1544             :     /*
    1545             :      * Now load data from the first page of the scan (usually the page
    1546             :      * currently in so->currPos.buf).
    1547             :      *
    1548             :      * If inskey.nextkey = false and inskey.backward = false, offnum is
    1549             :      * positioned at the first non-pivot tuple >= inskey.scankeys.
    1550             :      *
    1551             :      * If inskey.nextkey = false and inskey.backward = true, offnum is
    1552             :      * positioned at the last non-pivot tuple < inskey.scankeys.
    1553             :      *
    1554             :      * If inskey.nextkey = true and inskey.backward = false, offnum is
    1555             :      * positioned at the first non-pivot tuple > inskey.scankeys.
    1556             :      *
    1557             :      * If inskey.nextkey = true and inskey.backward = true, offnum is
    1558             :      * positioned at the last non-pivot tuple <= inskey.scankeys.
    1559             :      *
    1560             :      * It's possible that _bt_binsrch returned an offnum that is out of bounds
    1561             :      * for the page.  For example, when inskey is both < the leaf page's high
    1562             :      * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
    1563             :      */
    1564    15791734 :     if (!_bt_readfirstpage(scan, offnum, dir))
    1565     4071354 :         return false;
    1566             : 
    1567    11720380 :     _bt_returnitem(scan, so);
    1568    11720380 :     return true;
    1569             : }
    1570             : 
    1571             : /*
    1572             :  *  _bt_next() -- Get the next item in a scan.
    1573             :  *
    1574             :  *      On entry, so->currPos describes the current page, which may be pinned
    1575             :  *      but is not locked, and so->currPos.itemIndex identifies which item was
    1576             :  *      previously returned.
    1577             :  *
    1578             :  *      On success exit, so->currPos is updated as needed, and _bt_returnitem
    1579             :  *      sets the next item to return to the scan.  so->currPos remains valid.
    1580             :  *
    1581             :  *      On failure exit (no more tuples), we invalidate so->currPos.  It'll
    1582             :  *      still be possible for the scan to return tuples by changing direction,
    1583             :  *      though we'll need to call _bt_first anew in that other direction.
    1584             :  */
    1585             : bool
    1586    20086418 : _bt_next(IndexScanDesc scan, ScanDirection dir)
    1587             : {
    1588    20086418 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1589             : 
    1590             :     Assert(BTScanPosIsValid(so->currPos));
    1591             : 
    1592             :     /*
    1593             :      * Advance to next tuple on current page; or if there's no more, try to
    1594             :      * step to the next page with data.
    1595             :      */
    1596    20086418 :     if (ScanDirectionIsForward(dir))
    1597             :     {
    1598    20044118 :         if (++so->currPos.itemIndex > so->currPos.lastItem)
    1599             :         {
    1600     2654776 :             if (!_bt_steppage(scan, dir))
    1601     2624014 :                 return false;
    1602             :         }
    1603             :     }
    1604             :     else
    1605             :     {
    1606       42300 :         if (--so->currPos.itemIndex < so->currPos.firstItem)
    1607             :         {
    1608         116 :             if (!_bt_steppage(scan, dir))
    1609          92 :                 return false;
    1610             :         }
    1611             :     }
    1612             : 
    1613    17462308 :     _bt_returnitem(scan, so);
    1614    17462308 :     return true;
    1615             : }
    1616             : 
    1617             : /*
    1618             :  * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
    1619             :  * index scan by setting the relevant fields in caller's index scan descriptor
    1620             :  */
    1621             : static inline void
    1622    29262774 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
    1623             : {
    1624    29262774 :     BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
    1625             : 
    1626             :     /* Most recent _bt_readpage must have succeeded */
    1627             :     Assert(BTScanPosIsValid(so->currPos));
    1628             :     Assert(so->currPos.itemIndex >= so->currPos.firstItem);
    1629             :     Assert(so->currPos.itemIndex <= so->currPos.lastItem);
    1630             : 
    1631             :     /* Return next item, per amgettuple contract */
    1632    29262774 :     scan->xs_heaptid = currItem->heapTid;
    1633    29262774 :     if (so->currTuples)
    1634     4273884 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1635    29262774 : }
    1636             : 
    1637             : /*
    1638             :  *  _bt_steppage() -- Step to next page containing valid data for scan
    1639             :  *
    1640             :  * Wrapper on _bt_readnextpage that performs final steps for the current page.
    1641             :  *
    1642             :  * On entry, so->currPos must be valid.  Its buffer will be pinned, though
    1643             :  * never locked. (Actually, when so->dropPin there won't even be a pin held,
    1644             :  * though so->currPos.currPage must still be set to a valid block number.)
    1645             :  */
    1646             : static bool
    1647     6728178 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
    1648             : {
    1649     6728178 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1650             :     BlockNumber blkno,
    1651             :                 lastcurrblkno;
    1652             : 
    1653             :     Assert(BTScanPosIsValid(so->currPos));
    1654             : 
    1655             :     /* Before leaving current page, deal with any killed items */
    1656     6728178 :     if (so->numKilled > 0)
    1657       84220 :         _bt_killitems(scan);
    1658             : 
    1659             :     /*
    1660             :      * Before we modify currPos, make a copy of the page data if there was a
    1661             :      * mark position that needs it.
    1662             :      */
    1663     6728178 :     if (so->markItemIndex >= 0)
    1664             :     {
    1665             :         /* bump pin on current buffer for assignment to mark buffer */
    1666         374 :         if (BTScanPosIsPinned(so->currPos))
    1667         348 :             IncrBufferRefCount(so->currPos.buf);
    1668         374 :         memcpy(&so->markPos, &so->currPos,
    1669             :                offsetof(BTScanPosData, items[1]) +
    1670         374 :                so->currPos.lastItem * sizeof(BTScanPosItem));
    1671         374 :         if (so->markTuples)
    1672         348 :             memcpy(so->markTuples, so->currTuples,
    1673         348 :                    so->currPos.nextTupleOffset);
    1674         374 :         so->markPos.itemIndex = so->markItemIndex;
    1675         374 :         so->markItemIndex = -1;
    1676             : 
    1677             :         /*
    1678             :          * If we're just about to start the next primitive index scan
    1679             :          * (possible with a scan that has arrays keys, and needs to skip to
    1680             :          * continue in the current scan direction), moreLeft/moreRight only
    1681             :          * indicate the end of the current primitive index scan.  They must
    1682             :          * never be taken to indicate that the top-level index scan has ended
    1683             :          * (that would be wrong).
    1684             :          *
    1685             :          * We could handle this case by treating the current array keys as
    1686             :          * markPos state.  But depending on the current array state like this
    1687             :          * would add complexity.  Instead, we just unset markPos's copy of
    1688             :          * moreRight or moreLeft (whichever might be affected), while making
    1689             :          * btrestrpos reset the scan's arrays to their initial scan positions.
    1690             :          * In effect, btrestrpos leaves advancing the arrays up to the first
    1691             :          * _bt_readpage call (that takes place after it has restored markPos).
    1692             :          */
    1693         374 :         if (so->needPrimScan)
    1694             :         {
    1695           0 :             if (ScanDirectionIsForward(so->currPos.dir))
    1696           0 :                 so->markPos.moreRight = true;
    1697             :             else
    1698           0 :                 so->markPos.moreLeft = true;
    1699             :         }
    1700             : 
    1701             :         /* mark/restore not supported by parallel scans */
    1702             :         Assert(!scan->parallel_scan);
    1703             :     }
    1704             : 
    1705     6728178 :     BTScanPosUnpinIfPinned(so->currPos);
    1706             : 
    1707             :     /* Walk to the next page with data */
    1708     6728178 :     if (ScanDirectionIsForward(dir))
    1709     6727970 :         blkno = so->currPos.nextPage;
    1710             :     else
    1711         208 :         blkno = so->currPos.prevPage;
    1712     6728178 :     lastcurrblkno = so->currPos.currPage;
    1713             : 
    1714             :     /*
    1715             :      * Cancel primitive index scans that were scheduled when the call to
    1716             :      * _bt_readpage for currPos happened to use the opposite direction to the
    1717             :      * one that we're stepping in now.  (It's okay to leave the scan's array
    1718             :      * keys as-is, since the next _bt_readpage will advance them.)
    1719             :      */
    1720     6728178 :     if (so->currPos.dir != dir)
    1721          36 :         so->needPrimScan = false;
    1722             : 
    1723     6728178 :     return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
    1724             : }
    1725             : 
    1726             : /*
    1727             :  *  _bt_readfirstpage() -- Read first page containing valid data for _bt_first
    1728             :  *
    1729             :  * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
    1730             :  * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
    1731             :  * When we're passed an offnum past the end of the page, we might still manage
    1732             :  * to stop the scan on this page by calling _bt_checkkeys against the high
    1733             :  * key.  See _bt_readpage for full details.
    1734             :  *
    1735             :  * On entry, so->currPos must be pinned and locked (so offnum stays valid).
    1736             :  * Parallel scan callers must have seized the scan before calling here.
    1737             :  *
    1738             :  * On exit, we'll have updated so->currPos and retained locks and pins
    1739             :  * according to the same rules as those laid out for _bt_readnextpage exit.
    1740             :  * Like _bt_readnextpage, our return value indicates if there are any matching
    1741             :  * records in the given direction.
    1742             :  *
    1743             :  * We always release the scan for a parallel scan caller, regardless of
    1744             :  * success or failure; we'll call _bt_parallel_release as soon as possible.
    1745             :  */
    1746             : static bool
    1747    15873574 : _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
    1748             : {
    1749    15873574 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1750             : 
    1751    15873574 :     so->numKilled = 0;           /* just paranoia */
    1752    15873574 :     so->markItemIndex = -1;      /* ditto */
    1753             : 
    1754             :     /* Initialize so->currPos for the first page (page in so->currPos.buf) */
    1755    15873574 :     if (so->needPrimScan)
    1756             :     {
    1757             :         Assert(so->numArrayKeys);
    1758             : 
    1759       17566 :         so->currPos.moreLeft = true;
    1760       17566 :         so->currPos.moreRight = true;
    1761       17566 :         so->needPrimScan = false;
    1762             :     }
    1763    15856008 :     else if (ScanDirectionIsForward(dir))
    1764             :     {
    1765    15799318 :         so->currPos.moreLeft = false;
    1766    15799318 :         so->currPos.moreRight = true;
    1767             :     }
    1768             :     else
    1769             :     {
    1770       56690 :         so->currPos.moreLeft = true;
    1771       56690 :         so->currPos.moreRight = false;
    1772             :     }
    1773             : 
    1774             :     /*
    1775             :      * Attempt to load matching tuples from the first page.
    1776             :      *
    1777             :      * Note that _bt_readpage will finish initializing the so->currPos fields.
    1778             :      * _bt_readpage also releases parallel scan (even when it returns false).
    1779             :      */
    1780    15873574 :     if (_bt_readpage(scan, dir, offnum, true))
    1781             :     {
    1782    11800288 :         Relation    rel = scan->indexRelation;
    1783             : 
    1784             :         /*
    1785             :          * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
    1786             :          * so->currPos.buf in preparation for btgettuple returning tuples.
    1787             :          */
    1788             :         Assert(BTScanPosIsPinned(so->currPos));
    1789    11800288 :         _bt_drop_lock_and_maybe_pin(rel, so);
    1790    11800288 :         return true;
    1791             :     }
    1792             : 
    1793             :     /* There's no actually-matching data on the page in so->currPos.buf */
    1794     4073286 :     _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
    1795             : 
    1796             :     /* Call _bt_readnextpage using its _bt_steppage wrapper function */
    1797     4073286 :     if (!_bt_steppage(scan, dir))
    1798     4073136 :         return false;
    1799             : 
    1800             :     /* _bt_readpage for a later page (now in so->currPos) succeeded */
    1801         150 :     return true;
    1802             : }
    1803             : 
    1804             : /*
    1805             :  *  _bt_readnextpage() -- Read next page containing valid data for _bt_next
    1806             :  *
    1807             :  * Caller's blkno is the next interesting page's link, taken from either the
    1808             :  * previously-saved right link or left link.  lastcurrblkno is the page that
    1809             :  * was current at the point where the blkno link was saved, which we use to
    1810             :  * reason about concurrent page splits/page deletions during backwards scans.
    1811             :  * In the common case where seized=false, blkno is either so->currPos.nextPage
    1812             :  * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
    1813             :  *
    1814             :  * On entry, so->currPos shouldn't be locked by caller.  so->currPos.buf must
    1815             :  * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
    1816             :  * won't need to be read again in almost all cases).  Parallel scan callers
    1817             :  * that seized the scan before calling here should pass seized=true; such a
    1818             :  * caller's blkno and lastcurrblkno arguments come from the seized scan.
    1819             :  * seized=false callers just pass us the blkno/lastcurrblkno taken from their
    1820             :  * so->currPos, which (along with so->currPos itself) can be used to end the
    1821             :  * scan.  A seized=false caller's blkno can never be assumed to be the page
    1822             :  * that must be read next during a parallel scan, though.  We must figure that
    1823             :  * part out for ourselves by seizing the scan (the correct page to read might
    1824             :  * already be beyond the seized=false caller's blkno during a parallel scan,
    1825             :  * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
    1826             :  * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
    1827             :  *
    1828             :  * On success exit, so->currPos is updated to contain data from the next
    1829             :  * interesting page, and we return true.  We hold a pin on the buffer on
    1830             :  * success exit (except during so->dropPin index scans, when we drop the pin
    1831             :  * eagerly to avoid blocking VACUUM).
    1832             :  *
    1833             :  * If there are no more matching records in the given direction, we invalidate
    1834             :  * so->currPos (while ensuring it retains no locks or pins), and return false.
    1835             :  *
    1836             :  * We always release the scan for a parallel scan caller, regardless of
    1837             :  * success or failure; we'll call _bt_parallel_release as soon as possible.
    1838             :  */
    1839             : static bool
    1840     6728206 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
    1841             :                  BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
    1842             : {
    1843     6728206 :     Relation    rel = scan->indexRelation;
    1844     6728206 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1845             : 
    1846             :     Assert(so->currPos.currPage == lastcurrblkno || seized);
    1847             :     Assert(!(blkno == P_NONE && seized));
    1848             :     Assert(!BTScanPosIsPinned(so->currPos));
    1849             : 
    1850             :     /*
    1851             :      * Remember that the scan already read lastcurrblkno, a page to the left
    1852             :      * of blkno (or remember reading a page to the right, for backwards scans)
    1853             :      */
    1854     6728206 :     if (ScanDirectionIsForward(dir))
    1855     6727998 :         so->currPos.moreLeft = true;
    1856             :     else
    1857         208 :         so->currPos.moreRight = true;
    1858             : 
    1859             :     for (;;)
    1860        2430 :     {
    1861             :         Page        page;
    1862             :         BTPageOpaque opaque;
    1863             : 
    1864     6730636 :         if (blkno == P_NONE ||
    1865             :             (ScanDirectionIsForward(dir) ?
    1866     2131016 :              !so->currPos.moreRight : !so->currPos.moreLeft))
    1867             :         {
    1868             :             /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
    1869             :             Assert(so->currPos.currPage == lastcurrblkno && !seized);
    1870     6697214 :             BTScanPosInvalidate(so->currPos);
    1871     6697214 :             _bt_parallel_done(scan);    /* iff !so->needPrimScan */
    1872     6697214 :             return false;
    1873             :         }
    1874             : 
    1875             :         Assert(!so->needPrimScan);
    1876             : 
    1877             :         /* parallel scan must never actually visit so->currPos blkno */
    1878       33422 :         if (!seized && scan->parallel_scan != NULL &&
    1879        1212 :             !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
    1880             :         {
    1881             :             /* whole scan is now done (or another primitive scan required) */
    1882          28 :             BTScanPosInvalidate(so->currPos);
    1883          28 :             return false;
    1884             :         }
    1885             : 
    1886       33394 :         if (ScanDirectionIsForward(dir))
    1887             :         {
    1888             :             /* read blkno, but check for interrupts first */
    1889       33280 :             CHECK_FOR_INTERRUPTS();
    1890       33276 :             so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    1891             :         }
    1892             :         else
    1893             :         {
    1894             :             /* read blkno, avoiding race (also checks for interrupts) */
    1895         114 :             so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
    1896             :                                                          lastcurrblkno);
    1897         114 :             if (so->currPos.buf == InvalidBuffer)
    1898             :             {
    1899             :                 /* must have been a concurrent deletion of leftmost page */
    1900           0 :                 BTScanPosInvalidate(so->currPos);
    1901           0 :                 _bt_parallel_done(scan);
    1902           0 :                 return false;
    1903             :             }
    1904             :         }
    1905             : 
    1906       33390 :         page = BufferGetPage(so->currPos.buf);
    1907       33390 :         opaque = BTPageGetOpaque(page);
    1908       33390 :         lastcurrblkno = blkno;
    1909       33390 :         if (likely(!P_IGNORE(opaque)))
    1910             :         {
    1911             :             /* see if there are any matches on this page */
    1912       33390 :             if (ScanDirectionIsForward(dir))
    1913             :             {
    1914             :                 /* note that this will clear moreRight if we can stop */
    1915       33276 :                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
    1916       30868 :                     break;
    1917        2408 :                 blkno = so->currPos.nextPage;
    1918             :             }
    1919             :             else
    1920             :             {
    1921             :                 /* note that this will clear moreLeft if we can stop */
    1922         114 :                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
    1923          92 :                     break;
    1924          22 :                 blkno = so->currPos.prevPage;
    1925             :             }
    1926             :         }
    1927             :         else
    1928             :         {
    1929             :             /* _bt_readpage not called, so do all this for ourselves */
    1930           0 :             if (ScanDirectionIsForward(dir))
    1931           0 :                 blkno = opaque->btpo_next;
    1932             :             else
    1933           0 :                 blkno = opaque->btpo_prev;
    1934           0 :             if (scan->parallel_scan != NULL)
    1935           0 :                 _bt_parallel_release(scan, blkno, lastcurrblkno);
    1936             :         }
    1937             : 
    1938             :         /* no matching tuples on this page */
    1939        2430 :         _bt_relbuf(rel, so->currPos.buf);
    1940        2430 :         seized = false;         /* released by _bt_readpage (or by us) */
    1941             :     }
    1942             : 
    1943             :     /*
    1944             :      * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
    1945             :      * so->currPos.buf in preparation for btgettuple returning tuples.
    1946             :      */
    1947             :     Assert(so->currPos.currPage == blkno);
    1948             :     Assert(BTScanPosIsPinned(so->currPos));
    1949       30960 :     _bt_drop_lock_and_maybe_pin(rel, so);
    1950             : 
    1951       30960 :     return true;
    1952             : }
    1953             : 
    1954             : /*
    1955             :  * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
    1956             :  * recovering from concurrent page splits/page deletions when necessary
    1957             :  *
    1958             :  * Called during backwards scans, to deal with their unique concurrency rules.
    1959             :  *
    1960             :  * blkno points to the block number of the page that we expect to move the
    1961             :  * scan to.  We'll successfully move the scan there when we find that its
    1962             :  * right sibling link still points to lastcurrblkno (the page we just read).
    1963             :  * Otherwise, we have to figure out which page is the correct one for the scan
    1964             :  * to now read the hard way, reasoning about concurrent splits and deletions.
    1965             :  * See nbtree/README.
    1966             :  *
    1967             :  * On return, we have both a pin and a read lock on the returned page, whose
    1968             :  * block number will be set in *blkno.  Returns InvalidBuffer if there is no
    1969             :  * page to the left (no lock or pin is held in that case).
    1970             :  *
    1971             :  * It is possible for the returned leaf page to be half-dead; caller must
    1972             :  * check that condition and step left again when required.
    1973             :  */
    1974             : static Buffer
    1975         114 : _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
    1976             :                            BlockNumber lastcurrblkno)
    1977             : {
    1978         114 :     BlockNumber origblkno = *blkno; /* detects circular links */
    1979             : 
    1980             :     for (;;)
    1981           0 :     {
    1982             :         Buffer      buf;
    1983             :         Page        page;
    1984             :         BTPageOpaque opaque;
    1985             :         int         tries;
    1986             : 
    1987             :         /* check for interrupts while we're not holding any buffer lock */
    1988         114 :         CHECK_FOR_INTERRUPTS();
    1989         114 :         buf = _bt_getbuf(rel, *blkno, BT_READ);
    1990         114 :         page = BufferGetPage(buf);
    1991         114 :         opaque = BTPageGetOpaque(page);
    1992             : 
    1993             :         /*
    1994             :          * If this isn't the page we want, walk right till we find what we
    1995             :          * want --- but go no more than four hops (an arbitrary limit). If we
    1996             :          * don't find the correct page by then, the most likely bet is that
    1997             :          * lastcurrblkno got deleted and isn't in the sibling chain at all
    1998             :          * anymore, not that its left sibling got split more than four times.
    1999             :          *
    2000             :          * Note that it is correct to test P_ISDELETED not P_IGNORE here,
    2001             :          * because half-dead pages are still in the sibling chain.
    2002             :          */
    2003         114 :         tries = 0;
    2004             :         for (;;)
    2005             :         {
    2006         114 :             if (likely(!P_ISDELETED(opaque) &&
    2007             :                        opaque->btpo_next == lastcurrblkno))
    2008             :             {
    2009             :                 /* Found desired page, return it */
    2010         114 :                 return buf;
    2011             :             }
    2012           0 :             if (P_RIGHTMOST(opaque) || ++tries > 4)
    2013             :                 break;
    2014             :             /* step right */
    2015           0 :             *blkno = opaque->btpo_next;
    2016           0 :             buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
    2017           0 :             page = BufferGetPage(buf);
    2018           0 :             opaque = BTPageGetOpaque(page);
    2019             :         }
    2020             : 
    2021             :         /*
    2022             :          * Return to the original page (usually the page most recently read by
    2023             :          * _bt_readpage, which is passed by caller as lastcurrblkno) to see
    2024             :          * what's up with its prev sibling link
    2025             :          */
    2026           0 :         buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
    2027           0 :         page = BufferGetPage(buf);
    2028           0 :         opaque = BTPageGetOpaque(page);
    2029           0 :         if (P_ISDELETED(opaque))
    2030             :         {
    2031             :             /*
    2032             :              * It was deleted.  Move right to first nondeleted page (there
    2033             :              * must be one); that is the page that has acquired the deleted
    2034             :              * one's keyspace, so stepping left from it will take us where we
    2035             :              * want to be.
    2036             :              */
    2037             :             for (;;)
    2038             :             {
    2039           0 :                 if (P_RIGHTMOST(opaque))
    2040           0 :                     elog(ERROR, "fell off the end of index \"%s\"",
    2041             :                          RelationGetRelationName(rel));
    2042           0 :                 lastcurrblkno = opaque->btpo_next;
    2043           0 :                 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
    2044           0 :                 page = BufferGetPage(buf);
    2045           0 :                 opaque = BTPageGetOpaque(page);
    2046           0 :                 if (!P_ISDELETED(opaque))
    2047           0 :                     break;
    2048             :             }
    2049             :         }
    2050             :         else
    2051             :         {
    2052             :             /*
    2053             :              * Original lastcurrblkno wasn't deleted; the explanation had
    2054             :              * better be that the page to the left got split or deleted.
    2055             :              * Without this check, we risk going into an infinite loop.
    2056             :              */
    2057           0 :             if (opaque->btpo_prev == origblkno)
    2058           0 :                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
    2059             :                      lastcurrblkno, RelationGetRelationName(rel));
    2060             :             /* Okay to try again, since left sibling link changed */
    2061             :         }
    2062             : 
    2063             :         /*
    2064             :          * Original lastcurrblkno from caller was concurrently deleted (could
    2065             :          * also have been a great many concurrent left sibling page splits).
    2066             :          * Found a non-deleted page that should now act as our lastcurrblkno.
    2067             :          */
    2068           0 :         if (P_LEFTMOST(opaque))
    2069             :         {
    2070             :             /* New lastcurrblkno has no left sibling (concurrently deleted) */
    2071           0 :             _bt_relbuf(rel, buf);
    2072           0 :             break;
    2073             :         }
    2074             : 
    2075             :         /* Start from scratch with new lastcurrblkno's blkno/prev link */
    2076           0 :         *blkno = origblkno = opaque->btpo_prev;
    2077           0 :         _bt_relbuf(rel, buf);
    2078             :     }
    2079             : 
    2080           0 :     return InvalidBuffer;
    2081             : }
    2082             : 
    2083             : /*
    2084             :  * _bt_get_endpoint() -- Find the first or last page on a given tree level
    2085             :  *
    2086             :  * If the index is empty, we will return InvalidBuffer; any other failure
    2087             :  * condition causes ereport().  We will not return a dead page.
    2088             :  *
    2089             :  * The returned buffer is pinned and read-locked.
    2090             :  */
    2091             : Buffer
    2092       89576 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
    2093             : {
    2094             :     Buffer      buf;
    2095             :     Page        page;
    2096             :     BTPageOpaque opaque;
    2097             :     OffsetNumber offnum;
    2098             :     BlockNumber blkno;
    2099             :     IndexTuple  itup;
    2100             : 
    2101             :     /*
    2102             :      * If we are looking for a leaf page, okay to descend from fast root;
    2103             :      * otherwise better descend from true root.  (There is no point in being
    2104             :      * smarter about intermediate levels.)
    2105             :      */
    2106       89576 :     if (level == 0)
    2107       89552 :         buf = _bt_getroot(rel, NULL, BT_READ);
    2108             :     else
    2109          24 :         buf = _bt_gettrueroot(rel);
    2110             : 
    2111       89576 :     if (!BufferIsValid(buf))
    2112        7712 :         return InvalidBuffer;
    2113             : 
    2114       81864 :     page = BufferGetPage(buf);
    2115       81864 :     opaque = BTPageGetOpaque(page);
    2116             : 
    2117             :     for (;;)
    2118             :     {
    2119             :         /*
    2120             :          * If we landed on a deleted page, step right to find a live page
    2121             :          * (there must be one).  Also, if we want the rightmost page, step
    2122             :          * right if needed to get to it (this could happen if the page split
    2123             :          * since we obtained a pointer to it).
    2124             :          */
    2125      104846 :         while (P_IGNORE(opaque) ||
    2126          66 :                (rightmost && !P_RIGHTMOST(opaque)))
    2127             :         {
    2128           0 :             blkno = opaque->btpo_next;
    2129           0 :             if (blkno == P_NONE)
    2130           0 :                 elog(ERROR, "fell off the end of index \"%s\"",
    2131             :                      RelationGetRelationName(rel));
    2132           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2133           0 :             page = BufferGetPage(buf);
    2134           0 :             opaque = BTPageGetOpaque(page);
    2135             :         }
    2136             : 
    2137             :         /* Done? */
    2138      104846 :         if (opaque->btpo_level == level)
    2139       81864 :             break;
    2140       22982 :         if (opaque->btpo_level < level)
    2141           0 :             ereport(ERROR,
    2142             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2143             :                      errmsg_internal("btree level %u not found in index \"%s\"",
    2144             :                                      level, RelationGetRelationName(rel))));
    2145             : 
    2146             :         /* Descend to leftmost or rightmost child page */
    2147       22982 :         if (rightmost)
    2148           6 :             offnum = PageGetMaxOffsetNumber(page);
    2149             :         else
    2150       22976 :             offnum = P_FIRSTDATAKEY(opaque);
    2151             : 
    2152       22982 :         if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page))
    2153           0 :             elog(PANIC, "offnum out of range");
    2154             : 
    2155       22982 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    2156       22982 :         blkno = BTreeTupleGetDownLink(itup);
    2157             : 
    2158       22982 :         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2159       22982 :         page = BufferGetPage(buf);
    2160       22982 :         opaque = BTPageGetOpaque(page);
    2161             :     }
    2162             : 
    2163       81864 :     return buf;
    2164             : }
    2165             : 
    2166             : /*
    2167             :  *  _bt_endpoint() -- Find the first or last page in the index, and scan
    2168             :  * from there to the first key satisfying all the quals.
    2169             :  *
    2170             :  * This is used by _bt_first() to set up a scan when we've determined
    2171             :  * that the scan must start at the beginning or end of the index (for
    2172             :  * a forward or backward scan respectively).
    2173             :  *
    2174             :  * Parallel scan callers must have seized the scan before calling here.
    2175             :  * Exit conditions are the same as for _bt_first().
    2176             :  */
    2177             : static bool
    2178       89552 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
    2179             : {
    2180       89552 :     Relation    rel = scan->indexRelation;
    2181       89552 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2182             :     Page        page;
    2183             :     BTPageOpaque opaque;
    2184             :     OffsetNumber start;
    2185             : 
    2186             :     Assert(!BTScanPosIsValid(so->currPos));
    2187             :     Assert(!so->needPrimScan);
    2188             : 
    2189             :     /*
    2190             :      * Scan down to the leftmost or rightmost leaf page.  This is a simplified
    2191             :      * version of _bt_search().
    2192             :      */
    2193       89552 :     so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
    2194             : 
    2195       89552 :     if (!BufferIsValid(so->currPos.buf))
    2196             :     {
    2197             :         /*
    2198             :          * Empty index. Lock the whole relation, as nothing finer to lock
    2199             :          * exists.
    2200             :          */
    2201        7712 :         PredicateLockRelation(rel, scan->xs_snapshot);
    2202        7712 :         _bt_parallel_done(scan);
    2203        7712 :         return false;
    2204             :     }
    2205             : 
    2206       81840 :     page = BufferGetPage(so->currPos.buf);
    2207       81840 :     opaque = BTPageGetOpaque(page);
    2208             :     Assert(P_ISLEAF(opaque));
    2209             : 
    2210       81840 :     if (ScanDirectionIsForward(dir))
    2211             :     {
    2212             :         /* There could be dead pages to the left, so not this: */
    2213             :         /* Assert(P_LEFTMOST(opaque)); */
    2214             : 
    2215       81780 :         start = P_FIRSTDATAKEY(opaque);
    2216             :     }
    2217          60 :     else if (ScanDirectionIsBackward(dir))
    2218             :     {
    2219             :         Assert(P_RIGHTMOST(opaque));
    2220             : 
    2221          60 :         start = PageGetMaxOffsetNumber(page);
    2222             :     }
    2223             :     else
    2224             :     {
    2225           0 :         elog(ERROR, "invalid scan direction: %d", (int) dir);
    2226             :         start = 0;              /* keep compiler quiet */
    2227             :     }
    2228             : 
    2229             :     /*
    2230             :      * Now load data from the first page of the scan.
    2231             :      */
    2232       81840 :     if (!_bt_readfirstpage(scan, start, dir))
    2233        1782 :         return false;
    2234             : 
    2235       80058 :     _bt_returnitem(scan, so);
    2236       80058 :     return true;
    2237             : }

Generated by: LCOV version 1.16