LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsearch.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 88.3 % 546 482
Test Date: 2026-03-02 05:14:46 Functions: 100.0 % 16 16
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * nbtsearch.c
       4              :  *    Search code for postgres btrees.
       5              :  *
       6              :  *
       7              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8              :  * Portions Copyright (c) 1994, Regents of the University of California
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/access/nbtree/nbtsearch.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : 
      16              : #include "postgres.h"
      17              : 
      18              : #include "access/nbtree.h"
      19              : #include "access/relscan.h"
      20              : #include "access/xact.h"
      21              : #include "executor/instrument_node.h"
      22              : #include "miscadmin.h"
      23              : #include "pgstat.h"
      24              : #include "storage/predicate.h"
      25              : #include "utils/lsyscache.h"
      26              : #include "utils/rel.h"
      27              : 
      28              : 
      29              : static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
      30              : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
      31              :                             Buffer buf, bool forupdate, BTStack stack,
      32              :                             int access);
      33              : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
      34              : static int  _bt_binsrch_posting(BTScanInsert key, Page page,
      35              :                                 OffsetNumber offnum);
      36              : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
      37              : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
      38              : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
      39              :                               ScanDirection dir);
      40              : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
      41              :                              BlockNumber lastcurrblkno, ScanDirection dir,
      42              :                              bool seized);
      43              : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
      44              :                                          BlockNumber lastcurrblkno);
      45              : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
      46              : 
      47              : 
      48              : /*
      49              :  *  _bt_drop_lock_and_maybe_pin()
      50              :  *
      51              :  * Unlock so->currPos.buf.  If scan is so->dropPin, drop the pin, too.
      52              :  * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
      53              :  */
      54              : static inline void
      55      6260615 : _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
      56              : {
      57      6260615 :     if (!so->dropPin)
      58              :     {
      59              :         /* Just drop the lock (not the pin) */
      60       258099 :         _bt_unlockbuf(rel, so->currPos.buf);
      61       258099 :         return;
      62              :     }
      63              : 
      64              :     /*
      65              :      * Drop both the lock and the pin.
      66              :      *
      67              :      * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
      68              :      * when concurrent heap TID recycling by VACUUM might have taken place.
      69              :      */
      70              :     Assert(RelationNeedsWAL(rel));
      71      6002516 :     so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
      72      6002516 :     _bt_relbuf(rel, so->currPos.buf);
      73      6002516 :     so->currPos.buf = InvalidBuffer;
      74              : }
      75              : 
      76              : /*
      77              :  *  _bt_search() -- Search the tree for a particular scankey,
      78              :  *      or more precisely for the first leaf page it could be on.
      79              :  *
      80              :  * The passed scankey is an insertion-type scankey (see nbtree/README),
      81              :  * but it can omit the rightmost column(s) of the index.
      82              :  *
      83              :  * Return value is a stack of parent-page pointers (i.e. there is no entry for
      84              :  * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
      85              :  * which is locked and pinned.  No locks are held on the parent pages,
      86              :  * however!
      87              :  *
      88              :  * The returned buffer is locked according to access parameter.  Additionally,
      89              :  * access = BT_WRITE will allow an empty root page to be created and returned.
      90              :  * When access = BT_READ, an empty index will result in *bufP being set to
      91              :  * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
      92              :  * during the search will be finished.
      93              :  *
      94              :  * heaprel must be provided by callers that pass access = BT_WRITE, since we
      95              :  * might need to allocate a new root page for caller -- see _bt_allocbuf.
      96              :  */
      97              : BTStack
      98     12490412 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
      99              :            int access)
     100              : {
     101     12490412 :     BTStack     stack_in = NULL;
     102     12490412 :     int         page_access = BT_READ;
     103              : 
     104              :     /* heaprel must be set whenever _bt_allocbuf is reachable */
     105              :     Assert(access == BT_READ || access == BT_WRITE);
     106              :     Assert(access == BT_READ || heaprel != NULL);
     107              : 
     108              :     /* Get the root page to start with */
     109     12490412 :     *bufP = _bt_getroot(rel, heaprel, access);
     110              : 
     111              :     /* If index is empty and access = BT_READ, no root page is created. */
     112     12490412 :     if (!BufferIsValid(*bufP))
     113       293644 :         return (BTStack) NULL;
     114              : 
     115              :     /* Loop iterates once per level descended in the tree */
     116              :     for (;;)
     117     10013453 :     {
     118              :         Page        page;
     119              :         BTPageOpaque opaque;
     120              :         OffsetNumber offnum;
     121              :         ItemId      itemid;
     122              :         IndexTuple  itup;
     123              :         BlockNumber child;
     124              :         BTStack     new_stack;
     125              : 
     126              :         /*
     127              :          * Race -- the page we just grabbed may have split since we read its
     128              :          * downlink in its parent page (or the metapage).  If it has, we may
     129              :          * need to move right to its new sibling.  Do that.
     130              :          *
     131              :          * In write-mode, allow _bt_moveright to finish any incomplete splits
     132              :          * along the way.  Strictly speaking, we'd only need to finish an
     133              :          * incomplete split on the leaf page we're about to insert to, not on
     134              :          * any of the upper levels (internal pages with incomplete splits are
     135              :          * also taken care of in _bt_getstackbuf).  But this is a good
     136              :          * opportunity to finish splits of internal pages too.
     137              :          */
     138     22210221 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
     139              :                               stack_in, page_access);
     140              : 
     141              :         /* if this is a leaf page, we're done */
     142     22210221 :         page = BufferGetPage(*bufP);
     143     22210221 :         opaque = BTPageGetOpaque(page);
     144     22210221 :         if (P_ISLEAF(opaque))
     145     12196768 :             break;
     146              : 
     147              :         /*
     148              :          * Find the appropriate pivot tuple on this page.  Its downlink points
     149              :          * to the child page that we're about to descend to.
     150              :          */
     151     10013453 :         offnum = _bt_binsrch(rel, key, *bufP);
     152     10013453 :         itemid = PageGetItemId(page, offnum);
     153     10013453 :         itup = (IndexTuple) PageGetItem(page, itemid);
     154              :         Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
     155     10013453 :         child = BTreeTupleGetDownLink(itup);
     156              : 
     157              :         /*
     158              :          * We need to save the location of the pivot tuple we chose in a new
     159              :          * stack entry for this page/level.  If caller ends up splitting a
     160              :          * page one level down, it usually ends up inserting a new pivot
     161              :          * tuple/downlink immediately after the location recorded here.
     162              :          */
     163     10013453 :         new_stack = (BTStack) palloc_object(BTStackData);
     164     10013453 :         new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
     165     10013453 :         new_stack->bts_offset = offnum;
     166     10013453 :         new_stack->bts_parent = stack_in;
     167              : 
     168              :         /*
     169              :          * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
     170              :          * we're on the level 1 and asked to lock leaf page in write mode,
     171              :          * then lock next page in write mode, because it must be a leaf.
     172              :          */
     173     10013453 :         if (opaque->btpo_level == 1 && access == BT_WRITE)
     174      3239493 :             page_access = BT_WRITE;
     175              : 
     176              :         /* drop the read lock on the page, then acquire one on its child */
     177     10013453 :         *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
     178              : 
     179              :         /* okay, all set to move down a level */
     180     10013453 :         stack_in = new_stack;
     181              :     }
     182              : 
     183              :     /*
     184              :      * If we're asked to lock leaf in write mode, but didn't manage to, then
     185              :      * relock.  This should only happen when the root page is a leaf page (and
     186              :      * the only page in the index other than the metapage).
     187              :      */
     188     12196768 :     if (access == BT_WRITE && page_access == BT_READ)
     189              :     {
     190              :         /* trade in our read lock for a write lock */
     191       453640 :         _bt_unlockbuf(rel, *bufP);
     192       453640 :         _bt_lockbuf(rel, *bufP, BT_WRITE);
     193              : 
     194              :         /*
     195              :          * Race -- the leaf page may have split after we dropped the read lock
     196              :          * but before we acquired a write lock.  If it has, we may need to
     197              :          * move right to its new sibling.  Do that.
     198              :          */
     199       453640 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
     200              :     }
     201              : 
     202     12196768 :     return stack_in;
     203              : }
     204              : 
     205              : /*
     206              :  *  _bt_moveright() -- move right in the btree if necessary.
     207              :  *
     208              :  * When we follow a pointer to reach a page, it is possible that
     209              :  * the page has changed in the meanwhile.  If this happens, we're
     210              :  * guaranteed that the page has "split right" -- that is, that any
     211              :  * data that appeared on the page originally is either on the page
     212              :  * or strictly to the right of it.
     213              :  *
     214              :  * This routine decides whether or not we need to move right in the
     215              :  * tree by examining the high key entry on the page.  If that entry is
     216              :  * strictly less than the scankey, or <= the scankey in the
     217              :  * key.nextkey=true case, then we followed the wrong link and we need
     218              :  * to move right.
     219              :  *
     220              :  * The passed insertion-type scankey can omit the rightmost column(s) of the
     221              :  * index. (see nbtree/README)
     222              :  *
     223              :  * When key.nextkey is false (the usual case), we are looking for the first
     224              :  * item >= key.  When key.nextkey is true, we are looking for the first item
     225              :  * strictly greater than key.
     226              :  *
     227              :  * If forupdate is true, we will attempt to finish any incomplete splits
     228              :  * that we encounter.  This is required when locking a target page for an
     229              :  * insertion, because we don't allow inserting on a page before the split is
     230              :  * completed.  'heaprel' and 'stack' are only used if forupdate is true.
     231              :  *
     232              :  * On entry, we have the buffer pinned and a lock of the type specified by
     233              :  * 'access'.  If we move right, we release the buffer and lock and acquire
     234              :  * the same on the right sibling.  Return value is the buffer we stop at.
     235              :  */
     236              : static Buffer
     237     22663861 : _bt_moveright(Relation rel,
     238              :               Relation heaprel,
     239              :               BTScanInsert key,
     240              :               Buffer buf,
     241              :               bool forupdate,
     242              :               BTStack stack,
     243              :               int access)
     244              : {
     245              :     Page        page;
     246              :     BTPageOpaque opaque;
     247              :     int32       cmpval;
     248              : 
     249              :     Assert(!forupdate || heaprel != NULL);
     250              : 
     251              :     /*
     252              :      * When nextkey = false (normal case): if the scan key that brought us to
     253              :      * this page is > the high key stored on the page, then the page has split
     254              :      * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     255              :      * have some duplicates to the right as well as the left, but that's
     256              :      * something that's only ever dealt with on the leaf level, after
     257              :      * _bt_search has found an initial leaf page.)
     258              :      *
     259              :      * When nextkey = true: move right if the scan key is >= page's high key.
     260              :      * (Note that key.scantid cannot be set in this case.)
     261              :      *
     262              :      * The page could even have split more than once, so scan as far as
     263              :      * needed.
     264              :      *
     265              :      * We also have to move right if we followed a link that brought us to a
     266              :      * dead page.
     267              :      */
     268     22663861 :     cmpval = key->nextkey ? 0 : 1;
     269              : 
     270              :     for (;;)
     271              :     {
     272     22664752 :         page = BufferGetPage(buf);
     273     22664752 :         opaque = BTPageGetOpaque(page);
     274              : 
     275     22664752 :         if (P_RIGHTMOST(opaque))
     276     17092779 :             break;
     277              : 
     278              :         /*
     279              :          * Finish any incomplete splits we encounter along the way.
     280              :          */
     281      5571973 :         if (forupdate && P_INCOMPLETE_SPLIT(opaque))
     282            0 :         {
     283            0 :             BlockNumber blkno = BufferGetBlockNumber(buf);
     284              : 
     285              :             /* upgrade our lock if necessary */
     286            0 :             if (access == BT_READ)
     287              :             {
     288            0 :                 _bt_unlockbuf(rel, buf);
     289            0 :                 _bt_lockbuf(rel, buf, BT_WRITE);
     290              :             }
     291              : 
     292            0 :             if (P_INCOMPLETE_SPLIT(opaque))
     293            0 :                 _bt_finish_split(rel, heaprel, buf, stack);
     294              :             else
     295            0 :                 _bt_relbuf(rel, buf);
     296              : 
     297              :             /* re-acquire the lock in the right mode, and re-check */
     298            0 :             buf = _bt_getbuf(rel, blkno, access);
     299            0 :             continue;
     300              :         }
     301              : 
     302      5571973 :         if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
     303              :         {
     304              :             /* step right one page */
     305          891 :             buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
     306          891 :             continue;
     307              :         }
     308              :         else
     309              :             break;
     310              :     }
     311              : 
     312     22663861 :     if (P_IGNORE(opaque))
     313            0 :         elog(ERROR, "fell off the end of index \"%s\"",
     314              :              RelationGetRelationName(rel));
     315              : 
     316     22663861 :     return buf;
     317              : }
     318              : 
     319              : /*
     320              :  *  _bt_binsrch() -- Do a binary search for a key on a particular page.
     321              :  *
     322              :  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
     323              :  * of the last key < given scankey, or last key <= given scankey if nextkey
     324              :  * is true.  (Since _bt_compare treats the first data key of such a page as
     325              :  * minus infinity, there will be at least one key < scankey, so the result
     326              :  * always points at one of the keys on the page.)
     327              :  *
     328              :  * On a leaf page, _bt_binsrch() returns the final result of the initial
     329              :  * positioning process that started with _bt_first's call to _bt_search.
     330              :  * We're returning a non-pivot tuple offset, so things are a little different.
     331              :  * It is possible that we'll return an offset that's either past the last
     332              :  * non-pivot slot, or (in the case of a backward scan) before the first slot.
     333              :  *
     334              :  * This procedure is not responsible for walking right, it just examines
     335              :  * the given page.  _bt_binsrch() has no lock or refcount side effects
     336              :  * on the buffer.
     337              :  */
     338              : static OffsetNumber
     339     18313150 : _bt_binsrch(Relation rel,
     340              :             BTScanInsert key,
     341              :             Buffer buf)
     342              : {
     343              :     Page        page;
     344              :     BTPageOpaque opaque;
     345              :     OffsetNumber low,
     346              :                 high;
     347              :     int32       result,
     348              :                 cmpval;
     349              : 
     350     18313150 :     page = BufferGetPage(buf);
     351     18313150 :     opaque = BTPageGetOpaque(page);
     352              : 
     353              :     /* Requesting nextkey semantics while using scantid seems nonsensical */
     354              :     Assert(!key->nextkey || key->scantid == NULL);
     355              :     /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
     356              :     Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
     357              : 
     358     18313150 :     low = P_FIRSTDATAKEY(opaque);
     359     18313150 :     high = PageGetMaxOffsetNumber(page);
     360              : 
     361              :     /*
     362              :      * If there are no keys on the page, return the first available slot. Note
     363              :      * this covers two cases: the page is really empty (no keys), or it
     364              :      * contains only a high key.  The latter case is possible after vacuuming.
     365              :      * This can never happen on an internal page, however, since they are
     366              :      * never empty (an internal page must have at least one child).
     367              :      */
     368     18313150 :     if (unlikely(high < low))
     369         3594 :         return low;
     370              : 
     371              :     /*
     372              :      * Binary search to find the first key on the page >= scan key, or first
     373              :      * key > scankey when nextkey is true.
     374              :      *
     375              :      * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     376              :      * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     377              :      *
     378              :      * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     379              :      * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     380              :      *
     381              :      * We can fall out when high == low.
     382              :      */
     383     18309556 :     high++;                     /* establish the loop invariant for high */
     384              : 
     385     18309556 :     cmpval = key->nextkey ? 0 : 1;   /* select comparison value */
     386              : 
     387    119928793 :     while (high > low)
     388              :     {
     389    101619237 :         OffsetNumber mid = low + ((high - low) / 2);
     390              : 
     391              :         /* We have low <= mid < high, so mid points at a real slot */
     392              : 
     393    101619237 :         result = _bt_compare(rel, key, page, mid);
     394              : 
     395    101619237 :         if (result >= cmpval)
     396     62481585 :             low = mid + 1;
     397              :         else
     398     39137652 :             high = mid;
     399              :     }
     400              : 
     401              :     /*
     402              :      * At this point we have high == low.
     403              :      *
     404              :      * On a leaf page we always return the first non-pivot tuple >= scan key
     405              :      * (resp. > scan key) for forward scan callers.  For backward scans, it's
     406              :      * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
     407              :      */
     408     18309556 :     if (P_ISLEAF(opaque))
     409              :     {
     410              :         /*
     411              :          * In the backward scan case we're supposed to locate the last
     412              :          * matching tuple on the leaf level -- not the first matching tuple
     413              :          * (the last tuple will be the first one returned by the scan).
     414              :          *
     415              :          * At this point we've located the first non-pivot tuple immediately
     416              :          * after the last matching tuple (which might just be maxoff + 1).
     417              :          * Compensate by stepping back.
     418              :          */
     419      8296103 :         if (key->backward)
     420        30686 :             return OffsetNumberPrev(low);
     421              : 
     422      8265417 :         return low;
     423              :     }
     424              : 
     425              :     /*
     426              :      * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     427              :      * There must be one if _bt_compare() is playing by the rules.
     428              :      *
     429              :      * _bt_compare() will seldom see any exactly-matching pivot tuples, since
     430              :      * a truncated -inf heap TID is usually enough to prevent it altogether.
     431              :      * Even omitted scan key entries are treated as > truncated attributes.
     432              :      *
     433              :      * However, during backward scans _bt_compare() interprets omitted scan
     434              :      * key attributes as == corresponding truncated -inf attributes instead.
     435              :      * This works just like < would work here.  Under this scheme, < strategy
     436              :      * backward scans will always directly descend to the correct leaf page.
     437              :      * In particular, they will never incur an "extra" leaf page access with a
     438              :      * scan key that happens to contain the same prefix of values as some
     439              :      * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when
     440              :      * it uses a leaf page high key to "re-find" a page undergoing deletion.
     441              :      */
     442              :     Assert(low > P_FIRSTDATAKEY(opaque));
     443              : 
     444     10013453 :     return OffsetNumberPrev(low);
     445              : }
     446              : 
     447              : /*
     448              :  *
     449              :  *  _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
     450              :  *
     451              :  * Like _bt_binsrch(), but with support for caching the binary search
     452              :  * bounds.  Only used during insertion, and only on the leaf page that it
     453              :  * looks like caller will insert tuple on.  Exclusive-locked and pinned
     454              :  * leaf page is contained within insertstate.
     455              :  *
     456              :  * Caches the bounds fields in insertstate so that a subsequent call can
     457              :  * reuse the low and strict high bounds of original binary search.  Callers
     458              :  * that use these fields directly must be prepared for the case where low
     459              :  * and/or stricthigh are not on the same page (one or both exceed maxoff
     460              :  * for the page).  The case where there are no items on the page (high <
     461              :  * low) makes bounds invalid.
     462              :  *
     463              :  * Caller is responsible for invalidating bounds when it modifies the page
     464              :  * before calling here a second time, and for dealing with posting list
     465              :  * tuple matches (callers can use insertstate's postingoff field to
     466              :  * determine which existing heap TID will need to be replaced by a posting
     467              :  * list split).
     468              :  */
     469              : OffsetNumber
     470      6583135 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
     471              : {
     472      6583135 :     BTScanInsert key = insertstate->itup_key;
     473              :     Page        page;
     474              :     BTPageOpaque opaque;
     475              :     OffsetNumber low,
     476              :                 high,
     477              :                 stricthigh;
     478              :     int32       result,
     479              :                 cmpval;
     480              : 
     481      6583135 :     page = BufferGetPage(insertstate->buf);
     482      6583135 :     opaque = BTPageGetOpaque(page);
     483              : 
     484              :     Assert(P_ISLEAF(opaque));
     485              :     Assert(!key->nextkey);
     486              :     Assert(insertstate->postingoff == 0);
     487              : 
     488      6583135 :     if (!insertstate->bounds_valid)
     489              :     {
     490              :         /* Start new binary search */
     491      3940688 :         low = P_FIRSTDATAKEY(opaque);
     492      3940688 :         high = PageGetMaxOffsetNumber(page);
     493              :     }
     494              :     else
     495              :     {
     496              :         /* Restore result of previous binary search against same page */
     497      2642447 :         low = insertstate->low;
     498      2642447 :         high = insertstate->stricthigh;
     499              :     }
     500              : 
     501              :     /* If there are no keys on the page, return the first available slot */
     502      6583135 :     if (unlikely(high < low))
     503              :     {
     504              :         /* Caller can't reuse bounds */
     505        12197 :         insertstate->low = InvalidOffsetNumber;
     506        12197 :         insertstate->stricthigh = InvalidOffsetNumber;
     507        12197 :         insertstate->bounds_valid = false;
     508        12197 :         return low;
     509              :     }
     510              : 
     511              :     /*
     512              :      * Binary search to find the first key on the page >= scan key. (nextkey
     513              :      * is always false when inserting).
     514              :      *
     515              :      * The loop invariant is: all slots before 'low' are < scan key, all slots
     516              :      * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     517              :      * maintained to save additional search effort for caller.
     518              :      *
     519              :      * We can fall out when high == low.
     520              :      */
     521      6570938 :     if (!insertstate->bounds_valid)
     522      3928491 :         high++;                 /* establish the loop invariant for high */
     523      6570938 :     stricthigh = high;          /* high initially strictly higher */
     524              : 
     525      6570938 :     cmpval = 1;                 /* !nextkey comparison value */
     526              : 
     527     35379341 :     while (high > low)
     528              :     {
     529     28808403 :         OffsetNumber mid = low + ((high - low) / 2);
     530              : 
     531              :         /* We have low <= mid < high, so mid points at a real slot */
     532              : 
     533     28808403 :         result = _bt_compare(rel, key, page, mid);
     534              : 
     535     28808403 :         if (result >= cmpval)
     536     21809999 :             low = mid + 1;
     537              :         else
     538              :         {
     539      6998404 :             high = mid;
     540      6998404 :             if (result != 0)
     541      6438801 :                 stricthigh = high;
     542              :         }
     543              : 
     544              :         /*
     545              :          * If tuple at offset located by binary search is a posting list whose
     546              :          * TID range overlaps with caller's scantid, perform posting list
     547              :          * binary search to set postingoff for caller.  Caller must split the
     548              :          * posting list when postingoff is set.  This should happen
     549              :          * infrequently.
     550              :          */
     551     28808403 :         if (unlikely(result == 0 && key->scantid != NULL))
     552              :         {
     553              :             /*
     554              :              * postingoff should never be set more than once per leaf page
     555              :              * binary search.  That would mean that there are duplicate table
     556              :              * TIDs in the index, which is never okay.  Check for that here.
     557              :              */
     558       214954 :             if (insertstate->postingoff != 0)
     559            0 :                 ereport(ERROR,
     560              :                         (errcode(ERRCODE_INDEX_CORRUPTED),
     561              :                          errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
     562              :                                          ItemPointerGetBlockNumber(key->scantid),
     563              :                                          ItemPointerGetOffsetNumber(key->scantid),
     564              :                                          low, stricthigh,
     565              :                                          BufferGetBlockNumber(insertstate->buf),
     566              :                                          RelationGetRelationName(rel))));
     567              : 
     568       214954 :             insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
     569              :         }
     570              :     }
     571              : 
     572              :     /*
     573              :      * On a leaf page, a binary search always returns the first key >= scan
     574              :      * key (at least in !nextkey case), which could be the last slot + 1. This
     575              :      * is also the lower bound of cached search.
     576              :      *
     577              :      * stricthigh may also be the last slot + 1, which prevents caller from
     578              :      * using bounds directly, but is still useful to us if we're called a
     579              :      * second time with cached bounds (cached low will be < stricthigh when
     580              :      * that happens).
     581              :      */
     582      6570938 :     insertstate->low = low;
     583      6570938 :     insertstate->stricthigh = stricthigh;
     584      6570938 :     insertstate->bounds_valid = true;
     585              : 
     586      6570938 :     return low;
     587              : }
     588              : 
     589              : /*----------
     590              :  *  _bt_binsrch_posting() -- posting list binary search.
     591              :  *
     592              :  * Helper routine for _bt_binsrch_insert().
     593              :  *
     594              :  * Returns offset into posting list where caller's scantid belongs.
     595              :  *----------
     596              :  */
     597              : static int
     598       214954 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
     599              : {
     600              :     IndexTuple  itup;
     601              :     ItemId      itemid;
     602              :     int         low,
     603              :                 high,
     604              :                 mid,
     605              :                 res;
     606              : 
     607              :     /*
     608              :      * If this isn't a posting tuple, then the index must be corrupt (if it is
     609              :      * an ordinary non-pivot tuple then there must be an existing tuple with a
     610              :      * heap TID that equals inserter's new heap TID/scantid).  Defensively
     611              :      * check that tuple is a posting list tuple whose posting list range
     612              :      * includes caller's scantid.
     613              :      *
     614              :      * (This is also needed because contrib/amcheck's rootdescend option needs
     615              :      * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     616              :      */
     617       214954 :     itemid = PageGetItemId(page, offnum);
     618       214954 :     itup = (IndexTuple) PageGetItem(page, itemid);
     619       214954 :     if (!BTreeTupleIsPosting(itup))
     620       201098 :         return 0;
     621              : 
     622              :     Assert(key->heapkeyspace && key->allequalimage);
     623              : 
     624              :     /*
     625              :      * In the event that posting list tuple has LP_DEAD bit set, indicate this
     626              :      * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     627              :      * second call to _bt_binsrch_insert() can take place when its caller has
     628              :      * removed the dead item.
     629              :      */
     630        13856 :     if (ItemIdIsDead(itemid))
     631            1 :         return -1;
     632              : 
     633              :     /* "high" is past end of posting list for loop invariant */
     634        13855 :     low = 0;
     635        13855 :     high = BTreeTupleGetNPosting(itup);
     636              :     Assert(high >= 2);
     637              : 
     638       112549 :     while (high > low)
     639              :     {
     640        98694 :         mid = low + ((high - low) / 2);
     641        98694 :         res = ItemPointerCompare(key->scantid,
     642        98694 :                                  BTreeTupleGetPostingN(itup, mid));
     643              : 
     644        98694 :         if (res > 0)
     645        51613 :             low = mid + 1;
     646        47081 :         else if (res < 0)
     647        47081 :             high = mid;
     648              :         else
     649            0 :             return mid;
     650              :     }
     651              : 
     652              :     /* Exact match not found */
     653        13855 :     return low;
     654              : }
     655              : 
     656              : /*----------
     657              :  *  _bt_compare() -- Compare insertion-type scankey to tuple on a page.
     658              :  *
     659              :  *  page/offnum: location of btree item to be compared to.
     660              :  *
     661              :  *      This routine returns:
     662              :  *          <0 if scankey < tuple at offnum;
     663              :  *           0 if scankey == tuple at offnum;
     664              :  *          >0 if scankey > tuple at offnum.
     665              :  *
     666              :  * NULLs in the keys are treated as sortable values.  Therefore
     667              :  * "equality" does not necessarily mean that the item should be returned
     668              :  * to the caller as a matching key.  Similarly, an insertion scankey
     669              :  * with its scantid set is treated as equal to a posting tuple whose TID
     670              :  * range overlaps with their scantid.  There generally won't be a
     671              :  * matching TID in the posting tuple, which caller must handle
     672              :  * themselves (e.g., by splitting the posting list tuple).
     673              :  *
     674              :  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
     675              :  * "minus infinity": this routine will always claim it is less than the
     676              :  * scankey.  The actual key value stored is explicitly truncated to 0
     677              :  * attributes (explicitly minus infinity) with version 3+ indexes, but
     678              :  * that isn't relied upon.  This allows us to implement the Lehman and
     679              :  * Yao convention that the first down-link pointer is before the first
     680              :  * key.  See backend/access/nbtree/README for details.
     681              :  *----------
     682              :  */
     683              : int32
     684    144604286 : _bt_compare(Relation rel,
     685              :             BTScanInsert key,
     686              :             Page page,
     687              :             OffsetNumber offnum)
     688              : {
     689    144604286 :     TupleDesc   itupdesc = RelationGetDescr(rel);
     690    144604286 :     BTPageOpaque opaque = BTPageGetOpaque(page);
     691              :     IndexTuple  itup;
     692              :     ItemPointer heapTid;
     693              :     ScanKey     scankey;
     694              :     int         ncmpkey;
     695              :     int         ntupatts;
     696              :     int32       result;
     697              : 
     698              :     Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
     699              :     Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
     700              :     Assert(key->heapkeyspace || key->scantid == NULL);
     701              : 
     702              :     /*
     703              :      * Force result ">" if target item is first data item on an internal page
     704              :      * --- see NOTE above.
     705              :      */
     706    144604286 :     if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
     707      2053255 :         return 1;
     708              : 
     709    142551031 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     710    142551031 :     ntupatts = BTreeTupleGetNAtts(itup, rel);
     711              : 
     712              :     /*
     713              :      * The scan key is set up with the attribute number associated with each
     714              :      * term in the key.  It is important that, if the index is multi-key, the
     715              :      * scan contain the first k key attributes, and that they be in order.  If
     716              :      * you think about how multi-key ordering works, you'll understand why
     717              :      * this is.
     718              :      *
     719              :      * We don't test for violation of this condition here, however.  The
     720              :      * initial setup for the index scan had better have gotten it right (see
     721              :      * _bt_first).
     722              :      */
     723              : 
     724    142551031 :     ncmpkey = Min(ntupatts, key->keysz);
     725              :     Assert(key->heapkeyspace || ncmpkey == key->keysz);
     726              :     Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
     727    142551031 :     scankey = key->scankeys;
     728    179254633 :     for (int i = 1; i <= ncmpkey; i++)
     729              :     {
     730              :         Datum       datum;
     731              :         bool        isNull;
     732              : 
     733    166751608 :         datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
     734              : 
     735    166751608 :         if (scankey->sk_flags & SK_ISNULL)   /* key is NULL */
     736              :         {
     737       294180 :             if (isNull)
     738        78692 :                 result = 0;     /* NULL "=" NULL */
     739       215488 :             else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     740          312 :                 result = -1;    /* NULL "<" NOT_NULL */
     741              :             else
     742       215176 :                 result = 1;     /* NULL ">" NOT_NULL */
     743              :         }
     744    166457428 :         else if (isNull)        /* key is NOT_NULL and item is NULL */
     745              :         {
     746          132 :             if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     747            0 :                 result = 1;     /* NOT_NULL ">" NULL */
     748              :             else
     749          132 :                 result = -1;    /* NOT_NULL "<" NULL */
     750              :         }
     751              :         else
     752              :         {
     753              :             /*
     754              :              * The sk_func needs to be passed the index value as left arg and
     755              :              * the sk_argument as right arg (they might be of different
     756              :              * types).  Since it is convenient for callers to think of
     757              :              * _bt_compare as comparing the scankey to the index item, we have
     758              :              * to flip the sign of the comparison result.  (Unless it's a DESC
     759              :              * column, in which case we *don't* flip the sign.)
     760              :              */
     761    166457296 :             result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
     762              :                                                      scankey->sk_collation,
     763              :                                                      datum,
     764              :                                                      scankey->sk_argument));
     765              : 
     766    166457296 :             if (!(scankey->sk_flags & SK_BT_DESC))
     767    166457263 :                 INVERT_COMPARE_RESULT(result);
     768              :         }
     769              : 
     770              :         /* if the keys are unequal, return the difference */
     771    166751608 :         if (result != 0)
     772    130048006 :             return result;
     773              : 
     774     36703602 :         scankey++;
     775              :     }
     776              : 
     777              :     /*
     778              :      * All non-truncated attributes (other than heap TID) were found to be
     779              :      * equal.  Treat truncated attributes as minus infinity when scankey has a
     780              :      * key attribute value that would otherwise be compared directly.
     781              :      *
     782              :      * Note: it doesn't matter if ntupatts includes non-key attributes;
     783              :      * scankey won't, so explicitly excluding non-key attributes isn't
     784              :      * necessary.
     785              :      */
     786     12503025 :     if (key->keysz > ntupatts)
     787       108745 :         return 1;
     788              : 
     789              :     /*
     790              :      * Use the heap TID attribute and scantid to try to break the tie.  The
     791              :      * rules are the same as any other key attribute -- only the
     792              :      * representation differs.
     793              :      */
     794     12394280 :     heapTid = BTreeTupleGetHeapTID(itup);
     795     12394280 :     if (key->scantid == NULL)
     796              :     {
     797              :         /*
     798              :          * Forward scans have a scankey that is considered greater than a
     799              :          * truncated pivot tuple if and when the scankey has equal values for
     800              :          * attributes up to and including the least significant untruncated
     801              :          * attribute in tuple.  Even attributes that were omitted from the
     802              :          * scan key are considered greater than -inf truncated attributes.
     803              :          * (See _bt_binsrch for an explanation of our backward scan behavior.)
     804              :          *
     805              :          * For example, if an index has the minimum two attributes (single
     806              :          * user key attribute, plus heap TID attribute), and a page's high key
     807              :          * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
     808              :          * will not descend to the page to the left.  The search will descend
     809              :          * right instead.  The truncated attribute in pivot tuple means that
     810              :          * all non-pivot tuples on the page to the left are strictly < 'foo',
     811              :          * so it isn't necessary to descend left.  In other words, search
     812              :          * doesn't have to descend left because it isn't interested in a match
     813              :          * that has a heap TID value of -inf.
     814              :          *
     815              :          * Note: the heap TID part of the test ensures that scankey is being
     816              :          * compared to a pivot tuple with one or more truncated -inf key
     817              :          * attributes.  The heap TID attribute is the last key attribute in
     818              :          * every index, of course, but other than that it isn't special.
     819              :          */
     820     10062616 :         if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
     821         5068 :             key->heapkeyspace)
     822         5068 :             return 1;
     823              : 
     824              :         /* All provided scankey arguments found to be equal */
     825     10057548 :         return 0;
     826              :     }
     827              : 
     828              :     /*
     829              :      * Treat truncated heap TID as minus infinity, since scankey has a key
     830              :      * attribute value (scantid) that would otherwise be compared directly
     831              :      */
     832              :     Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
     833      2331664 :     if (heapTid == NULL)
     834         2095 :         return 1;
     835              : 
     836              :     /*
     837              :      * Scankey must be treated as equal to a posting list tuple if its scantid
     838              :      * value falls within the range of the posting list.  In all other cases
     839              :      * there can only be a single heap TID value, which is compared directly
     840              :      * with scantid.
     841              :      */
     842              :     Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
     843      2329569 :     result = ItemPointerCompare(key->scantid, heapTid);
     844      2329569 :     if (result <= 0 || !BTreeTupleIsPosting(itup))
     845      2240429 :         return result;
     846              :     else
     847              :     {
     848        89140 :         result = ItemPointerCompare(key->scantid,
     849        89140 :                                     BTreeTupleGetMaxHeapTID(itup));
     850        89140 :         if (result > 0)
     851        75284 :             return 1;
     852              :     }
     853              : 
     854        13856 :     return 0;
     855              : }
     856              : 
     857              : /*
     858              :  *  _bt_first() -- Find the first item in a scan.
     859              :  *
     860              :  *      We need to be clever about the direction of scan, the search
     861              :  *      conditions, and the tree ordering.  We find the first item (or,
     862              :  *      if backwards scan, the last item) in the tree that satisfies the
     863              :  *      qualifications in the scan key.  On success exit, data about the
     864              :  *      matching tuple(s) on the page has been loaded into so->currPos.  We'll
     865              :  *      drop all locks and hold onto a pin on page's buffer, except during
     866              :  *      so->dropPin scans, when we drop both the lock and the pin.
     867              :  *      _bt_returnitem sets the next item to return to scan on success exit.
     868              :  *
     869              :  * If there are no matching items in the index, we return false, with no
     870              :  * pins or locks held.  so->currPos will remain invalid.
     871              :  *
     872              :  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
     873              :  * are both search-type scankeys (see nbtree/README for more about this).
     874              :  * Within this routine, we build a temporary insertion-type scankey to use
     875              :  * in locating the scan start position.
     876              :  */
     877              : bool
     878      8638181 : _bt_first(IndexScanDesc scan, ScanDirection dir)
     879              : {
     880      8638181 :     Relation    rel = scan->indexRelation;
     881      8638181 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     882              :     BTStack     stack;
     883              :     OffsetNumber offnum;
     884              :     BTScanInsertData inskey;
     885              :     ScanKey     startKeys[INDEX_MAX_KEYS];
     886              :     ScanKeyData notnullkey;
     887      8638181 :     int         keysz = 0;
     888      8638181 :     StrategyNumber strat_total = InvalidStrategy;
     889      8638181 :     BlockNumber blkno = InvalidBlockNumber,
     890              :                 lastcurrblkno;
     891              : 
     892              :     Assert(!BTScanPosIsValid(so->currPos));
     893              : 
     894              :     /*
     895              :      * Examine the scan keys and eliminate any redundant keys; also mark the
     896              :      * keys that must be matched to continue the scan.
     897              :      */
     898      8638181 :     _bt_preprocess_keys(scan);
     899              : 
     900              :     /*
     901              :      * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     902              :      * never be satisfied (eg, x == 1 AND x > 2).
     903              :      */
     904      8638181 :     if (!so->qual_ok)
     905              :     {
     906              :         Assert(!so->needPrimScan);
     907         1215 :         _bt_parallel_done(scan);
     908         1215 :         return false;
     909              :     }
     910              : 
     911              :     /*
     912              :      * If this is a parallel scan, we must seize the scan.  _bt_readfirstpage
     913              :      * will likely release the parallel scan later on.
     914              :      */
     915      8636966 :     if (scan->parallel_scan != NULL &&
     916          223 :         !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
     917          147 :         return false;
     918              : 
     919              :     /*
     920              :      * Initialize the scan's arrays (if any) for the current scan direction
     921              :      * (except when they were already set to later values as part of
     922              :      * scheduling the primitive index scan that is now underway)
     923              :      */
     924      8636819 :     if (so->numArrayKeys && !so->needPrimScan)
     925        35699 :         _bt_start_array_keys(scan, dir);
     926              : 
     927      8636819 :     if (blkno != InvalidBlockNumber)
     928              :     {
     929              :         /*
     930              :          * We anticipated calling _bt_search, but another worker bet us to it.
     931              :          * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
     932              :          */
     933              :         Assert(scan->parallel_scan != NULL);
     934              :         Assert(!so->needPrimScan);
     935              :         Assert(blkno != P_NONE);
     936              : 
     937           14 :         if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
     938            0 :             return false;
     939              : 
     940           14 :         _bt_returnitem(scan, so);
     941           14 :         return true;
     942              :     }
     943              : 
     944              :     /*
     945              :      * Count an indexscan for stats, now that we know that we'll call
     946              :      * _bt_search/_bt_endpoint below
     947              :      */
     948      8636805 :     pgstat_count_index_scan(rel);
     949      8636805 :     if (scan->instrument)
     950       437229 :         scan->instrument->nsearches++;
     951              : 
     952              :     /*----------
     953              :      * Examine the scan keys to discover where we need to start the scan.
     954              :      * The selected scan keys (at most one per index column) are remembered by
     955              :      * storing their addresses into the local startKeys[] array.  The final
     956              :      * startKeys[] entry's strategy is set in strat_total. (Actually, there
     957              :      * are a couple of cases where we force a less/more restrictive strategy.)
     958              :      *
     959              :      * We must use the key that was marked required (in the direction opposite
     960              :      * our own scan's) during preprocessing.  Each index attribute can only
     961              :      * have one such required key.  In general, the keys that we use to find
     962              :      * an initial position when scanning forwards are the same keys that end
     963              :      * the scan on the leaf level when scanning backwards (and vice-versa).
     964              :      *
     965              :      * When the scan keys include cross-type operators, _bt_preprocess_keys
     966              :      * may not be able to eliminate redundant keys; in such cases it will
     967              :      * arbitrarily pick a usable key for each attribute (and scan direction),
     968              :      * ensuring that there is no more than one key required in each direction.
     969              :      * We stop considering further keys once we reach the first nonrequired
     970              :      * key (which must come after all required keys), so this can't affect us.
     971              :      *
     972              :      * The required keys that we use as starting boundaries have to be =, >,
     973              :      * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
     974              :      * We can use keys for multiple attributes so long as the prior attributes
     975              :      * had only =, >= (resp. =, <=) keys.  These rules are very similar to the
     976              :      * rules that preprocessing used to determine which keys to mark required.
     977              :      * We cannot always use every required key as a positioning key, though.
     978              :      * Skip arrays necessitate independently applying our own rules here.
     979              :      * Skip arrays are always generally considered = array keys, but we'll
     980              :      * nevertheless treat them as inequalities at certain points of the scan.
     981              :      * When that happens, it _might_ have implications for the number of
     982              :      * required keys that we can safely use for initial positioning purposes.
     983              :      *
     984              :      * For example, a forward scan with a skip array on its leading attribute
     985              :      * (with no low_compare/high_compare) will have at least two required scan
     986              :      * keys, but we won't use any of them as boundary keys during the scan's
     987              :      * initial call here.  Our positioning key during the first call here can
     988              :      * be thought of as representing "> -infinity".  Similarly, if such a skip
     989              :      * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
     990              :      * during the scan's initial call here; a lower-order key such as "b = 42"
     991              :      * can't be used until the "a" array advances beyond MINVAL/low_compare.
     992              :      *
     993              :      * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
     994              :      * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
     995              :      * A subsequent call here might have us use "a = 'fop' AND b = 42".  Note
     996              :      * that we treat = and >= as equivalent when scanning forwards (just as we
     997              :      * treat = and <= as equivalent when scanning backwards).  We effectively
     998              :      * do the same thing (though with a distinct "a" element/value) each time.
     999              :      *
    1000              :      * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
    1001              :      * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
    1002              :      * If the index stores nulls at the end of the index we'll be starting
    1003              :      * from, and we have no boundary key for the column (which means the key
    1004              :      * we deduced NOT NULL from is an inequality key that constrains the other
    1005              :      * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
    1006              :      * use as a boundary key.  If we didn't do this, we might find ourselves
    1007              :      * traversing a lot of null entries at the start of the scan.
    1008              :      *
    1009              :      * In this loop, row-comparison keys are treated the same as keys on their
    1010              :      * first (leftmost) columns.  We'll add all lower-order columns of the row
    1011              :      * comparison that were marked required during preprocessing below.
    1012              :      *
    1013              :      * _bt_advance_array_keys needs to know exactly how we'll reposition the
    1014              :      * scan (should it opt to schedule another primitive index scan).  It is
    1015              :      * critical that primscans only be scheduled when they'll definitely make
    1016              :      * some useful progress.  _bt_advance_array_keys does this by calling
    1017              :      * _bt_checkkeys routines that report whether a tuple is past the end of
    1018              :      * matches for the scan's keys (given the scan's current array elements).
    1019              :      * If the page's final tuple is "after the end of matches" for a scan that
    1020              :      * uses the *opposite* scan direction, then it must follow that it's also
    1021              :      * "before the start of matches" for the actual current scan direction.
    1022              :      * It is therefore essential that all of our initial positioning rules are
    1023              :      * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
    1024              :      * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
    1025              :      * need to be kept in sync.
    1026              :      *----------
    1027              :      */
    1028      8636805 :     if (so->numberOfKeys > 0)
    1029              :     {
    1030              :         AttrNumber  curattr;
    1031              :         ScanKey     bkey;
    1032              :         ScanKey     impliesNN;
    1033              :         ScanKey     cur;
    1034              : 
    1035              :         /*
    1036              :          * bkey will be set to the key that preprocessing left behind as the
    1037              :          * boundary key for this attribute, in this scan direction (if any)
    1038              :          */
    1039      8630052 :         cur = so->keyData;
    1040      8630052 :         curattr = 1;
    1041      8630052 :         bkey = NULL;
    1042              :         /* Also remember any scankey that implies a NOT NULL constraint */
    1043      8630052 :         impliesNN = NULL;
    1044              : 
    1045              :         /*
    1046              :          * Loop iterates from 0 to numberOfKeys inclusive; we use the last
    1047              :          * pass to handle after-last-key processing.  Actual exit from the
    1048              :          * loop is at one of the "break" statements below.
    1049              :          */
    1050     22293897 :         for (int i = 0;; cur++, i++)
    1051              :         {
    1052     22293897 :             if (i >= so->numberOfKeys || cur->sk_attno != curattr)
    1053              :             {
    1054              :                 /* Done looking for the curattr boundary key */
    1055              :                 Assert(bkey == NULL ||
    1056              :                        (bkey->sk_attno == curattr &&
    1057              :                         (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
    1058              :                 Assert(impliesNN == NULL ||
    1059              :                        (impliesNN->sk_attno == curattr &&
    1060              :                         (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
    1061              : 
    1062              :                 /*
    1063              :                  * If this is a scan key for a skip array whose current
    1064              :                  * element is MINVAL, choose low_compare (when scanning
    1065              :                  * backwards it'll be MAXVAL, and we'll choose high_compare).
    1066              :                  *
    1067              :                  * Note: if the array's low_compare key makes 'bkey' NULL,
    1068              :                  * then we behave as if the array's first element is -inf,
    1069              :                  * except when !array->null_elem implies a usable NOT NULL
    1070              :                  * constraint.
    1071              :                  */
    1072     13662933 :                 if (bkey != NULL &&
    1073     13624838 :                     (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
    1074              :                 {
    1075         1897 :                     int         ikey = bkey - so->keyData;
    1076         1897 :                     ScanKey     skipequalitykey = bkey;
    1077         1897 :                     BTArrayKeyInfo *array = NULL;
    1078              : 
    1079         1950 :                     for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
    1080              :                     {
    1081         1950 :                         array = &so->arrayKeys[arridx];
    1082         1950 :                         if (array->scan_key == ikey)
    1083         1897 :                             break;
    1084              :                     }
    1085              : 
    1086         1897 :                     if (ScanDirectionIsForward(dir))
    1087              :                     {
    1088              :                         Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
    1089         1888 :                         bkey = array->low_compare;
    1090              :                     }
    1091              :                     else
    1092              :                     {
    1093              :                         Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
    1094            9 :                         bkey = array->high_compare;
    1095              :                     }
    1096              : 
    1097              :                     Assert(bkey == NULL ||
    1098              :                            bkey->sk_attno == skipequalitykey->sk_attno);
    1099              : 
    1100         1897 :                     if (!array->null_elem)
    1101           98 :                         impliesNN = skipequalitykey;
    1102              :                     else
    1103              :                         Assert(bkey == NULL && impliesNN == NULL);
    1104              :                 }
    1105              : 
    1106              :                 /*
    1107              :                  * If we didn't find a usable boundary key, see if we can
    1108              :                  * deduce a NOT NULL key
    1109              :                  */
    1110     13701058 :                 if (bkey == NULL && impliesNN != NULL &&
    1111        38125 :                     ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1112              :                      ScanDirectionIsForward(dir) :
    1113              :                      ScanDirectionIsBackward(dir)))
    1114              :                 {
    1115              :                     /* Final startKeys[] entry will be deduced NOT NULL key */
    1116           15 :                     bkey = &notnullkey;
    1117           15 :                     ScanKeyEntryInitialize(bkey,
    1118              :                                            (SK_SEARCHNOTNULL | SK_ISNULL |
    1119           15 :                                             (impliesNN->sk_flags &
    1120              :                                              (SK_BT_DESC | SK_BT_NULLS_FIRST))),
    1121              :                                            curattr,
    1122              :                                            ScanDirectionIsForward(dir) ?
    1123              :                                            BTGreaterStrategyNumber : BTLessStrategyNumber,
    1124              :                                            InvalidOid,
    1125              :                                            InvalidOid,
    1126              :                                            InvalidOid,
    1127              :                                            (Datum) 0);
    1128              :                 }
    1129              : 
    1130              :                 /*
    1131              :                  * If preprocessing didn't leave a usable boundary key, quit;
    1132              :                  * else save the boundary key pointer in startKeys[]
    1133              :                  */
    1134     13662933 :                 if (bkey == NULL)
    1135        39909 :                     break;
    1136     13623024 :                 startKeys[keysz++] = bkey;
    1137              : 
    1138              :                 /*
    1139              :                  * We can only consider adding more boundary keys when the one
    1140              :                  * that we just chose to add uses either the = or >= strategy
    1141              :                  * (during backwards scans we can only do so when the key that
    1142              :                  * we just added to startKeys[] uses the = or <= strategy)
    1143              :                  */
    1144     13623024 :                 strat_total = bkey->sk_strategy;
    1145     13623024 :                 if (strat_total == BTGreaterStrategyNumber ||
    1146              :                     strat_total == BTLessStrategyNumber)
    1147              :                     break;
    1148              : 
    1149              :                 /*
    1150              :                  * If the key that we just added to startKeys[] is a skip
    1151              :                  * array = key whose current element is marked NEXT or PRIOR,
    1152              :                  * make strat_total > or < (and stop adding boundary keys).
    1153              :                  * This can only happen with opclasses that lack skip support.
    1154              :                  */
    1155     12719297 :                 if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
    1156              :                 {
    1157              :                     Assert(bkey->sk_flags & SK_BT_SKIP);
    1158              :                     Assert(strat_total == BTEqualStrategyNumber);
    1159              : 
    1160            6 :                     if (ScanDirectionIsForward(dir))
    1161              :                     {
    1162              :                         Assert(!(bkey->sk_flags & SK_BT_PRIOR));
    1163            3 :                         strat_total = BTGreaterStrategyNumber;
    1164              :                     }
    1165              :                     else
    1166              :                     {
    1167              :                         Assert(!(bkey->sk_flags & SK_BT_NEXT));
    1168            3 :                         strat_total = BTLessStrategyNumber;
    1169              :                     }
    1170              : 
    1171              :                     /*
    1172              :                      * We're done.  We'll never find an exact = match for a
    1173              :                      * NEXT or PRIOR sentinel sk_argument value.  There's no
    1174              :                      * sense in trying to add more keys to startKeys[].
    1175              :                      */
    1176            6 :                     break;
    1177              :                 }
    1178              : 
    1179              :                 /*
    1180              :                  * Done if that was the last scan key output by preprocessing.
    1181              :                  * Also done if we've now examined all keys marked required.
    1182              :                  */
    1183     12719291 :                 if (i >= so->numberOfKeys ||
    1184      5032884 :                     !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
    1185              :                     break;
    1186              : 
    1187              :                 /*
    1188              :                  * Reset for next attr.
    1189              :                  */
    1190              :                 Assert(cur->sk_attno == curattr + 1);
    1191      5032881 :                 curattr = cur->sk_attno;
    1192      5032881 :                 bkey = NULL;
    1193      5032881 :                 impliesNN = NULL;
    1194              :             }
    1195              : 
    1196              :             /*
    1197              :              * If we've located the starting boundary key for curattr, we have
    1198              :              * no interest in curattr's other required key
    1199              :              */
    1200     13663845 :             if (bkey != NULL)
    1201          900 :                 continue;
    1202              : 
    1203              :             /*
    1204              :              * Is this key the starting boundary key for curattr?
    1205              :              *
    1206              :              * If not, does it imply a NOT NULL constraint?  (Because
    1207              :              * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
    1208              :              * *any* inequality key works for that; we need not test.)
    1209              :              */
    1210     13662945 :             switch (cur->sk_strategy)
    1211              :             {
    1212        68685 :                 case BTLessStrategyNumber:
    1213              :                 case BTLessEqualStrategyNumber:
    1214        68685 :                     if (ScanDirectionIsBackward(dir))
    1215        30599 :                         bkey = cur;
    1216        38086 :                     else if (impliesNN == NULL)
    1217        38086 :                         impliesNN = cur;
    1218        68685 :                     break;
    1219     12716057 :                 case BTEqualStrategyNumber:
    1220     12716057 :                     bkey = cur;
    1221     12716057 :                     break;
    1222       878203 :                 case BTGreaterEqualStrategyNumber:
    1223              :                 case BTGreaterStrategyNumber:
    1224       878203 :                     if (ScanDirectionIsForward(dir))
    1225       878182 :                         bkey = cur;
    1226           21 :                     else if (impliesNN == NULL)
    1227           21 :                         impliesNN = cur;
    1228       878203 :                     break;
    1229              :             }
    1230              :         }
    1231              :     }
    1232              : 
    1233              :     /*
    1234              :      * If we found no usable boundary keys, we have to start from one end of
    1235              :      * the tree.  Walk down that edge to the first or last key, and scan from
    1236              :      * there.
    1237              :      *
    1238              :      * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
    1239              :      */
    1240      8636805 :     if (keysz == 0)
    1241        46295 :         return _bt_endpoint(scan, dir);
    1242              : 
    1243              :     /*
    1244              :      * We want to start the scan somewhere within the index.  Set up an
    1245              :      * insertion scankey we can use to search for the boundary point we
    1246              :      * identified above.  The insertion scankey is built using the keys
    1247              :      * identified by startKeys[].  (Remaining insertion scankey fields are
    1248              :      * initialized after initial-positioning scan keys are finalized.)
    1249              :      */
    1250              :     Assert(keysz <= INDEX_MAX_KEYS);
    1251     22213510 :     for (int i = 0; i < keysz; i++)
    1252              :     {
    1253     13623024 :         ScanKey     bkey = startKeys[i];
    1254              : 
    1255              :         Assert(bkey->sk_attno == i + 1);
    1256              : 
    1257     13623024 :         if (bkey->sk_flags & SK_ROW_HEADER)
    1258              :         {
    1259              :             /*
    1260              :              * Row comparison header: look to the first row member instead
    1261              :              */
    1262           24 :             ScanKey     subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
    1263           24 :             bool        loosen_strat = false,
    1264           24 :                         tighten_strat = false;
    1265              : 
    1266              :             /*
    1267              :              * Cannot be a NULL in the first row member: _bt_preprocess_keys
    1268              :              * would've marked the qual as unsatisfiable, preventing us from
    1269              :              * ever getting this far
    1270              :              */
    1271              :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1272              :             Assert(subkey->sk_attno == bkey->sk_attno);
    1273              :             Assert(!(subkey->sk_flags & SK_ISNULL));
    1274              : 
    1275              :             /*
    1276              :              * This is either a > or >= key (during backwards scans it is
    1277              :              * either < or <=) that was marked required during preprocessing.
    1278              :              * Later so->keyData[] keys can't have been marked required, so
    1279              :              * our row compare header key must be the final startKeys[] entry.
    1280              :              */
    1281              :             Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
    1282              :             Assert(subkey->sk_strategy == bkey->sk_strategy);
    1283              :             Assert(subkey->sk_strategy == strat_total);
    1284              :             Assert(i == keysz - 1);
    1285              : 
    1286              :             /*
    1287              :              * The member scankeys are already in insertion format (ie, they
    1288              :              * have sk_func = 3-way-comparison function)
    1289              :              */
    1290           24 :             memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
    1291              : 
    1292              :             /*
    1293              :              * Now look to later row compare members.
    1294              :              *
    1295              :              * If there's an "index attribute gap" between two row compare
    1296              :              * members, the second member won't have been marked required, and
    1297              :              * so can't be used as a starting boundary key here.  The part of
    1298              :              * the row comparison that we do still use has to be treated as a
    1299              :              * ">=" or "<=" condition.  For example, a qual "(a, c) > (1, 42)"
    1300              :              * with an omitted intervening index attribute "b" will use an
    1301              :              * insertion scan key "a >= 1".  Even the first "a = 1" tuple on
    1302              :              * the leaf level might satisfy the row compare qual.
    1303              :              *
    1304              :              * We're able to use a _more_ restrictive strategy when we reach a
    1305              :              * NULL row compare member, since they're always unsatisfiable.
    1306              :              * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
    1307              :              * insertion scan key "a > 1".  All tuples where "a = 1" cannot
    1308              :              * possibly satisfy the row compare qual, so this is safe.
    1309              :              */
    1310              :             Assert(!(subkey->sk_flags & SK_ROW_END));
    1311              :             for (;;)
    1312              :             {
    1313           24 :                 subkey++;
    1314              :                 Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1315              : 
    1316           24 :                 if (subkey->sk_flags & SK_ISNULL)
    1317              :                 {
    1318              :                     /*
    1319              :                      * NULL member key, can only use earlier keys.
    1320              :                      *
    1321              :                      * We deliberately avoid checking if this key is marked
    1322              :                      * required.  All earlier keys are required, and this key
    1323              :                      * is unsatisfiable either way, so we can't miss anything.
    1324              :                      */
    1325            6 :                     tighten_strat = true;
    1326            6 :                     break;
    1327              :                 }
    1328              : 
    1329           18 :                 if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
    1330              :                 {
    1331              :                     /* nonrequired member key, can only use earlier keys */
    1332            6 :                     loosen_strat = true;
    1333            6 :                     break;
    1334              :                 }
    1335              : 
    1336              :                 Assert(subkey->sk_attno == keysz + 1);
    1337              :                 Assert(subkey->sk_strategy == bkey->sk_strategy);
    1338              :                 Assert(keysz < INDEX_MAX_KEYS);
    1339              : 
    1340           12 :                 memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
    1341           12 :                 keysz++;
    1342              : 
    1343           12 :                 if (subkey->sk_flags & SK_ROW_END)
    1344           12 :                     break;
    1345              :             }
    1346              :             Assert(!(loosen_strat && tighten_strat));
    1347           24 :             if (loosen_strat)
    1348              :             {
    1349              :                 /* Use less restrictive strategy (and fewer member keys) */
    1350            6 :                 switch (strat_total)
    1351              :                 {
    1352            3 :                     case BTLessStrategyNumber:
    1353            3 :                         strat_total = BTLessEqualStrategyNumber;
    1354            3 :                         break;
    1355            3 :                     case BTGreaterStrategyNumber:
    1356            3 :                         strat_total = BTGreaterEqualStrategyNumber;
    1357            3 :                         break;
    1358              :                 }
    1359              :             }
    1360           24 :             if (tighten_strat)
    1361              :             {
    1362              :                 /* Use more restrictive strategy (and fewer member keys) */
    1363            6 :                 switch (strat_total)
    1364              :                 {
    1365            3 :                     case BTLessEqualStrategyNumber:
    1366            3 :                         strat_total = BTLessStrategyNumber;
    1367            3 :                         break;
    1368            3 :                     case BTGreaterEqualStrategyNumber:
    1369            3 :                         strat_total = BTGreaterStrategyNumber;
    1370            3 :                         break;
    1371              :                 }
    1372              :             }
    1373              : 
    1374              :             /* Done (row compare header key is always last startKeys[] key) */
    1375           24 :             break;
    1376              :         }
    1377              : 
    1378              :         /*
    1379              :          * Ordinary comparison key/search-style key.
    1380              :          *
    1381              :          * Transform the search-style scan key to an insertion scan key by
    1382              :          * replacing the sk_func with the appropriate btree 3-way-comparison
    1383              :          * function.
    1384              :          *
    1385              :          * If scankey operator is not a cross-type comparison, we can use the
    1386              :          * cached comparison function; otherwise gotta look it up in the
    1387              :          * catalogs.  (That can't lead to infinite recursion, since no
    1388              :          * indexscan initiated by syscache lookup will use cross-data-type
    1389              :          * operators.)
    1390              :          *
    1391              :          * We support the convention that sk_subtype == InvalidOid means the
    1392              :          * opclass input type; this hack simplifies life for ScanKeyInit().
    1393              :          */
    1394     13623000 :         if (bkey->sk_subtype == rel->rd_opcintype[i] ||
    1395     13161516 :             bkey->sk_subtype == InvalidOid)
    1396     13617497 :         {
    1397              :             FmgrInfo   *procinfo;
    1398              : 
    1399     13617497 :             procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
    1400     13617497 :             ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
    1401              :                                            bkey->sk_flags,
    1402     13617497 :                                            bkey->sk_attno,
    1403              :                                            InvalidStrategy,
    1404              :                                            bkey->sk_subtype,
    1405              :                                            bkey->sk_collation,
    1406              :                                            procinfo,
    1407              :                                            bkey->sk_argument);
    1408              :         }
    1409              :         else
    1410              :         {
    1411              :             RegProcedure cmp_proc;
    1412              : 
    1413         5503 :             cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
    1414         5503 :                                          rel->rd_opcintype[i],
    1415              :                                          bkey->sk_subtype, BTORDER_PROC);
    1416         5503 :             if (!RegProcedureIsValid(cmp_proc))
    1417            0 :                 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
    1418              :                      BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
    1419              :                      bkey->sk_attno, RelationGetRelationName(rel));
    1420         5503 :             ScanKeyEntryInitialize(inskey.scankeys + i,
    1421              :                                    bkey->sk_flags,
    1422         5503 :                                    bkey->sk_attno,
    1423              :                                    InvalidStrategy,
    1424              :                                    bkey->sk_subtype,
    1425              :                                    bkey->sk_collation,
    1426              :                                    cmp_proc,
    1427              :                                    bkey->sk_argument);
    1428              :         }
    1429              :     }
    1430              : 
    1431              :     /*----------
    1432              :      * Examine the selected initial-positioning strategy to determine exactly
    1433              :      * where we need to start the scan, and set flag variables to control the
    1434              :      * initial descent by _bt_search (and our _bt_binsrch call for the leaf
    1435              :      * page _bt_search returns).
    1436              :      *----------
    1437              :      */
    1438      8590510 :     _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
    1439      8590510 :     inskey.anynullkeys = false; /* unused */
    1440      8590510 :     inskey.scantid = NULL;
    1441      8590510 :     inskey.keysz = keysz;
    1442      8590510 :     switch (strat_total)
    1443              :     {
    1444        30602 :         case BTLessStrategyNumber:
    1445              : 
    1446        30602 :             inskey.nextkey = false;
    1447        30602 :             inskey.backward = true;
    1448        30602 :             break;
    1449              : 
    1450            9 :         case BTLessEqualStrategyNumber:
    1451              : 
    1452            9 :             inskey.nextkey = true;
    1453            9 :             inskey.backward = true;
    1454            9 :             break;
    1455              : 
    1456      7681705 :         case BTEqualStrategyNumber:
    1457              : 
    1458              :             /*
    1459              :              * If a backward scan was specified, need to start with last equal
    1460              :              * item not first one.
    1461              :              */
    1462      7681705 :             if (ScanDirectionIsBackward(dir))
    1463              :             {
    1464              :                 /*
    1465              :                  * This is the same as the <= strategy
    1466              :                  */
    1467          103 :                 inskey.nextkey = true;
    1468          103 :                 inskey.backward = true;
    1469              :             }
    1470              :             else
    1471              :             {
    1472              :                 /*
    1473              :                  * This is the same as the >= strategy
    1474              :                  */
    1475      7681602 :                 inskey.nextkey = false;
    1476      7681602 :                 inskey.backward = false;
    1477              :             }
    1478      7681705 :             break;
    1479              : 
    1480         5063 :         case BTGreaterEqualStrategyNumber:
    1481              : 
    1482              :             /*
    1483              :              * Find first item >= scankey
    1484              :              */
    1485         5063 :             inskey.nextkey = false;
    1486         5063 :             inskey.backward = false;
    1487         5063 :             break;
    1488              : 
    1489       873131 :         case BTGreaterStrategyNumber:
    1490              : 
    1491              :             /*
    1492              :              * Find first item > scankey
    1493              :              */
    1494       873131 :             inskey.nextkey = true;
    1495       873131 :             inskey.backward = false;
    1496       873131 :             break;
    1497              : 
    1498            0 :         default:
    1499              :             /* can't get here, but keep compiler quiet */
    1500            0 :             elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
    1501              :             return false;
    1502              :     }
    1503              : 
    1504              :     /*
    1505              :      * Use the manufactured insertion scan key to descend the tree and
    1506              :      * position ourselves on the target leaf page.
    1507              :      */
    1508              :     Assert(ScanDirectionIsBackward(dir) == inskey.backward);
    1509      8590510 :     stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
    1510              : 
    1511              :     /* don't need to keep the stack around... */
    1512      8590510 :     _bt_freestack(stack);
    1513              : 
    1514      8590510 :     if (!BufferIsValid(so->currPos.buf))
    1515              :     {
    1516              :         Assert(!so->needPrimScan);
    1517              : 
    1518              :         /*
    1519              :          * We only get here if the index is completely empty. Lock relation
    1520              :          * because nothing finer to lock exists.  Without a buffer lock, it's
    1521              :          * possible for another transaction to insert data between
    1522              :          * _bt_search() and PredicateLockRelation().  We have to try again
    1523              :          * after taking the relation-level predicate lock, to close a narrow
    1524              :          * window where we wouldn't scan concurrently inserted tuples, but the
    1525              :          * writer wouldn't see our predicate lock.
    1526              :          */
    1527       290813 :         if (IsolationIsSerializable())
    1528              :         {
    1529         2831 :             PredicateLockRelation(rel, scan->xs_snapshot);
    1530         2831 :             stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
    1531         2831 :             _bt_freestack(stack);
    1532              :         }
    1533              : 
    1534       290813 :         if (!BufferIsValid(so->currPos.buf))
    1535              :         {
    1536       290813 :             _bt_parallel_done(scan);
    1537       290813 :             return false;
    1538              :         }
    1539              :     }
    1540              : 
    1541              :     /* position to the precise item on the page */
    1542      8299697 :     offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
    1543              : 
    1544              :     /*
    1545              :      * Now load data from the first page of the scan (usually the page
    1546              :      * currently in so->currPos.buf).
    1547              :      *
    1548              :      * If inskey.nextkey = false and inskey.backward = false, offnum is
    1549              :      * positioned at the first non-pivot tuple >= inskey.scankeys.
    1550              :      *
    1551              :      * If inskey.nextkey = false and inskey.backward = true, offnum is
    1552              :      * positioned at the last non-pivot tuple < inskey.scankeys.
    1553              :      *
    1554              :      * If inskey.nextkey = true and inskey.backward = false, offnum is
    1555              :      * positioned at the first non-pivot tuple > inskey.scankeys.
    1556              :      *
    1557              :      * If inskey.nextkey = true and inskey.backward = true, offnum is
    1558              :      * positioned at the last non-pivot tuple <= inskey.scankeys.
    1559              :      *
    1560              :      * It's possible that _bt_binsrch returned an offnum that is out of bounds
    1561              :      * for the page.  For example, when inskey is both < the leaf page's high
    1562              :      * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
    1563              :      */
    1564      8299697 :     if (!_bt_readfirstpage(scan, offnum, dir))
    1565      2094018 :         return false;
    1566              : 
    1567      6205679 :     _bt_returnitem(scan, so);
    1568      6205679 :     return true;
    1569              : }
    1570              : 
    1571              : /*
    1572              :  *  _bt_next() -- Get the next item in a scan.
    1573              :  *
    1574              :  *      On entry, so->currPos describes the current page, which may be pinned
    1575              :  *      but is not locked, and so->currPos.itemIndex identifies which item was
    1576              :  *      previously returned.
    1577              :  *
    1578              :  *      On success exit, so->currPos is updated as needed, and _bt_returnitem
    1579              :  *      sets the next item to return to the scan.  so->currPos remains valid.
    1580              :  *
    1581              :  *      On failure exit (no more tuples), we invalidate so->currPos.  It'll
    1582              :  *      still be possible for the scan to return tuples by changing direction,
    1583              :  *      though we'll need to call _bt_first anew in that other direction.
    1584              :  */
    1585              : bool
    1586     10291843 : _bt_next(IndexScanDesc scan, ScanDirection dir)
    1587              : {
    1588     10291843 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1589              : 
    1590              :     Assert(BTScanPosIsValid(so->currPos));
    1591              : 
    1592              :     /*
    1593              :      * Advance to next tuple on current page; or if there's no more, try to
    1594              :      * step to the next page with data.
    1595              :      */
    1596     10291843 :     if (ScanDirectionIsForward(dir))
    1597              :     {
    1598     10279844 :         if (++so->currPos.itemIndex > so->currPos.lastItem)
    1599              :         {
    1600      1406513 :             if (!_bt_steppage(scan, dir))
    1601      1393140 :                 return false;
    1602              :         }
    1603              :     }
    1604              :     else
    1605              :     {
    1606        11999 :         if (--so->currPos.itemIndex < so->currPos.firstItem)
    1607              :         {
    1608           56 :             if (!_bt_steppage(scan, dir))
    1609           46 :                 return false;
    1610              :         }
    1611              :     }
    1612              : 
    1613      8898657 :     _bt_returnitem(scan, so);
    1614      8898657 :     return true;
    1615              : }
    1616              : 
    1617              : /*
    1618              :  * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
    1619              :  * index scan by setting the relevant fields in caller's index scan descriptor
    1620              :  */
    1621              : static inline void
    1622     15145889 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
    1623              : {
    1624     15145889 :     BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
    1625              : 
    1626              :     /* Most recent _bt_readpage must have succeeded */
    1627              :     Assert(BTScanPosIsValid(so->currPos));
    1628              :     Assert(so->currPos.itemIndex >= so->currPos.firstItem);
    1629              :     Assert(so->currPos.itemIndex <= so->currPos.lastItem);
    1630              : 
    1631              :     /* Return next item, per amgettuple contract */
    1632     15145889 :     scan->xs_heaptid = currItem->heapTid;
    1633     15145889 :     if (so->currTuples)
    1634      2110424 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1635     15145889 : }
    1636              : 
    1637              : /*
    1638              :  *  _bt_steppage() -- Step to next page containing valid data for scan
    1639              :  *
    1640              :  * Wrapper on _bt_readnextpage that performs final steps for the current page.
    1641              :  *
    1642              :  * On entry, so->currPos must be valid.  Its buffer will be pinned, though
    1643              :  * never locked. (Actually, when so->dropPin there won't even be a pin held,
    1644              :  * though so->currPos.currPage must still be set to a valid block number.)
    1645              :  */
    1646              : static bool
    1647      3501570 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
    1648              : {
    1649      3501570 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1650              :     BlockNumber blkno,
    1651              :                 lastcurrblkno;
    1652              : 
    1653              :     Assert(BTScanPosIsValid(so->currPos));
    1654              : 
    1655              :     /* Before leaving current page, deal with any killed items */
    1656      3501570 :     if (so->numKilled > 0)
    1657        42247 :         _bt_killitems(scan);
    1658              : 
    1659              :     /*
    1660              :      * Before we modify currPos, make a copy of the page data if there was a
    1661              :      * mark position that needs it.
    1662              :      */
    1663      3501570 :     if (so->markItemIndex >= 0)
    1664              :     {
    1665              :         /* bump pin on current buffer for assignment to mark buffer */
    1666          185 :         if (BTScanPosIsPinned(so->currPos))
    1667          174 :             IncrBufferRefCount(so->currPos.buf);
    1668          185 :         memcpy(&so->markPos, &so->currPos,
    1669              :                offsetof(BTScanPosData, items[1]) +
    1670          185 :                so->currPos.lastItem * sizeof(BTScanPosItem));
    1671          185 :         if (so->markTuples)
    1672          174 :             memcpy(so->markTuples, so->currTuples,
    1673          174 :                    so->currPos.nextTupleOffset);
    1674          185 :         so->markPos.itemIndex = so->markItemIndex;
    1675          185 :         so->markItemIndex = -1;
    1676              : 
    1677              :         /*
    1678              :          * If we're just about to start the next primitive index scan
    1679              :          * (possible with a scan that has arrays keys, and needs to skip to
    1680              :          * continue in the current scan direction), moreLeft/moreRight only
    1681              :          * indicate the end of the current primitive index scan.  They must
    1682              :          * never be taken to indicate that the top-level index scan has ended
    1683              :          * (that would be wrong).
    1684              :          *
    1685              :          * We could handle this case by treating the current array keys as
    1686              :          * markPos state.  But depending on the current array state like this
    1687              :          * would add complexity.  Instead, we just unset markPos's copy of
    1688              :          * moreRight or moreLeft (whichever might be affected), while making
    1689              :          * btrestrpos reset the scan's arrays to their initial scan positions.
    1690              :          * In effect, btrestrpos leaves advancing the arrays up to the first
    1691              :          * _bt_readpage call (that takes place after it has restored markPos).
    1692              :          */
    1693          185 :         if (so->needPrimScan)
    1694              :         {
    1695            0 :             if (ScanDirectionIsForward(so->currPos.dir))
    1696            0 :                 so->markPos.moreRight = true;
    1697              :             else
    1698            0 :                 so->markPos.moreLeft = true;
    1699              :         }
    1700              : 
    1701              :         /* mark/restore not supported by parallel scans */
    1702              :         Assert(!scan->parallel_scan);
    1703              :     }
    1704              : 
    1705      3501570 :     BTScanPosUnpinIfPinned(so->currPos);
    1706              : 
    1707              :     /* Walk to the next page with data */
    1708      3501570 :     if (ScanDirectionIsForward(dir))
    1709      3501450 :         blkno = so->currPos.nextPage;
    1710              :     else
    1711          120 :         blkno = so->currPos.prevPage;
    1712      3501570 :     lastcurrblkno = so->currPos.currPage;
    1713              : 
    1714              :     /*
    1715              :      * Cancel primitive index scans that were scheduled when the call to
    1716              :      * _bt_readpage for currPos happened to use the opposite direction to the
    1717              :      * one that we're stepping in now.  (It's okay to leave the scan's array
    1718              :      * keys as-is, since the next _bt_readpage will advance them.)
    1719              :      */
    1720      3501570 :     if (so->currPos.dir != dir)
    1721           18 :         so->needPrimScan = false;
    1722              : 
    1723      3501570 :     return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
    1724              : }
    1725              : 
    1726              : /*
    1727              :  *  _bt_readfirstpage() -- Read first page containing valid data for _bt_first
    1728              :  *
    1729              :  * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
    1730              :  * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
    1731              :  * When we're passed an offnum past the end of the page, we might still manage
    1732              :  * to stop the scan on this page by calling _bt_checkkeys against the high
    1733              :  * key.  See _bt_readpage for full details.
    1734              :  *
    1735              :  * On entry, so->currPos must be pinned and locked (so offnum stays valid).
    1736              :  * Parallel scan callers must have seized the scan before calling here.
    1737              :  *
    1738              :  * On exit, we'll have updated so->currPos and retained locks and pins
    1739              :  * according to the same rules as those laid out for _bt_readnextpage exit.
    1740              :  * Like _bt_readnextpage, our return value indicates if there are any matching
    1741              :  * records in the given direction.
    1742              :  *
    1743              :  * We always release the scan for a parallel scan caller, regardless of
    1744              :  * success or failure; we'll call _bt_parallel_release as soon as possible.
    1745              :  */
    1746              : static bool
    1747      8342127 : _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
    1748              : {
    1749      8342127 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1750              : 
    1751      8342127 :     so->numKilled = 0;           /* just paranoia */
    1752      8342127 :     so->markItemIndex = -1;      /* ditto */
    1753              : 
    1754              :     /* Initialize so->currPos for the first page (page in so->currPos.buf) */
    1755      8342127 :     if (so->needPrimScan)
    1756              :     {
    1757              :         Assert(so->numArrayKeys);
    1758              : 
    1759         8794 :         so->currPos.moreLeft = true;
    1760         8794 :         so->currPos.moreRight = true;
    1761         8794 :         so->needPrimScan = false;
    1762              :     }
    1763      8333333 :     else if (ScanDirectionIsForward(dir))
    1764              :     {
    1765      8302604 :         so->currPos.moreLeft = false;
    1766      8302604 :         so->currPos.moreRight = true;
    1767              :     }
    1768              :     else
    1769              :     {
    1770        30729 :         so->currPos.moreLeft = true;
    1771        30729 :         so->currPos.moreRight = false;
    1772              :     }
    1773              : 
    1774              :     /*
    1775              :      * Attempt to load matching tuples from the first page.
    1776              :      *
    1777              :      * Note that _bt_readpage will finish initializing the so->currPos fields.
    1778              :      * _bt_readpage also releases parallel scan (even when it returns false).
    1779              :      */
    1780      8342127 :     if (_bt_readpage(scan, dir, offnum, true))
    1781              :     {
    1782      6247126 :         Relation    rel = scan->indexRelation;
    1783              : 
    1784              :         /*
    1785              :          * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
    1786              :          * so->currPos.buf in preparation for btgettuple returning tuples.
    1787              :          */
    1788              :         Assert(BTScanPosIsPinned(so->currPos));
    1789      6247126 :         _bt_drop_lock_and_maybe_pin(rel, so);
    1790      6247126 :         return true;
    1791              :     }
    1792              : 
    1793              :     /* There's no actually-matching data on the page in so->currPos.buf */
    1794      2095001 :     _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
    1795              : 
    1796              :     /* Call _bt_readnextpage using its _bt_steppage wrapper function */
    1797      2095001 :     if (!_bt_steppage(scan, dir))
    1798      2094909 :         return false;
    1799              : 
    1800              :     /* _bt_readpage for a later page (now in so->currPos) succeeded */
    1801           92 :     return true;
    1802              : }
    1803              : 
    1804              : /*
    1805              :  *  _bt_readnextpage() -- Read next page containing valid data for _bt_next
    1806              :  *
    1807              :  * Caller's blkno is the next interesting page's link, taken from either the
    1808              :  * previously-saved right link or left link.  lastcurrblkno is the page that
    1809              :  * was current at the point where the blkno link was saved, which we use to
    1810              :  * reason about concurrent page splits/page deletions during backwards scans.
    1811              :  * In the common case where seized=false, blkno is either so->currPos.nextPage
    1812              :  * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
    1813              :  *
    1814              :  * On entry, so->currPos shouldn't be locked by caller.  so->currPos.buf must
    1815              :  * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
    1816              :  * won't need to be read again in almost all cases).  Parallel scan callers
    1817              :  * that seized the scan before calling here should pass seized=true; such a
    1818              :  * caller's blkno and lastcurrblkno arguments come from the seized scan.
    1819              :  * seized=false callers just pass us the blkno/lastcurrblkno taken from their
    1820              :  * so->currPos, which (along with so->currPos itself) can be used to end the
    1821              :  * scan.  A seized=false caller's blkno can never be assumed to be the page
    1822              :  * that must be read next during a parallel scan, though.  We must figure that
    1823              :  * part out for ourselves by seizing the scan (the correct page to read might
    1824              :  * already be beyond the seized=false caller's blkno during a parallel scan,
    1825              :  * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
    1826              :  * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
    1827              :  *
    1828              :  * On success exit, so->currPos is updated to contain data from the next
    1829              :  * interesting page, and we return true.  We hold a pin on the buffer on
    1830              :  * success exit (except during so->dropPin index scans, when we drop the pin
    1831              :  * eagerly to avoid blocking VACUUM).
    1832              :  *
    1833              :  * If there are no more matching records in the given direction, we invalidate
    1834              :  * so->currPos (while ensuring it retains no locks or pins), and return false.
    1835              :  *
    1836              :  * We always release the scan for a parallel scan caller, regardless of
    1837              :  * success or failure; we'll call _bt_parallel_release as soon as possible.
    1838              :  */
    1839              : static bool
    1840      3501584 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
    1841              :                  BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
    1842              : {
    1843      3501584 :     Relation    rel = scan->indexRelation;
    1844      3501584 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1845              : 
    1846              :     Assert(so->currPos.currPage == lastcurrblkno || seized);
    1847              :     Assert(!(blkno == P_NONE && seized));
    1848              :     Assert(!BTScanPosIsPinned(so->currPos));
    1849              : 
    1850              :     /*
    1851              :      * Remember that the scan already read lastcurrblkno, a page to the left
    1852              :      * of blkno (or remember reading a page to the right, for backwards scans)
    1853              :      */
    1854      3501584 :     if (ScanDirectionIsForward(dir))
    1855      3501464 :         so->currPos.moreLeft = true;
    1856              :     else
    1857          120 :         so->currPos.moreRight = true;
    1858              : 
    1859              :     for (;;)
    1860         1206 :     {
    1861              :         Page        page;
    1862              :         BTPageOpaque opaque;
    1863              : 
    1864      3502790 :         if (blkno == P_NONE ||
    1865              :             (ScanDirectionIsForward(dir) ?
    1866      1095611 :              !so->currPos.moreRight : !so->currPos.moreLeft))
    1867              :         {
    1868              :             /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
    1869              :             Assert(so->currPos.currPage == lastcurrblkno && !seized);
    1870      3488081 :             BTScanPosInvalidate(so->currPos);
    1871      3488081 :             _bt_parallel_done(scan);    /* iff !so->needPrimScan */
    1872      3488081 :             return false;
    1873              :         }
    1874              : 
    1875              :         Assert(!so->needPrimScan);
    1876              : 
    1877              :         /* parallel scan must never actually visit so->currPos blkno */
    1878        14709 :         if (!seized && scan->parallel_scan != NULL &&
    1879          606 :             !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
    1880              :         {
    1881              :             /* whole scan is now done (or another primitive scan required) */
    1882           14 :             BTScanPosInvalidate(so->currPos);
    1883           14 :             return false;
    1884              :         }
    1885              : 
    1886        14695 :         if (ScanDirectionIsForward(dir))
    1887              :         {
    1888              :             /* read blkno, but check for interrupts first */
    1889        14626 :             CHECK_FOR_INTERRUPTS();
    1890        14626 :             so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    1891              :         }
    1892              :         else
    1893              :         {
    1894              :             /* read blkno, avoiding race (also checks for interrupts) */
    1895           69 :             so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
    1896              :                                                          lastcurrblkno);
    1897           69 :             if (so->currPos.buf == InvalidBuffer)
    1898              :             {
    1899              :                 /* must have been a concurrent deletion of leftmost page */
    1900            0 :                 BTScanPosInvalidate(so->currPos);
    1901            0 :                 _bt_parallel_done(scan);
    1902            0 :                 return false;
    1903              :             }
    1904              :         }
    1905              : 
    1906        14695 :         page = BufferGetPage(so->currPos.buf);
    1907        14695 :         opaque = BTPageGetOpaque(page);
    1908        14695 :         lastcurrblkno = blkno;
    1909        14695 :         if (likely(!P_IGNORE(opaque)))
    1910              :         {
    1911              :             /* see if there are any matches on this page */
    1912        14695 :             if (ScanDirectionIsForward(dir))
    1913              :             {
    1914              :                 /* note that this will clear moreRight if we can stop */
    1915        14626 :                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
    1916        13430 :                     break;
    1917         1196 :                 blkno = so->currPos.nextPage;
    1918              :             }
    1919              :             else
    1920              :             {
    1921              :                 /* note that this will clear moreLeft if we can stop */
    1922           69 :                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
    1923           59 :                     break;
    1924           10 :                 blkno = so->currPos.prevPage;
    1925              :             }
    1926              :         }
    1927              :         else
    1928              :         {
    1929              :             /* _bt_readpage not called, so do all this for ourselves */
    1930            0 :             if (ScanDirectionIsForward(dir))
    1931            0 :                 blkno = opaque->btpo_next;
    1932              :             else
    1933            0 :                 blkno = opaque->btpo_prev;
    1934            0 :             if (scan->parallel_scan != NULL)
    1935            0 :                 _bt_parallel_release(scan, blkno, lastcurrblkno);
    1936              :         }
    1937              : 
    1938              :         /* no matching tuples on this page */
    1939         1206 :         _bt_relbuf(rel, so->currPos.buf);
    1940         1206 :         seized = false;         /* released by _bt_readpage (or by us) */
    1941              :     }
    1942              : 
    1943              :     /*
    1944              :      * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
    1945              :      * so->currPos.buf in preparation for btgettuple returning tuples.
    1946              :      */
    1947              :     Assert(so->currPos.currPage == blkno);
    1948              :     Assert(BTScanPosIsPinned(so->currPos));
    1949        13489 :     _bt_drop_lock_and_maybe_pin(rel, so);
    1950              : 
    1951        13489 :     return true;
    1952              : }
    1953              : 
    1954              : /*
    1955              :  * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
    1956              :  * recovering from concurrent page splits/page deletions when necessary
    1957              :  *
    1958              :  * Called during backwards scans, to deal with their unique concurrency rules.
    1959              :  *
    1960              :  * blkno points to the block number of the page that we expect to move the
    1961              :  * scan to.  We'll successfully move the scan there when we find that its
    1962              :  * right sibling link still points to lastcurrblkno (the page we just read).
    1963              :  * Otherwise, we have to figure out which page is the correct one for the scan
    1964              :  * to now read the hard way, reasoning about concurrent splits and deletions.
    1965              :  * See nbtree/README.
    1966              :  *
    1967              :  * On return, we have both a pin and a read lock on the returned page, whose
    1968              :  * block number will be set in *blkno.  Returns InvalidBuffer if there is no
    1969              :  * page to the left (no lock or pin is held in that case).
    1970              :  *
    1971              :  * It is possible for the returned leaf page to be half-dead; caller must
    1972              :  * check that condition and step left again when required.
    1973              :  */
    1974              : static Buffer
    1975           69 : _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
    1976              :                            BlockNumber lastcurrblkno)
    1977              : {
    1978           69 :     BlockNumber origblkno = *blkno; /* detects circular links */
    1979              : 
    1980              :     for (;;)
    1981            0 :     {
    1982              :         Buffer      buf;
    1983              :         Page        page;
    1984              :         BTPageOpaque opaque;
    1985              :         int         tries;
    1986              : 
    1987              :         /* check for interrupts while we're not holding any buffer lock */
    1988           69 :         CHECK_FOR_INTERRUPTS();
    1989           69 :         buf = _bt_getbuf(rel, *blkno, BT_READ);
    1990           69 :         page = BufferGetPage(buf);
    1991           69 :         opaque = BTPageGetOpaque(page);
    1992              : 
    1993              :         /*
    1994              :          * If this isn't the page we want, walk right till we find what we
    1995              :          * want --- but go no more than four hops (an arbitrary limit). If we
    1996              :          * don't find the correct page by then, the most likely bet is that
    1997              :          * lastcurrblkno got deleted and isn't in the sibling chain at all
    1998              :          * anymore, not that its left sibling got split more than four times.
    1999              :          *
    2000              :          * Note that it is correct to test P_ISDELETED not P_IGNORE here,
    2001              :          * because half-dead pages are still in the sibling chain.
    2002              :          */
    2003           69 :         tries = 0;
    2004              :         for (;;)
    2005              :         {
    2006           69 :             if (likely(!P_ISDELETED(opaque) &&
    2007              :                        opaque->btpo_next == lastcurrblkno))
    2008              :             {
    2009              :                 /* Found desired page, return it */
    2010           69 :                 return buf;
    2011              :             }
    2012            0 :             if (P_RIGHTMOST(opaque) || ++tries > 4)
    2013              :                 break;
    2014              :             /* step right */
    2015            0 :             *blkno = opaque->btpo_next;
    2016            0 :             buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
    2017            0 :             page = BufferGetPage(buf);
    2018            0 :             opaque = BTPageGetOpaque(page);
    2019              :         }
    2020              : 
    2021              :         /*
    2022              :          * Return to the original page (usually the page most recently read by
    2023              :          * _bt_readpage, which is passed by caller as lastcurrblkno) to see
    2024              :          * what's up with its prev sibling link
    2025              :          */
    2026            0 :         buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
    2027            0 :         page = BufferGetPage(buf);
    2028            0 :         opaque = BTPageGetOpaque(page);
    2029            0 :         if (P_ISDELETED(opaque))
    2030              :         {
    2031              :             /*
    2032              :              * It was deleted.  Move right to first nondeleted page (there
    2033              :              * must be one); that is the page that has acquired the deleted
    2034              :              * one's keyspace, so stepping left from it will take us where we
    2035              :              * want to be.
    2036              :              */
    2037              :             for (;;)
    2038              :             {
    2039            0 :                 if (P_RIGHTMOST(opaque))
    2040            0 :                     elog(ERROR, "fell off the end of index \"%s\"",
    2041              :                          RelationGetRelationName(rel));
    2042            0 :                 lastcurrblkno = opaque->btpo_next;
    2043            0 :                 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
    2044            0 :                 page = BufferGetPage(buf);
    2045            0 :                 opaque = BTPageGetOpaque(page);
    2046            0 :                 if (!P_ISDELETED(opaque))
    2047            0 :                     break;
    2048              :             }
    2049              :         }
    2050              :         else
    2051              :         {
    2052              :             /*
    2053              :              * Original lastcurrblkno wasn't deleted; the explanation had
    2054              :              * better be that the page to the left got split or deleted.
    2055              :              * Without this check, we risk going into an infinite loop.
    2056              :              */
    2057            0 :             if (opaque->btpo_prev == origblkno)
    2058            0 :                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
    2059              :                      lastcurrblkno, RelationGetRelationName(rel));
    2060              :             /* Okay to try again, since left sibling link changed */
    2061              :         }
    2062              : 
    2063              :         /*
    2064              :          * Original lastcurrblkno from caller was concurrently deleted (could
    2065              :          * also have been a great many concurrent left sibling page splits).
    2066              :          * Found a non-deleted page that should now act as our lastcurrblkno.
    2067              :          */
    2068            0 :         if (P_LEFTMOST(opaque))
    2069              :         {
    2070              :             /* New lastcurrblkno has no left sibling (concurrently deleted) */
    2071            0 :             _bt_relbuf(rel, buf);
    2072            0 :             break;
    2073              :         }
    2074              : 
    2075              :         /* Start from scratch with new lastcurrblkno's blkno/prev link */
    2076            0 :         *blkno = origblkno = opaque->btpo_prev;
    2077            0 :         _bt_relbuf(rel, buf);
    2078              :     }
    2079              : 
    2080            0 :     return InvalidBuffer;
    2081              : }
    2082              : 
    2083              : /*
    2084              :  * _bt_get_endpoint() -- Find the first or last page on a given tree level
    2085              :  *
    2086              :  * If the index is empty, we will return InvalidBuffer; any other failure
    2087              :  * condition causes ereport().  We will not return a dead page.
    2088              :  *
    2089              :  * The returned buffer is pinned and read-locked.
    2090              :  */
    2091              : Buffer
    2092        46304 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
    2093              : {
    2094              :     Buffer      buf;
    2095              :     Page        page;
    2096              :     BTPageOpaque opaque;
    2097              :     OffsetNumber offnum;
    2098              :     BlockNumber blkno;
    2099              :     IndexTuple  itup;
    2100              : 
    2101              :     /*
    2102              :      * If we are looking for a leaf page, okay to descend from fast root;
    2103              :      * otherwise better descend from true root.  (There is no point in being
    2104              :      * smarter about intermediate levels.)
    2105              :      */
    2106        46304 :     if (level == 0)
    2107        46295 :         buf = _bt_getroot(rel, NULL, BT_READ);
    2108              :     else
    2109            9 :         buf = _bt_gettrueroot(rel);
    2110              : 
    2111        46304 :     if (!BufferIsValid(buf))
    2112         3865 :         return InvalidBuffer;
    2113              : 
    2114        42439 :     page = BufferGetPage(buf);
    2115        42439 :     opaque = BTPageGetOpaque(page);
    2116              : 
    2117              :     for (;;)
    2118              :     {
    2119              :         /*
    2120              :          * If we landed on a deleted page, step right to find a live page
    2121              :          * (there must be one).  Also, if we want the rightmost page, step
    2122              :          * right if needed to get to it (this could happen if the page split
    2123              :          * since we obtained a pointer to it).
    2124              :          */
    2125        54426 :         while (P_IGNORE(opaque) ||
    2126           33 :                (rightmost && !P_RIGHTMOST(opaque)))
    2127              :         {
    2128            0 :             blkno = opaque->btpo_next;
    2129            0 :             if (blkno == P_NONE)
    2130            0 :                 elog(ERROR, "fell off the end of index \"%s\"",
    2131              :                      RelationGetRelationName(rel));
    2132            0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2133            0 :             page = BufferGetPage(buf);
    2134            0 :             opaque = BTPageGetOpaque(page);
    2135              :         }
    2136              : 
    2137              :         /* Done? */
    2138        54426 :         if (opaque->btpo_level == level)
    2139        42439 :             break;
    2140        11987 :         if (opaque->btpo_level < level)
    2141            0 :             ereport(ERROR,
    2142              :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2143              :                      errmsg_internal("btree level %u not found in index \"%s\"",
    2144              :                                      level, RelationGetRelationName(rel))));
    2145              : 
    2146              :         /* Descend to leftmost or rightmost child page */
    2147        11987 :         if (rightmost)
    2148            3 :             offnum = PageGetMaxOffsetNumber(page);
    2149              :         else
    2150        11984 :             offnum = P_FIRSTDATAKEY(opaque);
    2151              : 
    2152        11987 :         if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page))
    2153            0 :             elog(PANIC, "offnum out of range");
    2154              : 
    2155        11987 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    2156        11987 :         blkno = BTreeTupleGetDownLink(itup);
    2157              : 
    2158        11987 :         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2159        11987 :         page = BufferGetPage(buf);
    2160        11987 :         opaque = BTPageGetOpaque(page);
    2161              :     }
    2162              : 
    2163        42439 :     return buf;
    2164              : }
    2165              : 
    2166              : /*
    2167              :  *  _bt_endpoint() -- Find the first or last page in the index, and scan
    2168              :  * from there to the first key satisfying all the quals.
    2169              :  *
    2170              :  * This is used by _bt_first() to set up a scan when we've determined
    2171              :  * that the scan must start at the beginning or end of the index (for
    2172              :  * a forward or backward scan respectively).
    2173              :  *
    2174              :  * Parallel scan callers must have seized the scan before calling here.
    2175              :  * Exit conditions are the same as for _bt_first().
    2176              :  */
    2177              : static bool
    2178        46295 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
    2179              : {
    2180        46295 :     Relation    rel = scan->indexRelation;
    2181        46295 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2182              :     Page        page;
    2183              :     BTPageOpaque opaque;
    2184              :     OffsetNumber start;
    2185              : 
    2186              :     Assert(!BTScanPosIsValid(so->currPos));
    2187              :     Assert(!so->needPrimScan);
    2188              : 
    2189              :     /*
    2190              :      * Scan down to the leftmost or rightmost leaf page.  This is a simplified
    2191              :      * version of _bt_search().
    2192              :      */
    2193        46295 :     so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
    2194              : 
    2195        46295 :     if (!BufferIsValid(so->currPos.buf))
    2196              :     {
    2197              :         /*
    2198              :          * Empty index. Lock the whole relation, as nothing finer to lock
    2199              :          * exists.
    2200              :          */
    2201         3865 :         PredicateLockRelation(rel, scan->xs_snapshot);
    2202         3865 :         _bt_parallel_done(scan);
    2203         3865 :         return false;
    2204              :     }
    2205              : 
    2206        42430 :     page = BufferGetPage(so->currPos.buf);
    2207        42430 :     opaque = BTPageGetOpaque(page);
    2208              :     Assert(P_ISLEAF(opaque));
    2209              : 
    2210        42430 :     if (ScanDirectionIsForward(dir))
    2211              :     {
    2212              :         /* There could be dead pages to the left, so not this: */
    2213              :         /* Assert(P_LEFTMOST(opaque)); */
    2214              : 
    2215        42400 :         start = P_FIRSTDATAKEY(opaque);
    2216              :     }
    2217           30 :     else if (ScanDirectionIsBackward(dir))
    2218              :     {
    2219              :         Assert(P_RIGHTMOST(opaque));
    2220              : 
    2221           30 :         start = PageGetMaxOffsetNumber(page);
    2222              :     }
    2223              :     else
    2224              :     {
    2225            0 :         elog(ERROR, "invalid scan direction: %d", (int) dir);
    2226              :         start = 0;              /* keep compiler quiet */
    2227              :     }
    2228              : 
    2229              :     /*
    2230              :      * Now load data from the first page of the scan.
    2231              :      */
    2232        42430 :     if (!_bt_readfirstpage(scan, start, dir))
    2233          891 :         return false;
    2234              : 
    2235        41539 :     _bt_returnitem(scan, so);
    2236        41539 :     return true;
    2237              : }
        

Generated by: LCOV version 2.0-1