LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsearch.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 88.1 % 545 480
Test Date: 2026-03-15 16:15:05 Functions: 100.0 % 16 16
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * nbtsearch.c
       4              :  *    Search code for postgres btrees.
       5              :  *
       6              :  *
       7              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8              :  * Portions Copyright (c) 1994, Regents of the University of California
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/access/nbtree/nbtsearch.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : 
      16              : #include "postgres.h"
      17              : 
      18              : #include "access/nbtree.h"
      19              : #include "access/relscan.h"
      20              : #include "access/xact.h"
      21              : #include "executor/instrument_node.h"
      22              : #include "miscadmin.h"
      23              : #include "pgstat.h"
      24              : #include "storage/predicate.h"
      25              : #include "utils/lsyscache.h"
      26              : #include "utils/rel.h"
      27              : 
      28              : 
      29              : static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
      30              : static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
      31              :                             Buffer buf, bool forupdate, BTStack stack,
      32              :                             int access);
      33              : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
      34              : static int  _bt_binsrch_posting(BTScanInsert key, Page page,
      35              :                                 OffsetNumber offnum);
      36              : static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
      37              : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
      38              : static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
      39              :                               ScanDirection dir);
      40              : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
      41              :                              BlockNumber lastcurrblkno, ScanDirection dir,
      42              :                              bool seized);
      43              : static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
      44              :                                          BlockNumber lastcurrblkno);
      45              : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
      46              : 
      47              : 
      48              : /*
      49              :  *  _bt_drop_lock_and_maybe_pin()
      50              :  *
      51              :  * Unlock so->currPos.buf.  If scan is so->dropPin, drop the pin, too.
      52              :  * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
      53              :  */
      54              : static inline void
      55      6415080 : _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
      56              : {
      57      6415080 :     if (!so->dropPin)
      58              :     {
      59              :         /* Just drop the lock (not the pin) */
      60       260200 :         _bt_unlockbuf(rel, so->currPos.buf);
      61       260200 :         return;
      62              :     }
      63              : 
      64              :     /*
      65              :      * Drop both the lock and the pin.
      66              :      *
      67              :      * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
      68              :      * when concurrent heap TID recycling by VACUUM might have taken place.
      69              :      */
      70      6154880 :     so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
      71      6154880 :     _bt_relbuf(rel, so->currPos.buf);
      72      6154880 :     so->currPos.buf = InvalidBuffer;
      73              : }
      74              : 
      75              : /*
      76              :  *  _bt_search() -- Search the tree for a particular scankey,
      77              :  *      or more precisely for the first leaf page it could be on.
      78              :  *
      79              :  * The passed scankey is an insertion-type scankey (see nbtree/README),
      80              :  * but it can omit the rightmost column(s) of the index.
      81              :  *
      82              :  * If returnstack is true, return value is a stack of parent-page pointers
      83              :  * (i.e. there is no entry for the leaf level/page).  If returnstack is false,
      84              :  * we just return NULL.  This scheme allows callers that don't need a descent
      85              :  * stack to avoid palloc churn.
      86              :  *
      87              :  * When we return, *bufP is set to the address of the leaf-page buffer, which
      88              :  * is locked and pinned.  No locks are held on the parent pages, however!
      89              :  *
      90              :  * The returned buffer is locked according to access parameter.  Additionally,
      91              :  * access = BT_WRITE will allow an empty root page to be created and returned.
      92              :  * When access = BT_READ, an empty index will result in *bufP being set to
      93              :  * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
      94              :  * during the search will be finished.
      95              :  *
      96              :  * heaprel must be provided by callers that pass access = BT_WRITE, since we
      97              :  * might need to allocate a new root page for caller -- see _bt_allocbuf.
      98              :  */
      99              : BTStack
     100     13029400 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
     101              :            int access, bool returnstack)
     102              : {
     103     13029400 :     BTStack     stack_in = NULL;
     104     13029400 :     int         page_access = BT_READ;
     105              : 
     106              :     /* heaprel must be set whenever _bt_allocbuf is reachable */
     107              :     Assert(access == BT_READ || access == BT_WRITE);
     108              :     Assert(access == BT_READ || heaprel != NULL);
     109              : 
     110              :     /* Get the root page to start with */
     111     13029400 :     *bufP = _bt_getroot(rel, heaprel, access);
     112              : 
     113              :     /* If index is empty and access = BT_READ, no root page is created. */
     114     13029400 :     if (!BufferIsValid(*bufP))
     115       293837 :         return (BTStack) NULL;
     116              : 
     117              :     /* Loop iterates once per level descended in the tree */
     118              :     for (;;)
     119     10518665 :     {
     120              :         Page        page;
     121              :         BTPageOpaque opaque;
     122              :         OffsetNumber offnum;
     123              :         ItemId      itemid;
     124              :         IndexTuple  itup;
     125              :         BlockNumber child;
     126              :         BTStack     new_stack;
     127              : 
     128              :         /*
     129              :          * Race -- the page we just grabbed may have split since we read its
     130              :          * downlink in its parent page (or the metapage).  If it has, we may
     131              :          * need to move right to its new sibling.  Do that.
     132              :          *
     133              :          * In write-mode, allow _bt_moveright to finish any incomplete splits
     134              :          * along the way.  Strictly speaking, we'd only need to finish an
     135              :          * incomplete split on the leaf page we're about to insert to, not on
     136              :          * any of the upper levels (internal pages with incomplete splits are
     137              :          * also taken care of in _bt_getstackbuf).  But this is a good
     138              :          * opportunity to finish splits of internal pages too.
     139              :          */
     140     23254228 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
     141              :                               stack_in, page_access);
     142              : 
     143              :         /* if this is a leaf page, we're done */
     144     23254228 :         page = BufferGetPage(*bufP);
     145     23254228 :         opaque = BTPageGetOpaque(page);
     146     23254228 :         if (P_ISLEAF(opaque))
     147     12735563 :             break;
     148              : 
     149              :         /*
     150              :          * Find the appropriate pivot tuple on this page.  Its downlink points
     151              :          * to the child page that we're about to descend to.
     152              :          */
     153     10518665 :         offnum = _bt_binsrch(rel, key, *bufP);
     154     10518665 :         itemid = PageGetItemId(page, offnum);
     155     10518665 :         itup = (IndexTuple) PageGetItem(page, itemid);
     156              :         Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
     157     10518665 :         child = BTreeTupleGetDownLink(itup);
     158              : 
     159              :         /*
     160              :          * We need to save the location of the pivot tuple we chose in a new
     161              :          * stack entry for this page/level.  If caller ends up splitting a
     162              :          * page one level down, it usually ends up inserting a new pivot
     163              :          * tuple/downlink immediately after the location recorded here.
     164              :          */
     165     10518665 :         if (returnstack)
     166              :         {
     167      3666816 :             new_stack = (BTStack) palloc_object(BTStackData);
     168      3666816 :             new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
     169      3666816 :             new_stack->bts_offset = offnum;
     170      3666816 :             new_stack->bts_parent = stack_in;
     171      3666816 :             stack_in = new_stack;
     172              :         }
     173              : 
     174              :         /*
     175              :          * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
     176              :          * we're on the level 1 and asked to lock leaf page in write mode,
     177              :          * then lock next page in write mode, because it must be a leaf.
     178              :          */
     179     10518665 :         if (opaque->btpo_level == 1 && access == BT_WRITE)
     180      3598590 :             page_access = BT_WRITE;
     181              : 
     182              :         /* drop the read lock on the page, then acquire one on its child */
     183     10518665 :         *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
     184              : 
     185              :         /* okay, all set to move down a level */
     186              :     }
     187              : 
     188              :     /*
     189              :      * If we're asked to lock leaf in write mode, but didn't manage to, then
     190              :      * relock.  This should only happen when the root page is a leaf page (and
     191              :      * the only page in the index other than the metapage).
     192              :      */
     193     12735563 :     if (access == BT_WRITE && page_access == BT_READ)
     194              :     {
     195              :         /* trade in our read lock for a write lock */
     196       466714 :         _bt_unlockbuf(rel, *bufP);
     197       466714 :         _bt_lockbuf(rel, *bufP, BT_WRITE);
     198              : 
     199              :         /*
     200              :          * Race -- the leaf page may have split after we dropped the read lock
     201              :          * but before we acquired a write lock.  If it has, we may need to
     202              :          * move right to its new sibling.  Do that.
     203              :          */
     204       466714 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
     205              :     }
     206              : 
     207     12735563 :     return stack_in;
     208              : }
     209              : 
     210              : /*
     211              :  *  _bt_moveright() -- move right in the btree if necessary.
     212              :  *
     213              :  * When we follow a pointer to reach a page, it is possible that
     214              :  * the page has changed in the meanwhile.  If this happens, we're
     215              :  * guaranteed that the page has "split right" -- that is, that any
     216              :  * data that appeared on the page originally is either on the page
     217              :  * or strictly to the right of it.
     218              :  *
     219              :  * This routine decides whether or not we need to move right in the
     220              :  * tree by examining the high key entry on the page.  If that entry is
     221              :  * strictly less than the scankey, or <= the scankey in the
     222              :  * key.nextkey=true case, then we followed the wrong link and we need
     223              :  * to move right.
     224              :  *
     225              :  * The passed insertion-type scankey can omit the rightmost column(s) of the
     226              :  * index. (see nbtree/README)
     227              :  *
     228              :  * When key.nextkey is false (the usual case), we are looking for the first
     229              :  * item >= key.  When key.nextkey is true, we are looking for the first item
     230              :  * strictly greater than key.
     231              :  *
     232              :  * If forupdate is true, we will attempt to finish any incomplete splits
     233              :  * that we encounter.  This is required when locking a target page for an
     234              :  * insertion, because we don't allow inserting on a page before the split is
     235              :  * completed.  'heaprel' and 'stack' are only used if forupdate is true.
     236              :  *
     237              :  * On entry, we have the buffer pinned and a lock of the type specified by
     238              :  * 'access'.  If we move right, we release the buffer and lock and acquire
     239              :  * the same on the right sibling.  Return value is the buffer we stop at.
     240              :  */
     241              : static Buffer
     242     23720942 : _bt_moveright(Relation rel,
     243              :               Relation heaprel,
     244              :               BTScanInsert key,
     245              :               Buffer buf,
     246              :               bool forupdate,
     247              :               BTStack stack,
     248              :               int access)
     249              : {
     250              :     Page        page;
     251              :     BTPageOpaque opaque;
     252              :     int32       cmpval;
     253              : 
     254              :     Assert(!forupdate || heaprel != NULL);
     255              : 
     256              :     /*
     257              :      * When nextkey = false (normal case): if the scan key that brought us to
     258              :      * this page is > the high key stored on the page, then the page has split
     259              :      * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     260              :      * have some duplicates to the right as well as the left, but that's
     261              :      * something that's only ever dealt with on the leaf level, after
     262              :      * _bt_search has found an initial leaf page.)
     263              :      *
     264              :      * When nextkey = true: move right if the scan key is >= page's high key.
     265              :      * (Note that key.scantid cannot be set in this case.)
     266              :      *
     267              :      * The page could even have split more than once, so scan as far as
     268              :      * needed.
     269              :      *
     270              :      * We also have to move right if we followed a link that brought us to a
     271              :      * dead page.
     272              :      */
     273     23720942 :     cmpval = key->nextkey ? 0 : 1;
     274              : 
     275              :     for (;;)
     276              :     {
     277     23721674 :         page = BufferGetPage(buf);
     278     23721674 :         opaque = BTPageGetOpaque(page);
     279              : 
     280     23721674 :         if (P_RIGHTMOST(opaque))
     281     18016129 :             break;
     282              : 
     283              :         /*
     284              :          * Finish any incomplete splits we encounter along the way.
     285              :          */
     286      5705545 :         if (forupdate && P_INCOMPLETE_SPLIT(opaque))
     287            0 :         {
     288            0 :             BlockNumber blkno = BufferGetBlockNumber(buf);
     289              : 
     290              :             /* upgrade our lock if necessary */
     291            0 :             if (access == BT_READ)
     292              :             {
     293            0 :                 _bt_unlockbuf(rel, buf);
     294            0 :                 _bt_lockbuf(rel, buf, BT_WRITE);
     295              :             }
     296              : 
     297            0 :             if (P_INCOMPLETE_SPLIT(opaque))
     298            0 :                 _bt_finish_split(rel, heaprel, buf, stack);
     299              :             else
     300            0 :                 _bt_relbuf(rel, buf);
     301              : 
     302              :             /* re-acquire the lock in the right mode, and re-check */
     303            0 :             buf = _bt_getbuf(rel, blkno, access);
     304            0 :             continue;
     305              :         }
     306              : 
     307      5705545 :         if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
     308              :         {
     309              :             /* step right one page */
     310          732 :             buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
     311          732 :             continue;
     312              :         }
     313              :         else
     314              :             break;
     315              :     }
     316              : 
     317     23720942 :     if (P_IGNORE(opaque))
     318            0 :         elog(ERROR, "fell off the end of index \"%s\"",
     319              :              RelationGetRelationName(rel));
     320              : 
     321     23720942 :     return buf;
     322              : }
     323              : 
     324              : /*
     325              :  *  _bt_binsrch() -- Do a binary search for a key on a particular page.
     326              :  *
     327              :  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
     328              :  * of the last key < given scankey, or last key <= given scankey if nextkey
     329              :  * is true.  (Since _bt_compare treats the first data key of such a page as
     330              :  * minus infinity, there will be at least one key < scankey, so the result
     331              :  * always points at one of the keys on the page.)
     332              :  *
     333              :  * On a leaf page, _bt_binsrch() returns the final result of the initial
     334              :  * positioning process that started with _bt_first's call to _bt_search.
     335              :  * We're returning a non-pivot tuple offset, so things are a little different.
     336              :  * It is possible that we'll return an offset that's either past the last
     337              :  * non-pivot slot, or (in the case of a backward scan) before the first slot.
     338              :  *
     339              :  * This procedure is not responsible for walking right, it just examines
     340              :  * the given page.  _bt_binsrch() has no lock or refcount side effects
     341              :  * on the buffer.
     342              :  */
     343              : static OffsetNumber
     344     18985004 : _bt_binsrch(Relation rel,
     345              :             BTScanInsert key,
     346              :             Buffer buf)
     347              : {
     348              :     Page        page;
     349              :     BTPageOpaque opaque;
     350              :     OffsetNumber low,
     351              :                 high;
     352              :     int32       result,
     353              :                 cmpval;
     354              : 
     355     18985004 :     page = BufferGetPage(buf);
     356     18985004 :     opaque = BTPageGetOpaque(page);
     357              : 
     358              :     /* Requesting nextkey semantics while using scantid seems nonsensical */
     359              :     Assert(!key->nextkey || key->scantid == NULL);
     360              :     /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
     361              :     Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
     362              : 
     363     18985004 :     low = P_FIRSTDATAKEY(opaque);
     364     18985004 :     high = PageGetMaxOffsetNumber(page);
     365              : 
     366              :     /*
     367              :      * If there are no keys on the page, return the first available slot. Note
     368              :      * this covers two cases: the page is really empty (no keys), or it
     369              :      * contains only a high key.  The latter case is possible after vacuuming.
     370              :      * This can never happen on an internal page, however, since they are
     371              :      * never empty (an internal page must have at least one child).
     372              :      */
     373     18985004 :     if (unlikely(high < low))
     374         6694 :         return low;
     375              : 
     376              :     /*
     377              :      * Binary search to find the first key on the page >= scan key, or first
     378              :      * key > scankey when nextkey is true.
     379              :      *
     380              :      * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     381              :      * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     382              :      *
     383              :      * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     384              :      * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     385              :      *
     386              :      * We can fall out when high == low.
     387              :      */
     388     18978310 :     high++;                     /* establish the loop invariant for high */
     389              : 
     390     18978310 :     cmpval = key->nextkey ? 0 : 1;   /* select comparison value */
     391              : 
     392    124185333 :     while (high > low)
     393              :     {
     394    105207023 :         OffsetNumber mid = low + ((high - low) / 2);
     395              : 
     396              :         /* We have low <= mid < high, so mid points at a real slot */
     397              : 
     398    105207023 :         result = _bt_compare(rel, key, page, mid);
     399              : 
     400    105207023 :         if (result >= cmpval)
     401     65218222 :             low = mid + 1;
     402              :         else
     403     39988801 :             high = mid;
     404              :     }
     405              : 
     406              :     /*
     407              :      * At this point we have high == low.
     408              :      *
     409              :      * On a leaf page we always return the first non-pivot tuple >= scan key
     410              :      * (resp. > scan key) for forward scan callers.  For backward scans, it's
     411              :      * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
     412              :      */
     413     18978310 :     if (P_ISLEAF(opaque))
     414              :     {
     415              :         /*
     416              :          * In the backward scan case we're supposed to locate the last
     417              :          * matching tuple on the leaf level -- not the first matching tuple
     418              :          * (the last tuple will be the first one returned by the scan).
     419              :          *
     420              :          * At this point we've located the first non-pivot tuple immediately
     421              :          * after the last matching tuple (which might just be maxoff + 1).
     422              :          * Compensate by stepping back.
     423              :          */
     424      8459645 :         if (key->backward)
     425        30878 :             return OffsetNumberPrev(low);
     426              : 
     427      8428767 :         return low;
     428              :     }
     429              : 
     430              :     /*
     431              :      * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     432              :      * There must be one if _bt_compare() is playing by the rules.
     433              :      *
     434              :      * _bt_compare() will seldom see any exactly-matching pivot tuples, since
     435              :      * a truncated -inf heap TID is usually enough to prevent it altogether.
     436              :      * Even omitted scan key entries are treated as > truncated attributes.
     437              :      *
     438              :      * However, during backward scans _bt_compare() interprets omitted scan
     439              :      * key attributes as == corresponding truncated -inf attributes instead.
     440              :      * This works just like < would work here.  Under this scheme, < strategy
     441              :      * backward scans will always directly descend to the correct leaf page.
     442              :      * In particular, they will never incur an "extra" leaf page access with a
     443              :      * scan key that happens to contain the same prefix of values as some
     444              :      * pivot tuple's untruncated prefix.  VACUUM relies on this guarantee when
     445              :      * it uses a leaf page high key to "re-find" a page undergoing deletion.
     446              :      */
     447              :     Assert(low > P_FIRSTDATAKEY(opaque));
     448              : 
     449     10518665 :     return OffsetNumberPrev(low);
     450              : }
     451              : 
     452              : /*
     453              :  *
     454              :  *  _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
     455              :  *
     456              :  * Like _bt_binsrch(), but with support for caching the binary search
     457              :  * bounds.  Only used during insertion, and only on the leaf page that it
     458              :  * looks like caller will insert tuple on.  Exclusive-locked and pinned
     459              :  * leaf page is contained within insertstate.
     460              :  *
     461              :  * Caches the bounds fields in insertstate so that a subsequent call can
     462              :  * reuse the low and strict high bounds of original binary search.  Callers
     463              :  * that use these fields directly must be prepared for the case where low
     464              :  * and/or stricthigh are not on the same page (one or both exceed maxoff
     465              :  * for the page).  The case where there are no items on the page (high <
     466              :  * low) makes bounds invalid.
     467              :  *
     468              :  * Caller is responsible for invalidating bounds when it modifies the page
     469              :  * before calling here a second time, and for dealing with posting list
     470              :  * tuple matches (callers can use insertstate's postingoff field to
     471              :  * determine which existing heap TID will need to be replaced by a posting
     472              :  * list split).
     473              :  */
     474              : OffsetNumber
     475      7311873 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
     476              : {
     477      7311873 :     BTScanInsert key = insertstate->itup_key;
     478              :     Page        page;
     479              :     BTPageOpaque opaque;
     480              :     OffsetNumber low,
     481              :                 high,
     482              :                 stricthigh;
     483              :     int32       result,
     484              :                 cmpval;
     485              : 
     486      7311873 :     page = BufferGetPage(insertstate->buf);
     487      7311873 :     opaque = BTPageGetOpaque(page);
     488              : 
     489              :     Assert(P_ISLEAF(opaque));
     490              :     Assert(!key->nextkey);
     491              :     Assert(insertstate->postingoff == 0);
     492              : 
     493      7311873 :     if (!insertstate->bounds_valid)
     494              :     {
     495              :         /* Start new binary search */
     496      4308784 :         low = P_FIRSTDATAKEY(opaque);
     497      4308784 :         high = PageGetMaxOffsetNumber(page);
     498              :     }
     499              :     else
     500              :     {
     501              :         /* Restore result of previous binary search against same page */
     502      3003089 :         low = insertstate->low;
     503      3003089 :         high = insertstate->stricthigh;
     504              :     }
     505              : 
     506              :     /* If there are no keys on the page, return the first available slot */
     507      7311873 :     if (unlikely(high < low))
     508              :     {
     509              :         /* Caller can't reuse bounds */
     510        11991 :         insertstate->low = InvalidOffsetNumber;
     511        11991 :         insertstate->stricthigh = InvalidOffsetNumber;
     512        11991 :         insertstate->bounds_valid = false;
     513        11991 :         return low;
     514              :     }
     515              : 
     516              :     /*
     517              :      * Binary search to find the first key on the page >= scan key. (nextkey
     518              :      * is always false when inserting).
     519              :      *
     520              :      * The loop invariant is: all slots before 'low' are < scan key, all slots
     521              :      * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     522              :      * maintained to save additional search effort for caller.
     523              :      *
     524              :      * We can fall out when high == low.
     525              :      */
     526      7299882 :     if (!insertstate->bounds_valid)
     527      4296793 :         high++;                 /* establish the loop invariant for high */
     528      7299882 :     stricthigh = high;          /* high initially strictly higher */
     529              : 
     530      7299882 :     cmpval = 1;                 /* !nextkey comparison value */
     531              : 
     532     38743257 :     while (high > low)
     533              :     {
     534     31443375 :         OffsetNumber mid = low + ((high - low) / 2);
     535              : 
     536              :         /* We have low <= mid < high, so mid points at a real slot */
     537              : 
     538     31443375 :         result = _bt_compare(rel, key, page, mid);
     539              : 
     540     31443375 :         if (result >= cmpval)
     541     24485895 :             low = mid + 1;
     542              :         else
     543              :         {
     544      6957480 :             high = mid;
     545      6957480 :             if (result != 0)
     546      6387793 :                 stricthigh = high;
     547              :         }
     548              : 
     549              :         /*
     550              :          * If tuple at offset located by binary search is a posting list whose
     551              :          * TID range overlaps with caller's scantid, perform posting list
     552              :          * binary search to set postingoff for caller.  Caller must split the
     553              :          * posting list when postingoff is set.  This should happen
     554              :          * infrequently.
     555              :          */
     556     31443375 :         if (unlikely(result == 0 && key->scantid != NULL))
     557              :         {
     558              :             /*
     559              :              * postingoff should never be set more than once per leaf page
     560              :              * binary search.  That would mean that there are duplicate table
     561              :              * TIDs in the index, which is never okay.  Check for that here.
     562              :              */
     563       215658 :             if (insertstate->postingoff != 0)
     564            0 :                 ereport(ERROR,
     565              :                         (errcode(ERRCODE_INDEX_CORRUPTED),
     566              :                          errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
     567              :                                          ItemPointerGetBlockNumber(key->scantid),
     568              :                                          ItemPointerGetOffsetNumber(key->scantid),
     569              :                                          low, stricthigh,
     570              :                                          BufferGetBlockNumber(insertstate->buf),
     571              :                                          RelationGetRelationName(rel))));
     572              : 
     573       215658 :             insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
     574              :         }
     575              :     }
     576              : 
     577              :     /*
     578              :      * On a leaf page, a binary search always returns the first key >= scan
     579              :      * key (at least in !nextkey case), which could be the last slot + 1. This
     580              :      * is also the lower bound of cached search.
     581              :      *
     582              :      * stricthigh may also be the last slot + 1, which prevents caller from
     583              :      * using bounds directly, but is still useful to us if we're called a
     584              :      * second time with cached bounds (cached low will be < stricthigh when
     585              :      * that happens).
     586              :      */
     587      7299882 :     insertstate->low = low;
     588      7299882 :     insertstate->stricthigh = stricthigh;
     589      7299882 :     insertstate->bounds_valid = true;
     590              : 
     591      7299882 :     return low;
     592              : }
     593              : 
     594              : /*----------
     595              :  *  _bt_binsrch_posting() -- posting list binary search.
     596              :  *
     597              :  * Helper routine for _bt_binsrch_insert().
     598              :  *
     599              :  * Returns offset into posting list where caller's scantid belongs.
     600              :  *----------
     601              :  */
     602              : static int
     603       215658 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
     604              : {
     605              :     IndexTuple  itup;
     606              :     ItemId      itemid;
     607              :     int         low,
     608              :                 high,
     609              :                 mid,
     610              :                 res;
     611              : 
     612              :     /*
     613              :      * If this isn't a posting tuple, then the index must be corrupt (if it is
     614              :      * an ordinary non-pivot tuple then there must be an existing tuple with a
     615              :      * heap TID that equals inserter's new heap TID/scantid).  Defensively
     616              :      * check that tuple is a posting list tuple whose posting list range
     617              :      * includes caller's scantid.
     618              :      *
     619              :      * (This is also needed because contrib/amcheck's rootdescend option needs
     620              :      * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     621              :      */
     622       215658 :     itemid = PageGetItemId(page, offnum);
     623       215658 :     itup = (IndexTuple) PageGetItem(page, itemid);
     624       215658 :     if (!BTreeTupleIsPosting(itup))
     625       201098 :         return 0;
     626              : 
     627              :     Assert(key->heapkeyspace && key->allequalimage);
     628              : 
     629              :     /*
     630              :      * In the event that posting list tuple has LP_DEAD bit set, indicate this
     631              :      * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     632              :      * second call to _bt_binsrch_insert() can take place when its caller has
     633              :      * removed the dead item.
     634              :      */
     635        14560 :     if (ItemIdIsDead(itemid))
     636            0 :         return -1;
     637              : 
     638              :     /* "high" is past end of posting list for loop invariant */
     639        14560 :     low = 0;
     640        14560 :     high = BTreeTupleGetNPosting(itup);
     641              :     Assert(high >= 2);
     642              : 
     643       118932 :     while (high > low)
     644              :     {
     645       104372 :         mid = low + ((high - low) / 2);
     646       104372 :         res = ItemPointerCompare(key->scantid,
     647       104372 :                                  BTreeTupleGetPostingN(itup, mid));
     648              : 
     649       104372 :         if (res > 0)
     650        54246 :             low = mid + 1;
     651        50126 :         else if (res < 0)
     652        50126 :             high = mid;
     653              :         else
     654            0 :             return mid;
     655              :     }
     656              : 
     657              :     /* Exact match not found */
     658        14560 :     return low;
     659              : }
     660              : 
     661              : /*----------
     662              :  *  _bt_compare() -- Compare insertion-type scankey to tuple on a page.
     663              :  *
     664              :  *  page/offnum: location of btree item to be compared to.
     665              :  *
     666              :  *      This routine returns:
     667              :  *          <0 if scankey < tuple at offnum;
     668              :  *           0 if scankey == tuple at offnum;
     669              :  *          >0 if scankey > tuple at offnum.
     670              :  *
     671              :  * NULLs in the keys are treated as sortable values.  Therefore
     672              :  * "equality" does not necessarily mean that the item should be returned
     673              :  * to the caller as a matching key.  Similarly, an insertion scankey
     674              :  * with its scantid set is treated as equal to a posting tuple whose TID
     675              :  * range overlaps with their scantid.  There generally won't be a
     676              :  * matching TID in the posting tuple, which caller must handle
     677              :  * themselves (e.g., by splitting the posting list tuple).
     678              :  *
     679              :  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
     680              :  * "minus infinity": this routine will always claim it is less than the
     681              :  * scankey.  The actual key value stored is explicitly truncated to 0
     682              :  * attributes (explicitly minus infinity) with version 3+ indexes, but
     683              :  * that isn't relied upon.  This allows us to implement the Lehman and
     684              :  * Yao convention that the first down-link pointer is before the first
     685              :  * key.  See backend/access/nbtree/README for details.
     686              :  *----------
     687              :  */
     688              : int32
     689    150976008 : _bt_compare(Relation rel,
     690              :             BTScanInsert key,
     691              :             Page page,
     692              :             OffsetNumber offnum)
     693              : {
     694    150976008 :     TupleDesc   itupdesc = RelationGetDescr(rel);
     695    150976008 :     BTPageOpaque opaque = BTPageGetOpaque(page);
     696              :     IndexTuple  itup;
     697              :     ItemPointer heapTid;
     698              :     ScanKey     scankey;
     699              :     int         ncmpkey;
     700              :     int         ntupatts;
     701              :     int32       result;
     702              : 
     703              :     Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
     704              :     Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
     705              :     Assert(key->heapkeyspace || key->scantid == NULL);
     706              : 
     707              :     /*
     708              :      * Force result ">" if target item is first data item on an internal page
     709              :      * --- see NOTE above.
     710              :      */
     711    150976008 :     if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
     712      2169383 :         return 1;
     713              : 
     714    148806625 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     715    148806625 :     ntupatts = BTreeTupleGetNAtts(itup, rel);
     716              : 
     717              :     /*
     718              :      * The scan key is set up with the attribute number associated with each
     719              :      * term in the key.  It is important that, if the index is multi-key, the
     720              :      * scan contain the first k key attributes, and that they be in order.  If
     721              :      * you think about how multi-key ordering works, you'll understand why
     722              :      * this is.
     723              :      *
     724              :      * We don't test for violation of this condition here, however.  The
     725              :      * initial setup for the index scan had better have gotten it right (see
     726              :      * _bt_first).
     727              :      */
     728              : 
     729    148806625 :     ncmpkey = Min(ntupatts, key->keysz);
     730              :     Assert(key->heapkeyspace || ncmpkey == key->keysz);
     731              :     Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
     732    148806625 :     scankey = key->scankeys;
     733    185715878 :     for (int i = 1; i <= ncmpkey; i++)
     734              :     {
     735              :         Datum       datum;
     736              :         bool        isNull;
     737              : 
     738    173077753 :         datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
     739              : 
     740    173077753 :         if (scankey->sk_flags & SK_ISNULL)   /* key is NULL */
     741              :         {
     742       295473 :             if (isNull)
     743        78692 :                 result = 0;     /* NULL "=" NULL */
     744       216781 :             else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     745          312 :                 result = -1;    /* NULL "<" NOT_NULL */
     746              :             else
     747       216469 :                 result = 1;     /* NULL ">" NOT_NULL */
     748              :         }
     749    172782280 :         else if (isNull)        /* key is NOT_NULL and item is NULL */
     750              :         {
     751          132 :             if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     752            0 :                 result = 1;     /* NOT_NULL ">" NULL */
     753              :             else
     754          132 :                 result = -1;    /* NOT_NULL "<" NULL */
     755              :         }
     756              :         else
     757              :         {
     758              :             /*
     759              :              * The sk_func needs to be passed the index value as left arg and
     760              :              * the sk_argument as right arg (they might be of different
     761              :              * types).  Since it is convenient for callers to think of
     762              :              * _bt_compare as comparing the scankey to the index item, we have
     763              :              * to flip the sign of the comparison result.  (Unless it's a DESC
     764              :              * column, in which case we *don't* flip the sign.)
     765              :              */
     766    172782148 :             result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
     767              :                                                      scankey->sk_collation,
     768              :                                                      datum,
     769              :                                                      scankey->sk_argument));
     770              : 
     771    172782148 :             if (!(scankey->sk_flags & SK_BT_DESC))
     772    172782115 :                 INVERT_COMPARE_RESULT(result);
     773              :         }
     774              : 
     775              :         /* if the keys are unequal, return the difference */
     776    173077753 :         if (result != 0)
     777    136168500 :             return result;
     778              : 
     779     36909253 :         scankey++;
     780              :     }
     781              : 
     782              :     /*
     783              :      * All non-truncated attributes (other than heap TID) were found to be
     784              :      * equal.  Treat truncated attributes as minus infinity when scankey has a
     785              :      * key attribute value that would otherwise be compared directly.
     786              :      *
     787              :      * Note: it doesn't matter if ntupatts includes non-key attributes;
     788              :      * scankey won't, so explicitly excluding non-key attributes isn't
     789              :      * necessary.
     790              :      */
     791     12638125 :     if (key->keysz > ntupatts)
     792       101788 :         return 1;
     793              : 
     794              :     /*
     795              :      * Use the heap TID attribute and scantid to try to break the tie.  The
     796              :      * rules are the same as any other key attribute -- only the
     797              :      * representation differs.
     798              :      */
     799     12536337 :     heapTid = BTreeTupleGetHeapTID(itup);
     800     12536337 :     if (key->scantid == NULL)
     801              :     {
     802              :         /*
     803              :          * Forward scans have a scankey that is considered greater than a
     804              :          * truncated pivot tuple if and when the scankey has equal values for
     805              :          * attributes up to and including the least significant untruncated
     806              :          * attribute in tuple.  Even attributes that were omitted from the
     807              :          * scan key are considered greater than -inf truncated attributes.
     808              :          * (See _bt_binsrch for an explanation of our backward scan behavior.)
     809              :          *
     810              :          * For example, if an index has the minimum two attributes (single
     811              :          * user key attribute, plus heap TID attribute), and a page's high key
     812              :          * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
     813              :          * will not descend to the page to the left.  The search will descend
     814              :          * right instead.  The truncated attribute in pivot tuple means that
     815              :          * all non-pivot tuples on the page to the left are strictly < 'foo',
     816              :          * so it isn't necessary to descend left.  In other words, search
     817              :          * doesn't have to descend left because it isn't interested in a match
     818              :          * that has a heap TID value of -inf.
     819              :          *
     820              :          * Note: the heap TID part of the test ensures that scankey is being
     821              :          * compared to a pivot tuple with one or more truncated -inf key
     822              :          * attributes.  The heap TID attribute is the last key attribute in
     823              :          * every index, of course, but other than that it isn't special.
     824              :          */
     825     10219930 :         if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
     826         5232 :             key->heapkeyspace)
     827         5232 :             return 1;
     828              : 
     829              :         /* All provided scankey arguments found to be equal */
     830     10214698 :         return 0;
     831              :     }
     832              : 
     833              :     /*
     834              :      * Treat truncated heap TID as minus infinity, since scankey has a key
     835              :      * attribute value (scantid) that would otherwise be compared directly
     836              :      */
     837              :     Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
     838      2316407 :     if (heapTid == NULL)
     839         1981 :         return 1;
     840              : 
     841              :     /*
     842              :      * Scankey must be treated as equal to a posting list tuple if its scantid
     843              :      * value falls within the range of the posting list.  In all other cases
     844              :      * there can only be a single heap TID value, which is compared directly
     845              :      * with scantid.
     846              :      */
     847              :     Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
     848      2314426 :     result = ItemPointerCompare(key->scantid, heapTid);
     849      2314426 :     if (result <= 0 || !BTreeTupleIsPosting(itup))
     850      2223753 :         return result;
     851              :     else
     852              :     {
     853        90673 :         result = ItemPointerCompare(key->scantid,
     854        90673 :                                     BTreeTupleGetMaxHeapTID(itup));
     855        90673 :         if (result > 0)
     856        76113 :             return 1;
     857              :     }
     858              : 
     859        14560 :     return 0;
     860              : }
     861              : 
     862              : /*
     863              :  *  _bt_first() -- Find the first item in a scan.
     864              :  *
     865              :  *      We need to be clever about the direction of scan, the search
     866              :  *      conditions, and the tree ordering.  We find the first item (or,
     867              :  *      if backwards scan, the last item) in the tree that satisfies the
     868              :  *      qualifications in the scan key.  On success exit, data about the
     869              :  *      matching tuple(s) on the page has been loaded into so->currPos.  We'll
     870              :  *      drop all locks and hold onto a pin on page's buffer, except during
     871              :  *      so->dropPin scans, when we drop both the lock and the pin.
     872              :  *      _bt_returnitem sets the next item to return to scan on success exit.
     873              :  *
     874              :  * If there are no matching items in the index, we return false, with no
     875              :  * pins or locks held.  so->currPos will remain invalid.
     876              :  *
     877              :  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
     878              :  * are both search-type scankeys (see nbtree/README for more about this).
     879              :  * Within this routine, we build a temporary insertion-type scankey to use
     880              :  * in locating the scan start position.
     881              :  */
     882              : bool
     883      8805666 : _bt_first(IndexScanDesc scan, ScanDirection dir)
     884              : {
     885      8805666 :     Relation    rel = scan->indexRelation;
     886      8805666 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     887              :     OffsetNumber offnum;
     888              :     BTScanInsertData inskey;
     889              :     ScanKey     startKeys[INDEX_MAX_KEYS];
     890              :     ScanKeyData notnullkey;
     891      8805666 :     int         keysz = 0;
     892      8805666 :     StrategyNumber strat_total = InvalidStrategy;
     893      8805666 :     BlockNumber blkno = InvalidBlockNumber,
     894              :                 lastcurrblkno;
     895              : 
     896              :     Assert(!BTScanPosIsValid(so->currPos));
     897              : 
     898              :     /*
     899              :      * Examine the scan keys and eliminate any redundant keys; also mark the
     900              :      * keys that must be matched to continue the scan.
     901              :      */
     902      8805666 :     _bt_preprocess_keys(scan);
     903              : 
     904              :     /*
     905              :      * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     906              :      * never be satisfied (eg, x == 1 AND x > 2).
     907              :      */
     908      8805666 :     if (!so->qual_ok)
     909              :     {
     910              :         Assert(!so->needPrimScan);
     911         1132 :         _bt_parallel_done(scan);
     912         1132 :         return false;
     913              :     }
     914              : 
     915              :     /*
     916              :      * If this is a parallel scan, we must seize the scan.  _bt_readfirstpage
     917              :      * will likely release the parallel scan later on.
     918              :      */
     919      8804534 :     if (scan->parallel_scan != NULL &&
     920          223 :         !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
     921          145 :         return false;
     922              : 
     923              :     /*
     924              :      * Initialize the scan's arrays (if any) for the current scan direction
     925              :      * (except when they were already set to later values as part of
     926              :      * scheduling the primitive index scan that is now underway)
     927              :      */
     928      8804389 :     if (so->numArrayKeys && !so->needPrimScan)
     929        35708 :         _bt_start_array_keys(scan, dir);
     930              : 
     931      8804389 :     if (blkno != InvalidBlockNumber)
     932              :     {
     933              :         /*
     934              :          * We anticipated calling _bt_search, but another worker bet us to it.
     935              :          * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
     936              :          */
     937              :         Assert(scan->parallel_scan != NULL);
     938              :         Assert(!so->needPrimScan);
     939              :         Assert(blkno != P_NONE);
     940              : 
     941           16 :         if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
     942            0 :             return false;
     943              : 
     944           16 :         _bt_returnitem(scan, so);
     945           16 :         return true;
     946              :     }
     947              : 
     948              :     /*
     949              :      * Count an indexscan for stats, now that we know that we'll call
     950              :      * _bt_search/_bt_endpoint below
     951              :      */
     952      8804373 :     pgstat_count_index_scan(rel);
     953      8804373 :     if (scan->instrument)
     954       548685 :         scan->instrument->nsearches++;
     955              : 
     956              :     /*----------
     957              :      * Examine the scan keys to discover where we need to start the scan.
     958              :      * The selected scan keys (at most one per index column) are remembered by
     959              :      * storing their addresses into the local startKeys[] array.  The final
     960              :      * startKeys[] entry's strategy is set in strat_total. (Actually, there
     961              :      * are a couple of cases where we force a less/more restrictive strategy.)
     962              :      *
     963              :      * We must use the key that was marked required (in the direction opposite
     964              :      * our own scan's) during preprocessing.  Each index attribute can only
     965              :      * have one such required key.  In general, the keys that we use to find
     966              :      * an initial position when scanning forwards are the same keys that end
     967              :      * the scan on the leaf level when scanning backwards (and vice-versa).
     968              :      *
     969              :      * When the scan keys include cross-type operators, _bt_preprocess_keys
     970              :      * may not be able to eliminate redundant keys; in such cases it will
     971              :      * arbitrarily pick a usable key for each attribute (and scan direction),
     972              :      * ensuring that there is no more than one key required in each direction.
     973              :      * We stop considering further keys once we reach the first nonrequired
     974              :      * key (which must come after all required keys), so this can't affect us.
     975              :      *
     976              :      * The required keys that we use as starting boundaries have to be =, >,
     977              :      * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
     978              :      * We can use keys for multiple attributes so long as the prior attributes
     979              :      * had only =, >= (resp. =, <=) keys.  These rules are very similar to the
     980              :      * rules that preprocessing used to determine which keys to mark required.
     981              :      * We cannot always use every required key as a positioning key, though.
     982              :      * Skip arrays necessitate independently applying our own rules here.
     983              :      * Skip arrays are always generally considered = array keys, but we'll
     984              :      * nevertheless treat them as inequalities at certain points of the scan.
     985              :      * When that happens, it _might_ have implications for the number of
     986              :      * required keys that we can safely use for initial positioning purposes.
     987              :      *
     988              :      * For example, a forward scan with a skip array on its leading attribute
     989              :      * (with no low_compare/high_compare) will have at least two required scan
     990              :      * keys, but we won't use any of them as boundary keys during the scan's
     991              :      * initial call here.  Our positioning key during the first call here can
     992              :      * be thought of as representing "> -infinity".  Similarly, if such a skip
     993              :      * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
     994              :      * during the scan's initial call here; a lower-order key such as "b = 42"
     995              :      * can't be used until the "a" array advances beyond MINVAL/low_compare.
     996              :      *
     997              :      * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
     998              :      * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
     999              :      * A subsequent call here might have us use "a = 'fop' AND b = 42".  Note
    1000              :      * that we treat = and >= as equivalent when scanning forwards (just as we
    1001              :      * treat = and <= as equivalent when scanning backwards).  We effectively
    1002              :      * do the same thing (though with a distinct "a" element/value) each time.
    1003              :      *
    1004              :      * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
    1005              :      * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
    1006              :      * If the index stores nulls at the end of the index we'll be starting
    1007              :      * from, and we have no boundary key for the column (which means the key
    1008              :      * we deduced NOT NULL from is an inequality key that constrains the other
    1009              :      * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
    1010              :      * use as a boundary key.  If we didn't do this, we might find ourselves
    1011              :      * traversing a lot of null entries at the start of the scan.
    1012              :      *
    1013              :      * In this loop, row-comparison keys are treated the same as keys on their
    1014              :      * first (leftmost) columns.  We'll add all lower-order columns of the row
    1015              :      * comparison that were marked required during preprocessing below.
    1016              :      *
    1017              :      * _bt_advance_array_keys needs to know exactly how we'll reposition the
    1018              :      * scan (should it opt to schedule another primitive index scan).  It is
    1019              :      * critical that primscans only be scheduled when they'll definitely make
    1020              :      * some useful progress.  _bt_advance_array_keys does this by calling
    1021              :      * _bt_checkkeys routines that report whether a tuple is past the end of
    1022              :      * matches for the scan's keys (given the scan's current array elements).
    1023              :      * If the page's final tuple is "after the end of matches" for a scan that
    1024              :      * uses the *opposite* scan direction, then it must follow that it's also
    1025              :      * "before the start of matches" for the actual current scan direction.
    1026              :      * It is therefore essential that all of our initial positioning rules are
    1027              :      * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
    1028              :      * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
    1029              :      * need to be kept in sync.
    1030              :      *----------
    1031              :      */
    1032      8804373 :     if (so->numberOfKeys > 0)
    1033              :     {
    1034              :         AttrNumber  curattr;
    1035              :         ScanKey     bkey;
    1036              :         ScanKey     impliesNN;
    1037              :         ScanKey     cur;
    1038              : 
    1039              :         /*
    1040              :          * bkey will be set to the key that preprocessing left behind as the
    1041              :          * boundary key for this attribute, in this scan direction (if any)
    1042              :          */
    1043      8797574 :         cur = so->keyData;
    1044      8797574 :         curattr = 1;
    1045      8797574 :         bkey = NULL;
    1046              :         /* Also remember any scankey that implies a NOT NULL constraint */
    1047      8797574 :         impliesNN = NULL;
    1048              : 
    1049              :         /*
    1050              :          * Loop iterates from 0 to numberOfKeys inclusive; we use the last
    1051              :          * pass to handle after-last-key processing.  Actual exit from the
    1052              :          * loop is at one of the "break" statements below.
    1053              :          */
    1054     22662182 :         for (int i = 0;; cur++, i++)
    1055              :         {
    1056     22662182 :             if (i >= so->numberOfKeys || cur->sk_attno != curattr)
    1057              :             {
    1058              :                 /* Done looking for the curattr boundary key */
    1059              :                 Assert(bkey == NULL ||
    1060              :                        (bkey->sk_attno == curattr &&
    1061              :                         (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
    1062              :                 Assert(impliesNN == NULL ||
    1063              :                        (impliesNN->sk_attno == curattr &&
    1064              :                         (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
    1065              : 
    1066              :                 /*
    1067              :                  * If this is a scan key for a skip array whose current
    1068              :                  * element is MINVAL, choose low_compare (when scanning
    1069              :                  * backwards it'll be MAXVAL, and we'll choose high_compare).
    1070              :                  *
    1071              :                  * Note: if the array's low_compare key makes 'bkey' NULL,
    1072              :                  * then we behave as if the array's first element is -inf,
    1073              :                  * except when !array->null_elem implies a usable NOT NULL
    1074              :                  * constraint.
    1075              :                  */
    1076     13863698 :                 if (bkey != NULL &&
    1077     13824939 :                     (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
    1078              :                 {
    1079         1897 :                     int         ikey = bkey - so->keyData;
    1080         1897 :                     ScanKey     skipequalitykey = bkey;
    1081         1897 :                     BTArrayKeyInfo *array = NULL;
    1082              : 
    1083         1950 :                     for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
    1084              :                     {
    1085         1950 :                         array = &so->arrayKeys[arridx];
    1086         1950 :                         if (array->scan_key == ikey)
    1087         1897 :                             break;
    1088              :                     }
    1089              : 
    1090         1897 :                     if (ScanDirectionIsForward(dir))
    1091              :                     {
    1092              :                         Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
    1093         1888 :                         bkey = array->low_compare;
    1094              :                     }
    1095              :                     else
    1096              :                     {
    1097              :                         Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
    1098            9 :                         bkey = array->high_compare;
    1099              :                     }
    1100              : 
    1101              :                     Assert(bkey == NULL ||
    1102              :                            bkey->sk_attno == skipequalitykey->sk_attno);
    1103              : 
    1104         1897 :                     if (!array->null_elem)
    1105           80 :                         impliesNN = skipequalitykey;
    1106              :                     else
    1107              :                         Assert(bkey == NULL && impliesNN == NULL);
    1108              :                 }
    1109              : 
    1110              :                 /*
    1111              :                  * If we didn't find a usable boundary key, see if we can
    1112              :                  * deduce a NOT NULL key
    1113              :                  */
    1114     13902487 :                 if (bkey == NULL && impliesNN != NULL &&
    1115        38789 :                     ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1116              :                      ScanDirectionIsForward(dir) :
    1117              :                      ScanDirectionIsBackward(dir)))
    1118              :                 {
    1119              :                     /* Final startKeys[] entry will be deduced NOT NULL key */
    1120           15 :                     bkey = &notnullkey;
    1121           15 :                     ScanKeyEntryInitialize(bkey,
    1122              :                                            (SK_SEARCHNOTNULL | SK_ISNULL |
    1123           15 :                                             (impliesNN->sk_flags &
    1124              :                                              (SK_BT_DESC | SK_BT_NULLS_FIRST))),
    1125              :                                            curattr,
    1126              :                                            ScanDirectionIsForward(dir) ?
    1127              :                                            BTGreaterStrategyNumber : BTLessStrategyNumber,
    1128              :                                            InvalidOid,
    1129              :                                            InvalidOid,
    1130              :                                            InvalidOid,
    1131              :                                            (Datum) 0);
    1132              :                 }
    1133              : 
    1134              :                 /*
    1135              :                  * If preprocessing didn't leave a usable boundary key, quit;
    1136              :                  * else save the boundary key pointer in startKeys[]
    1137              :                  */
    1138     13863698 :                 if (bkey == NULL)
    1139        40591 :                     break;
    1140     13823107 :                 startKeys[keysz++] = bkey;
    1141              : 
    1142              :                 /*
    1143              :                  * We can only consider adding more boundary keys when the one
    1144              :                  * that we just chose to add uses either the = or >= strategy
    1145              :                  * (during backwards scans we can only do so when the key that
    1146              :                  * we just added to startKeys[] uses the = or <= strategy)
    1147              :                  */
    1148     13823107 :                 strat_total = bkey->sk_strategy;
    1149     13823107 :                 if (strat_total == BTGreaterStrategyNumber ||
    1150              :                     strat_total == BTLessStrategyNumber)
    1151              :                     break;
    1152              : 
    1153              :                 /*
    1154              :                  * If the key that we just added to startKeys[] is a skip
    1155              :                  * array = key whose current element is marked NEXT or PRIOR,
    1156              :                  * make strat_total > or < (and stop adding boundary keys).
    1157              :                  * This can only happen with opclasses that lack skip support.
    1158              :                  */
    1159     12912317 :                 if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
    1160              :                 {
    1161              :                     Assert(bkey->sk_flags & SK_BT_SKIP);
    1162              :                     Assert(strat_total == BTEqualStrategyNumber);
    1163              : 
    1164            6 :                     if (ScanDirectionIsForward(dir))
    1165              :                     {
    1166              :                         Assert(!(bkey->sk_flags & SK_BT_PRIOR));
    1167            3 :                         strat_total = BTGreaterStrategyNumber;
    1168              :                     }
    1169              :                     else
    1170              :                     {
    1171              :                         Assert(!(bkey->sk_flags & SK_BT_NEXT));
    1172            3 :                         strat_total = BTLessStrategyNumber;
    1173              :                     }
    1174              : 
    1175              :                     /*
    1176              :                      * We're done.  We'll never find an exact = match for a
    1177              :                      * NEXT or PRIOR sentinel sk_argument value.  There's no
    1178              :                      * sense in trying to add more keys to startKeys[].
    1179              :                      */
    1180            6 :                     break;
    1181              :                 }
    1182              : 
    1183              :                 /*
    1184              :                  * Done if that was the last scan key output by preprocessing.
    1185              :                  * Also done if we've now examined all keys marked required.
    1186              :                  */
    1187     12912311 :                 if (i >= so->numberOfKeys ||
    1188      5066127 :                     !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
    1189              :                     break;
    1190              : 
    1191              :                 /*
    1192              :                  * Reset for next attr.
    1193              :                  */
    1194              :                 Assert(cur->sk_attno == curattr + 1);
    1195      5066124 :                 curattr = cur->sk_attno;
    1196      5066124 :                 bkey = NULL;
    1197      5066124 :                 impliesNN = NULL;
    1198              :             }
    1199              : 
    1200              :             /*
    1201              :              * If we've located the starting boundary key for curattr, we have
    1202              :              * no interest in curattr's other required key
    1203              :              */
    1204     13864608 :             if (bkey != NULL)
    1205          898 :                 continue;
    1206              : 
    1207              :             /*
    1208              :              * Is this key the starting boundary key for curattr?
    1209              :              *
    1210              :              * If not, does it imply a NOT NULL constraint?  (Because
    1211              :              * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
    1212              :              * *any* inequality key works for that; we need not test.)
    1213              :              */
    1214     13863710 :             switch (cur->sk_strategy)
    1215              :             {
    1216        69545 :                 case BTLessStrategyNumber:
    1217              :                 case BTLessEqualStrategyNumber:
    1218        69545 :                     if (ScanDirectionIsBackward(dir))
    1219        30795 :                         bkey = cur;
    1220        38750 :                     else if (impliesNN == NULL)
    1221        38750 :                         impliesNN = cur;
    1222        69545 :                     break;
    1223     12909093 :                 case BTEqualStrategyNumber:
    1224     12909093 :                     bkey = cur;
    1225     12909093 :                     break;
    1226       885072 :                 case BTGreaterEqualStrategyNumber:
    1227              :                 case BTGreaterStrategyNumber:
    1228       885072 :                     if (ScanDirectionIsForward(dir))
    1229       885051 :                         bkey = cur;
    1230           21 :                     else if (impliesNN == NULL)
    1231           21 :                         impliesNN = cur;
    1232       885072 :                     break;
    1233              :             }
    1234              :         }
    1235              :     }
    1236              : 
    1237              :     /*
    1238              :      * If we found no usable boundary keys, we have to start from one end of
    1239              :      * the tree.  Walk down that edge to the first or last key, and scan from
    1240              :      * there.
    1241              :      *
    1242              :      * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
    1243              :      */
    1244      8804373 :     if (keysz == 0)
    1245        47023 :         return _bt_endpoint(scan, dir);
    1246              : 
    1247              :     /*
    1248              :      * We want to start the scan somewhere within the index.  Set up an
    1249              :      * insertion scankey we can use to search for the boundary point we
    1250              :      * identified above.  The insertion scankey is built using the keys
    1251              :      * identified by startKeys[].  (Remaining insertion scankey fields are
    1252              :      * initialized after initial-positioning scan keys are finalized.)
    1253              :      */
    1254              :     Assert(keysz <= INDEX_MAX_KEYS);
    1255     22580433 :     for (int i = 0; i < keysz; i++)
    1256              :     {
    1257     13823107 :         ScanKey     bkey = startKeys[i];
    1258              : 
    1259              :         Assert(bkey->sk_attno == i + 1);
    1260              : 
    1261     13823107 :         if (bkey->sk_flags & SK_ROW_HEADER)
    1262              :         {
    1263              :             /*
    1264              :              * Row comparison header: look to the first row member instead
    1265              :              */
    1266           24 :             ScanKey     subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
    1267           24 :             bool        loosen_strat = false,
    1268           24 :                         tighten_strat = false;
    1269              : 
    1270              :             /*
    1271              :              * Cannot be a NULL in the first row member: _bt_preprocess_keys
    1272              :              * would've marked the qual as unsatisfiable, preventing us from
    1273              :              * ever getting this far
    1274              :              */
    1275              :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1276              :             Assert(subkey->sk_attno == bkey->sk_attno);
    1277              :             Assert(!(subkey->sk_flags & SK_ISNULL));
    1278              : 
    1279              :             /*
    1280              :              * This is either a > or >= key (during backwards scans it is
    1281              :              * either < or <=) that was marked required during preprocessing.
    1282              :              * Later so->keyData[] keys can't have been marked required, so
    1283              :              * our row compare header key must be the final startKeys[] entry.
    1284              :              */
    1285              :             Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
    1286              :             Assert(subkey->sk_strategy == bkey->sk_strategy);
    1287              :             Assert(subkey->sk_strategy == strat_total);
    1288              :             Assert(i == keysz - 1);
    1289              : 
    1290              :             /*
    1291              :              * The member scankeys are already in insertion format (ie, they
    1292              :              * have sk_func = 3-way-comparison function)
    1293              :              */
    1294           24 :             memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
    1295              : 
    1296              :             /*
    1297              :              * Now look to later row compare members.
    1298              :              *
    1299              :              * If there's an "index attribute gap" between two row compare
    1300              :              * members, the second member won't have been marked required, and
    1301              :              * so can't be used as a starting boundary key here.  The part of
    1302              :              * the row comparison that we do still use has to be treated as a
    1303              :              * ">=" or "<=" condition.  For example, a qual "(a, c) > (1, 42)"
    1304              :              * with an omitted intervening index attribute "b" will use an
    1305              :              * insertion scan key "a >= 1".  Even the first "a = 1" tuple on
    1306              :              * the leaf level might satisfy the row compare qual.
    1307              :              *
    1308              :              * We're able to use a _more_ restrictive strategy when we reach a
    1309              :              * NULL row compare member, since they're always unsatisfiable.
    1310              :              * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
    1311              :              * insertion scan key "a > 1".  All tuples where "a = 1" cannot
    1312              :              * possibly satisfy the row compare qual, so this is safe.
    1313              :              */
    1314              :             Assert(!(subkey->sk_flags & SK_ROW_END));
    1315              :             for (;;)
    1316              :             {
    1317           24 :                 subkey++;
    1318              :                 Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1319              : 
    1320           24 :                 if (subkey->sk_flags & SK_ISNULL)
    1321              :                 {
    1322              :                     /*
    1323              :                      * NULL member key, can only use earlier keys.
    1324              :                      *
    1325              :                      * We deliberately avoid checking if this key is marked
    1326              :                      * required.  All earlier keys are required, and this key
    1327              :                      * is unsatisfiable either way, so we can't miss anything.
    1328              :                      */
    1329            6 :                     tighten_strat = true;
    1330            6 :                     break;
    1331              :                 }
    1332              : 
    1333           18 :                 if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
    1334              :                 {
    1335              :                     /* nonrequired member key, can only use earlier keys */
    1336            6 :                     loosen_strat = true;
    1337            6 :                     break;
    1338              :                 }
    1339              : 
    1340              :                 Assert(subkey->sk_attno == keysz + 1);
    1341              :                 Assert(subkey->sk_strategy == bkey->sk_strategy);
    1342              :                 Assert(keysz < INDEX_MAX_KEYS);
    1343              : 
    1344           12 :                 memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
    1345           12 :                 keysz++;
    1346              : 
    1347           12 :                 if (subkey->sk_flags & SK_ROW_END)
    1348           12 :                     break;
    1349              :             }
    1350              :             Assert(!(loosen_strat && tighten_strat));
    1351           24 :             if (loosen_strat)
    1352              :             {
    1353              :                 /* Use less restrictive strategy (and fewer member keys) */
    1354            6 :                 switch (strat_total)
    1355              :                 {
    1356            3 :                     case BTLessStrategyNumber:
    1357            3 :                         strat_total = BTLessEqualStrategyNumber;
    1358            3 :                         break;
    1359            3 :                     case BTGreaterStrategyNumber:
    1360            3 :                         strat_total = BTGreaterEqualStrategyNumber;
    1361            3 :                         break;
    1362              :                 }
    1363              :             }
    1364           24 :             if (tighten_strat)
    1365              :             {
    1366              :                 /* Use more restrictive strategy (and fewer member keys) */
    1367            6 :                 switch (strat_total)
    1368              :                 {
    1369            3 :                     case BTLessEqualStrategyNumber:
    1370            3 :                         strat_total = BTLessStrategyNumber;
    1371            3 :                         break;
    1372            3 :                     case BTGreaterEqualStrategyNumber:
    1373            3 :                         strat_total = BTGreaterStrategyNumber;
    1374            3 :                         break;
    1375              :                 }
    1376              :             }
    1377              : 
    1378              :             /* Done (row compare header key is always last startKeys[] key) */
    1379           24 :             break;
    1380              :         }
    1381              : 
    1382              :         /*
    1383              :          * Ordinary comparison key/search-style key.
    1384              :          *
    1385              :          * Transform the search-style scan key to an insertion scan key by
    1386              :          * replacing the sk_func with the appropriate btree 3-way-comparison
    1387              :          * function.
    1388              :          *
    1389              :          * If scankey operator is not a cross-type comparison, we can use the
    1390              :          * cached comparison function; otherwise gotta look it up in the
    1391              :          * catalogs.  (That can't lead to infinite recursion, since no
    1392              :          * indexscan initiated by syscache lookup will use cross-data-type
    1393              :          * operators.)
    1394              :          *
    1395              :          * We support the convention that sk_subtype == InvalidOid means the
    1396              :          * opclass input type; this hack simplifies life for ScanKeyInit().
    1397              :          */
    1398     13823083 :         if (bkey->sk_subtype == rel->rd_opcintype[i] ||
    1399     13250412 :             bkey->sk_subtype == InvalidOid)
    1400     13817556 :         {
    1401              :             FmgrInfo   *procinfo;
    1402              : 
    1403     13817556 :             procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
    1404     13817556 :             ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
    1405              :                                            bkey->sk_flags,
    1406     13817556 :                                            bkey->sk_attno,
    1407              :                                            InvalidStrategy,
    1408              :                                            bkey->sk_subtype,
    1409              :                                            bkey->sk_collation,
    1410              :                                            procinfo,
    1411              :                                            bkey->sk_argument);
    1412              :         }
    1413              :         else
    1414              :         {
    1415              :             RegProcedure cmp_proc;
    1416              : 
    1417         5527 :             cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
    1418         5527 :                                          rel->rd_opcintype[i],
    1419              :                                          bkey->sk_subtype, BTORDER_PROC);
    1420         5527 :             if (!RegProcedureIsValid(cmp_proc))
    1421            0 :                 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
    1422              :                      BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
    1423              :                      bkey->sk_attno, RelationGetRelationName(rel));
    1424         5527 :             ScanKeyEntryInitialize(inskey.scankeys + i,
    1425              :                                    bkey->sk_flags,
    1426         5527 :                                    bkey->sk_attno,
    1427              :                                    InvalidStrategy,
    1428              :                                    bkey->sk_subtype,
    1429              :                                    bkey->sk_collation,
    1430              :                                    cmp_proc,
    1431              :                                    bkey->sk_argument);
    1432              :         }
    1433              :     }
    1434              : 
    1435              :     /*----------
    1436              :      * Examine the selected initial-positioning strategy to determine exactly
    1437              :      * where we need to start the scan, and set flag variables to control the
    1438              :      * initial descent by _bt_search (and our _bt_binsrch call for the leaf
    1439              :      * page _bt_search returns).
    1440              :      *----------
    1441              :      */
    1442      8757350 :     _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
    1443      8757350 :     inskey.anynullkeys = false; /* unused */
    1444      8757350 :     inskey.scantid = NULL;
    1445      8757350 :     inskey.keysz = keysz;
    1446      8757350 :     switch (strat_total)
    1447              :     {
    1448        30798 :         case BTLessStrategyNumber:
    1449              : 
    1450        30798 :             inskey.nextkey = false;
    1451        30798 :             inskey.backward = true;
    1452        30798 :             break;
    1453              : 
    1454            9 :         case BTLessEqualStrategyNumber:
    1455              : 
    1456            9 :             inskey.nextkey = true;
    1457            9 :             inskey.backward = true;
    1458            9 :             break;
    1459              : 
    1460      7841480 :         case BTEqualStrategyNumber:
    1461              : 
    1462              :             /*
    1463              :              * If a backward scan was specified, need to start with last equal
    1464              :              * item not first one.
    1465              :              */
    1466      7841480 :             if (ScanDirectionIsBackward(dir))
    1467              :             {
    1468              :                 /*
    1469              :                  * This is the same as the <= strategy
    1470              :                  */
    1471          103 :                 inskey.nextkey = true;
    1472          103 :                 inskey.backward = true;
    1473              :             }
    1474              :             else
    1475              :             {
    1476              :                 /*
    1477              :                  * This is the same as the >= strategy
    1478              :                  */
    1479      7841377 :                 inskey.nextkey = false;
    1480      7841377 :                 inskey.backward = false;
    1481              :             }
    1482      7841480 :             break;
    1483              : 
    1484         5065 :         case BTGreaterEqualStrategyNumber:
    1485              : 
    1486              :             /*
    1487              :              * Find first item >= scankey
    1488              :              */
    1489         5065 :             inskey.nextkey = false;
    1490         5065 :             inskey.backward = false;
    1491         5065 :             break;
    1492              : 
    1493       879998 :         case BTGreaterStrategyNumber:
    1494              : 
    1495              :             /*
    1496              :              * Find first item > scankey
    1497              :              */
    1498       879998 :             inskey.nextkey = true;
    1499       879998 :             inskey.backward = false;
    1500       879998 :             break;
    1501              : 
    1502            0 :         default:
    1503              :             /* can't get here, but keep compiler quiet */
    1504            0 :             elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
    1505              :             return false;
    1506              :     }
    1507              : 
    1508              :     /*
    1509              :      * Use the manufactured insertion scan key to descend the tree and
    1510              :      * position ourselves on the target leaf page.
    1511              :      */
    1512              :     Assert(ScanDirectionIsBackward(dir) == inskey.backward);
    1513      8757350 :     _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ, false);
    1514              : 
    1515      8757350 :     if (!BufferIsValid(so->currPos.buf))
    1516              :     {
    1517              :         Assert(!so->needPrimScan);
    1518              : 
    1519              :         /*
    1520              :          * We only get here if the index is completely empty. Lock relation
    1521              :          * because nothing finer to lock exists.  Without a buffer lock, it's
    1522              :          * possible for another transaction to insert data between
    1523              :          * _bt_search() and PredicateLockRelation().  We have to try again
    1524              :          * after taking the relation-level predicate lock, to close a narrow
    1525              :          * window where we wouldn't scan concurrently inserted tuples, but the
    1526              :          * writer wouldn't see our predicate lock.
    1527              :          */
    1528       291011 :         if (IsolationIsSerializable())
    1529              :         {
    1530         2826 :             PredicateLockRelation(rel, scan->xs_snapshot);
    1531         2826 :             _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ, false);
    1532              :         }
    1533              : 
    1534       291011 :         if (!BufferIsValid(so->currPos.buf))
    1535              :         {
    1536       291011 :             _bt_parallel_done(scan);
    1537       291011 :             return false;
    1538              :         }
    1539              :     }
    1540              : 
    1541              :     /* position to the precise item on the page */
    1542      8466339 :     offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
    1543              : 
    1544              :     /*
    1545              :      * Now load data from the first page of the scan (usually the page
    1546              :      * currently in so->currPos.buf).
    1547              :      *
    1548              :      * If inskey.nextkey = false and inskey.backward = false, offnum is
    1549              :      * positioned at the first non-pivot tuple >= inskey.scankeys.
    1550              :      *
    1551              :      * If inskey.nextkey = false and inskey.backward = true, offnum is
    1552              :      * positioned at the last non-pivot tuple < inskey.scankeys.
    1553              :      *
    1554              :      * If inskey.nextkey = true and inskey.backward = false, offnum is
    1555              :      * positioned at the first non-pivot tuple > inskey.scankeys.
    1556              :      *
    1557              :      * If inskey.nextkey = true and inskey.backward = true, offnum is
    1558              :      * positioned at the last non-pivot tuple <= inskey.scankeys.
    1559              :      *
    1560              :      * It's possible that _bt_binsrch returned an offnum that is out of bounds
    1561              :      * for the page.  For example, when inskey is both < the leaf page's high
    1562              :      * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
    1563              :      */
    1564      8466339 :     if (!_bt_readfirstpage(scan, offnum, dir))
    1565      2109177 :         return false;
    1566              : 
    1567      6357162 :     _bt_returnitem(scan, so);
    1568      6357162 :     return true;
    1569              : }
    1570              : 
    1571              : /*
    1572              :  *  _bt_next() -- Get the next item in a scan.
    1573              :  *
    1574              :  *      On entry, so->currPos describes the current page, which may be pinned
    1575              :  *      but is not locked, and so->currPos.itemIndex identifies which item was
    1576              :  *      previously returned.
    1577              :  *
    1578              :  *      On success exit, so->currPos is updated as needed, and _bt_returnitem
    1579              :  *      sets the next item to return to the scan.  so->currPos remains valid.
    1580              :  *
    1581              :  *      On failure exit (no more tuples), we invalidate so->currPos.  It'll
    1582              :  *      still be possible for the scan to return tuples by changing direction,
    1583              :  *      though we'll need to call _bt_first anew in that other direction.
    1584              :  */
    1585              : bool
    1586     10294035 : _bt_next(IndexScanDesc scan, ScanDirection dir)
    1587              : {
    1588     10294035 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1589              : 
    1590              :     Assert(BTScanPosIsValid(so->currPos));
    1591              : 
    1592              :     /*
    1593              :      * Advance to next tuple on current page; or if there's no more, try to
    1594              :      * step to the next page with data.
    1595              :      */
    1596     10294035 :     if (ScanDirectionIsForward(dir))
    1597              :     {
    1598     10281102 :         if (++so->currPos.itemIndex > so->currPos.lastItem)
    1599              :         {
    1600      1425079 :             if (!_bt_steppage(scan, dir))
    1601      1409377 :                 return false;
    1602              :         }
    1603              :     }
    1604              :     else
    1605              :     {
    1606        12933 :         if (--so->currPos.itemIndex < so->currPos.firstItem)
    1607              :         {
    1608           58 :             if (!_bt_steppage(scan, dir))
    1609           46 :                 return false;
    1610              :         }
    1611              :     }
    1612              : 
    1613      8884612 :     _bt_returnitem(scan, so);
    1614      8884612 :     return true;
    1615              : }
    1616              : 
    1617              : /*
    1618              :  * Return the index item from so->currPos.items[so->currPos.itemIndex] to the
    1619              :  * index scan by setting the relevant fields in caller's index scan descriptor
    1620              :  */
    1621              : static inline void
    1622     15283978 : _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
    1623              : {
    1624     15283978 :     BTScanPosItem *currItem = &so->currPos.items[so->currPos.itemIndex];
    1625              : 
    1626              :     /* Most recent _bt_readpage must have succeeded */
    1627              :     Assert(BTScanPosIsValid(so->currPos));
    1628              :     Assert(so->currPos.itemIndex >= so->currPos.firstItem);
    1629              :     Assert(so->currPos.itemIndex <= so->currPos.lastItem);
    1630              : 
    1631              :     /* Return next item, per amgettuple contract */
    1632     15283978 :     scan->xs_heaptid = currItem->heapTid;
    1633     15283978 :     if (so->currTuples)
    1634      2087797 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1635     15283978 : }
    1636              : 
    1637              : /*
    1638              :  *  _bt_steppage() -- Step to next page containing valid data for scan
    1639              :  *
    1640              :  * Wrapper on _bt_readnextpage that performs final steps for the current page.
    1641              :  *
    1642              :  * On entry, so->currPos must be valid.  Its buffer will be pinned, though
    1643              :  * never locked. (Actually, when so->dropPin there won't even be a pin held,
    1644              :  * though so->currPos.currPage must still be set to a valid block number.)
    1645              :  */
    1646              : static bool
    1647      3535309 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
    1648              : {
    1649      3535309 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1650              :     BlockNumber blkno,
    1651              :                 lastcurrblkno;
    1652              : 
    1653              :     Assert(BTScanPosIsValid(so->currPos));
    1654              : 
    1655              :     /* Before leaving current page, deal with any killed items */
    1656      3535309 :     if (so->numKilled > 0)
    1657        43246 :         _bt_killitems(scan);
    1658              : 
    1659              :     /*
    1660              :      * Before we modify currPos, make a copy of the page data if there was a
    1661              :      * mark position that needs it.
    1662              :      */
    1663      3535309 :     if (so->markItemIndex >= 0)
    1664              :     {
    1665              :         /* bump pin on current buffer for assignment to mark buffer */
    1666          181 :         if (BTScanPosIsPinned(so->currPos))
    1667          174 :             IncrBufferRefCount(so->currPos.buf);
    1668          181 :         memcpy(&so->markPos, &so->currPos,
    1669              :                offsetof(BTScanPosData, items[1]) +
    1670          181 :                so->currPos.lastItem * sizeof(BTScanPosItem));
    1671          181 :         if (so->markTuples)
    1672          174 :             memcpy(so->markTuples, so->currTuples,
    1673          174 :                    so->currPos.nextTupleOffset);
    1674          181 :         so->markPos.itemIndex = so->markItemIndex;
    1675          181 :         so->markItemIndex = -1;
    1676              : 
    1677              :         /*
    1678              :          * If we're just about to start the next primitive index scan
    1679              :          * (possible with a scan that has arrays keys, and needs to skip to
    1680              :          * continue in the current scan direction), moreLeft/moreRight only
    1681              :          * indicate the end of the current primitive index scan.  They must
    1682              :          * never be taken to indicate that the top-level index scan has ended
    1683              :          * (that would be wrong).
    1684              :          *
    1685              :          * We could handle this case by treating the current array keys as
    1686              :          * markPos state.  But depending on the current array state like this
    1687              :          * would add complexity.  Instead, we just unset markPos's copy of
    1688              :          * moreRight or moreLeft (whichever might be affected), while making
    1689              :          * btrestrpos reset the scan's arrays to their initial scan positions.
    1690              :          * In effect, btrestrpos leaves advancing the arrays up to the first
    1691              :          * _bt_readpage call (that takes place after it has restored markPos).
    1692              :          */
    1693          181 :         if (so->needPrimScan)
    1694              :         {
    1695            0 :             if (ScanDirectionIsForward(so->currPos.dir))
    1696            0 :                 so->markPos.moreRight = true;
    1697              :             else
    1698            0 :                 so->markPos.moreLeft = true;
    1699              :         }
    1700              : 
    1701              :         /* mark/restore not supported by parallel scans */
    1702              :         Assert(!scan->parallel_scan);
    1703              :     }
    1704              : 
    1705      3535309 :     BTScanPosUnpinIfPinned(so->currPos);
    1706              : 
    1707              :     /* Walk to the next page with data */
    1708      3535309 :     if (ScanDirectionIsForward(dir))
    1709      3535177 :         blkno = so->currPos.nextPage;
    1710              :     else
    1711          132 :         blkno = so->currPos.prevPage;
    1712      3535309 :     lastcurrblkno = so->currPos.currPage;
    1713              : 
    1714              :     /*
    1715              :      * Cancel primitive index scans that were scheduled when the call to
    1716              :      * _bt_readpage for currPos happened to use the opposite direction to the
    1717              :      * one that we're stepping in now.  (It's okay to leave the scan's array
    1718              :      * keys as-is, since the next _bt_readpage will advance them.)
    1719              :      */
    1720      3535309 :     if (so->currPos.dir != dir)
    1721           18 :         so->needPrimScan = false;
    1722              : 
    1723      3535309 :     return _bt_readnextpage(scan, blkno, lastcurrblkno, dir, false);
    1724              : }
    1725              : 
    1726              : /*
    1727              :  *  _bt_readfirstpage() -- Read first page containing valid data for _bt_first
    1728              :  *
    1729              :  * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
    1730              :  * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
    1731              :  * When we're passed an offnum past the end of the page, we might still manage
    1732              :  * to stop the scan on this page by calling _bt_checkkeys against the high
    1733              :  * key.  See _bt_readpage for full details.
    1734              :  *
    1735              :  * On entry, so->currPos must be pinned and locked (so offnum stays valid).
    1736              :  * Parallel scan callers must have seized the scan before calling here.
    1737              :  *
    1738              :  * On exit, we'll have updated so->currPos and retained locks and pins
    1739              :  * according to the same rules as those laid out for _bt_readnextpage exit.
    1740              :  * Like _bt_readnextpage, our return value indicates if there are any matching
    1741              :  * records in the given direction.
    1742              :  *
    1743              :  * We always release the scan for a parallel scan caller, regardless of
    1744              :  * success or failure; we'll call _bt_parallel_release as soon as possible.
    1745              :  */
    1746              : static bool
    1747      8509421 : _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
    1748              : {
    1749      8509421 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1750              : 
    1751      8509421 :     so->numKilled = 0;           /* just paranoia */
    1752      8509421 :     so->markItemIndex = -1;      /* ditto */
    1753              : 
    1754              :     /* Initialize so->currPos for the first page (page in so->currPos.buf) */
    1755      8509421 :     if (so->needPrimScan)
    1756              :     {
    1757              :         Assert(so->numArrayKeys);
    1758              : 
    1759         8802 :         so->currPos.moreLeft = true;
    1760         8802 :         so->currPos.moreRight = true;
    1761         8802 :         so->needPrimScan = false;
    1762              :     }
    1763      8500619 :     else if (ScanDirectionIsForward(dir))
    1764              :     {
    1765      8469694 :         so->currPos.moreLeft = false;
    1766      8469694 :         so->currPos.moreRight = true;
    1767              :     }
    1768              :     else
    1769              :     {
    1770        30925 :         so->currPos.moreLeft = true;
    1771        30925 :         so->currPos.moreRight = false;
    1772              :     }
    1773              : 
    1774              :     /*
    1775              :      * Attempt to load matching tuples from the first page.
    1776              :      *
    1777              :      * Note that _bt_readpage will finish initializing the so->currPos fields.
    1778              :      * _bt_readpage also releases parallel scan (even when it returns false).
    1779              :      */
    1780      8509421 :     if (_bt_readpage(scan, dir, offnum, true))
    1781              :     {
    1782      6399249 :         Relation    rel = scan->indexRelation;
    1783              : 
    1784              :         /*
    1785              :          * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
    1786              :          * so->currPos.buf in preparation for btgettuple returning tuples.
    1787              :          */
    1788              :         Assert(BTScanPosIsPinned(so->currPos));
    1789      6399249 :         _bt_drop_lock_and_maybe_pin(rel, so);
    1790      6399249 :         return true;
    1791              :     }
    1792              : 
    1793              :     /* There's no actually-matching data on the page in so->currPos.buf */
    1794      2110172 :     _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
    1795              : 
    1796              :     /* Call _bt_readnextpage using its _bt_steppage wrapper function */
    1797      2110172 :     if (!_bt_steppage(scan, dir))
    1798      2110071 :         return false;
    1799              : 
    1800              :     /* _bt_readpage for a later page (now in so->currPos) succeeded */
    1801          101 :     return true;
    1802              : }
    1803              : 
    1804              : /*
    1805              :  *  _bt_readnextpage() -- Read next page containing valid data for _bt_next
    1806              :  *
    1807              :  * Caller's blkno is the next interesting page's link, taken from either the
    1808              :  * previously-saved right link or left link.  lastcurrblkno is the page that
    1809              :  * was current at the point where the blkno link was saved, which we use to
    1810              :  * reason about concurrent page splits/page deletions during backwards scans.
    1811              :  * In the common case where seized=false, blkno is either so->currPos.nextPage
    1812              :  * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
    1813              :  *
    1814              :  * On entry, so->currPos shouldn't be locked by caller.  so->currPos.buf must
    1815              :  * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
    1816              :  * won't need to be read again in almost all cases).  Parallel scan callers
    1817              :  * that seized the scan before calling here should pass seized=true; such a
    1818              :  * caller's blkno and lastcurrblkno arguments come from the seized scan.
    1819              :  * seized=false callers just pass us the blkno/lastcurrblkno taken from their
    1820              :  * so->currPos, which (along with so->currPos itself) can be used to end the
    1821              :  * scan.  A seized=false caller's blkno can never be assumed to be the page
    1822              :  * that must be read next during a parallel scan, though.  We must figure that
    1823              :  * part out for ourselves by seizing the scan (the correct page to read might
    1824              :  * already be beyond the seized=false caller's blkno during a parallel scan,
    1825              :  * unless blkno/so->currPos.nextPage/so->currPos.prevPage is already P_NONE,
    1826              :  * or unless so->currPos.moreRight/so->currPos.moreLeft is already unset).
    1827              :  *
    1828              :  * On success exit, so->currPos is updated to contain data from the next
    1829              :  * interesting page, and we return true.  We hold a pin on the buffer on
    1830              :  * success exit (except during so->dropPin index scans, when we drop the pin
    1831              :  * eagerly to avoid blocking VACUUM).
    1832              :  *
    1833              :  * If there are no more matching records in the given direction, we invalidate
    1834              :  * so->currPos (while ensuring it retains no locks or pins), and return false.
    1835              :  *
    1836              :  * We always release the scan for a parallel scan caller, regardless of
    1837              :  * success or failure; we'll call _bt_parallel_release as soon as possible.
    1838              :  */
    1839              : static bool
    1840      3535325 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
    1841              :                  BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
    1842              : {
    1843      3535325 :     Relation    rel = scan->indexRelation;
    1844      3535325 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1845              : 
    1846              :     Assert(so->currPos.currPage == lastcurrblkno || seized);
    1847              :     Assert(!(blkno == P_NONE && seized));
    1848              :     Assert(!BTScanPosIsPinned(so->currPos));
    1849              : 
    1850              :     /*
    1851              :      * Remember that the scan already read lastcurrblkno, a page to the left
    1852              :      * of blkno (or remember reading a page to the right, for backwards scans)
    1853              :      */
    1854      3535325 :     if (ScanDirectionIsForward(dir))
    1855      3535193 :         so->currPos.moreLeft = true;
    1856              :     else
    1857          132 :         so->currPos.moreRight = true;
    1858              : 
    1859              :     for (;;)
    1860         1217 :     {
    1861              :         Page        page;
    1862              :         BTPageOpaque opaque;
    1863              : 
    1864      3536542 :         if (blkno == P_NONE ||
    1865              :             (ScanDirectionIsForward(dir) ?
    1866      1109670 :              !so->currPos.moreRight : !so->currPos.moreLeft))
    1867              :         {
    1868              :             /* most recent _bt_readpage call (for lastcurrblkno) ended scan */
    1869              :             Assert(so->currPos.currPage == lastcurrblkno && !seized);
    1870      3519478 :             BTScanPosInvalidate(so->currPos);
    1871      3519478 :             _bt_parallel_done(scan);    /* iff !so->needPrimScan */
    1872      3519478 :             return false;
    1873              :         }
    1874              : 
    1875              :         Assert(!so->needPrimScan);
    1876              : 
    1877              :         /* parallel scan must never actually visit so->currPos blkno */
    1878        17064 :         if (!seized && scan->parallel_scan != NULL &&
    1879          606 :             !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, false))
    1880              :         {
    1881              :             /* whole scan is now done (or another primitive scan required) */
    1882           16 :             BTScanPosInvalidate(so->currPos);
    1883           16 :             return false;
    1884              :         }
    1885              : 
    1886        17048 :         if (ScanDirectionIsForward(dir))
    1887              :         {
    1888              :             /* read blkno, but check for interrupts first */
    1889        16966 :             CHECK_FOR_INTERRUPTS();
    1890        16966 :             so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    1891              :         }
    1892              :         else
    1893              :         {
    1894              :             /* read blkno, avoiding race (also checks for interrupts) */
    1895           82 :             so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno,
    1896              :                                                          lastcurrblkno);
    1897           82 :             if (so->currPos.buf == InvalidBuffer)
    1898              :             {
    1899              :                 /* must have been a concurrent deletion of leftmost page */
    1900            0 :                 BTScanPosInvalidate(so->currPos);
    1901            0 :                 _bt_parallel_done(scan);
    1902            0 :                 return false;
    1903              :             }
    1904              :         }
    1905              : 
    1906        17048 :         page = BufferGetPage(so->currPos.buf);
    1907        17048 :         opaque = BTPageGetOpaque(page);
    1908        17048 :         lastcurrblkno = blkno;
    1909        17048 :         if (likely(!P_IGNORE(opaque)))
    1910              :         {
    1911              :             /* see if there are any matches on this page */
    1912        17048 :             if (ScanDirectionIsForward(dir))
    1913              :             {
    1914              :                 /* note that this will clear moreRight if we can stop */
    1915        16966 :                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), seized))
    1916        15760 :                     break;
    1917         1206 :                 blkno = so->currPos.nextPage;
    1918              :             }
    1919              :             else
    1920              :             {
    1921              :                 /* note that this will clear moreLeft if we can stop */
    1922           82 :                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), seized))
    1923           71 :                     break;
    1924           11 :                 blkno = so->currPos.prevPage;
    1925              :             }
    1926              :         }
    1927              :         else
    1928              :         {
    1929              :             /* _bt_readpage not called, so do all this for ourselves */
    1930            0 :             if (ScanDirectionIsForward(dir))
    1931            0 :                 blkno = opaque->btpo_next;
    1932              :             else
    1933            0 :                 blkno = opaque->btpo_prev;
    1934            0 :             if (scan->parallel_scan != NULL)
    1935            0 :                 _bt_parallel_release(scan, blkno, lastcurrblkno);
    1936              :         }
    1937              : 
    1938              :         /* no matching tuples on this page */
    1939         1217 :         _bt_relbuf(rel, so->currPos.buf);
    1940         1217 :         seized = false;         /* released by _bt_readpage (or by us) */
    1941              :     }
    1942              : 
    1943              :     /*
    1944              :      * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
    1945              :      * so->currPos.buf in preparation for btgettuple returning tuples.
    1946              :      */
    1947              :     Assert(so->currPos.currPage == blkno);
    1948              :     Assert(BTScanPosIsPinned(so->currPos));
    1949        15831 :     _bt_drop_lock_and_maybe_pin(rel, so);
    1950              : 
    1951        15831 :     return true;
    1952              : }
    1953              : 
    1954              : /*
    1955              :  * _bt_lock_and_validate_left() -- lock caller's left sibling blkno,
    1956              :  * recovering from concurrent page splits/page deletions when necessary
    1957              :  *
    1958              :  * Called during backwards scans, to deal with their unique concurrency rules.
    1959              :  *
    1960              :  * blkno points to the block number of the page that we expect to move the
    1961              :  * scan to.  We'll successfully move the scan there when we find that its
    1962              :  * right sibling link still points to lastcurrblkno (the page we just read).
    1963              :  * Otherwise, we have to figure out which page is the correct one for the scan
    1964              :  * to now read the hard way, reasoning about concurrent splits and deletions.
    1965              :  * See nbtree/README.
    1966              :  *
    1967              :  * On return, we have both a pin and a read lock on the returned page, whose
    1968              :  * block number will be set in *blkno.  Returns InvalidBuffer if there is no
    1969              :  * page to the left (no lock or pin is held in that case).
    1970              :  *
    1971              :  * It is possible for the returned leaf page to be half-dead; caller must
    1972              :  * check that condition and step left again when required.
    1973              :  */
    1974              : static Buffer
    1975           82 : _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
    1976              :                            BlockNumber lastcurrblkno)
    1977              : {
    1978           82 :     BlockNumber origblkno = *blkno; /* detects circular links */
    1979              : 
    1980              :     for (;;)
    1981            0 :     {
    1982              :         Buffer      buf;
    1983              :         Page        page;
    1984              :         BTPageOpaque opaque;
    1985              :         int         tries;
    1986              : 
    1987              :         /* check for interrupts while we're not holding any buffer lock */
    1988           82 :         CHECK_FOR_INTERRUPTS();
    1989           82 :         buf = _bt_getbuf(rel, *blkno, BT_READ);
    1990           82 :         page = BufferGetPage(buf);
    1991           82 :         opaque = BTPageGetOpaque(page);
    1992              : 
    1993              :         /*
    1994              :          * If this isn't the page we want, walk right till we find what we
    1995              :          * want --- but go no more than four hops (an arbitrary limit). If we
    1996              :          * don't find the correct page by then, the most likely bet is that
    1997              :          * lastcurrblkno got deleted and isn't in the sibling chain at all
    1998              :          * anymore, not that its left sibling got split more than four times.
    1999              :          *
    2000              :          * Note that it is correct to test P_ISDELETED not P_IGNORE here,
    2001              :          * because half-dead pages are still in the sibling chain.
    2002              :          */
    2003           82 :         tries = 0;
    2004              :         for (;;)
    2005              :         {
    2006           82 :             if (likely(!P_ISDELETED(opaque) &&
    2007              :                        opaque->btpo_next == lastcurrblkno))
    2008              :             {
    2009              :                 /* Found desired page, return it */
    2010           82 :                 return buf;
    2011              :             }
    2012            0 :             if (P_RIGHTMOST(opaque) || ++tries > 4)
    2013              :                 break;
    2014              :             /* step right */
    2015            0 :             *blkno = opaque->btpo_next;
    2016            0 :             buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ);
    2017            0 :             page = BufferGetPage(buf);
    2018            0 :             opaque = BTPageGetOpaque(page);
    2019              :         }
    2020              : 
    2021              :         /*
    2022              :          * Return to the original page (usually the page most recently read by
    2023              :          * _bt_readpage, which is passed by caller as lastcurrblkno) to see
    2024              :          * what's up with its prev sibling link
    2025              :          */
    2026            0 :         buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
    2027            0 :         page = BufferGetPage(buf);
    2028            0 :         opaque = BTPageGetOpaque(page);
    2029            0 :         if (P_ISDELETED(opaque))
    2030              :         {
    2031              :             /*
    2032              :              * It was deleted.  Move right to first nondeleted page (there
    2033              :              * must be one); that is the page that has acquired the deleted
    2034              :              * one's keyspace, so stepping left from it will take us where we
    2035              :              * want to be.
    2036              :              */
    2037              :             for (;;)
    2038              :             {
    2039            0 :                 if (P_RIGHTMOST(opaque))
    2040            0 :                     elog(ERROR, "fell off the end of index \"%s\"",
    2041              :                          RelationGetRelationName(rel));
    2042            0 :                 lastcurrblkno = opaque->btpo_next;
    2043            0 :                 buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ);
    2044            0 :                 page = BufferGetPage(buf);
    2045            0 :                 opaque = BTPageGetOpaque(page);
    2046            0 :                 if (!P_ISDELETED(opaque))
    2047            0 :                     break;
    2048              :             }
    2049              :         }
    2050              :         else
    2051              :         {
    2052              :             /*
    2053              :              * Original lastcurrblkno wasn't deleted; the explanation had
    2054              :              * better be that the page to the left got split or deleted.
    2055              :              * Without this check, we risk going into an infinite loop.
    2056              :              */
    2057            0 :             if (opaque->btpo_prev == origblkno)
    2058            0 :                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
    2059              :                      lastcurrblkno, RelationGetRelationName(rel));
    2060              :             /* Okay to try again, since left sibling link changed */
    2061              :         }
    2062              : 
    2063              :         /*
    2064              :          * Original lastcurrblkno from caller was concurrently deleted (could
    2065              :          * also have been a great many concurrent left sibling page splits).
    2066              :          * Found a non-deleted page that should now act as our lastcurrblkno.
    2067              :          */
    2068            0 :         if (P_LEFTMOST(opaque))
    2069              :         {
    2070              :             /* New lastcurrblkno has no left sibling (concurrently deleted) */
    2071            0 :             _bt_relbuf(rel, buf);
    2072            0 :             break;
    2073              :         }
    2074              : 
    2075              :         /* Start from scratch with new lastcurrblkno's blkno/prev link */
    2076            0 :         *blkno = origblkno = opaque->btpo_prev;
    2077            0 :         _bt_relbuf(rel, buf);
    2078              :     }
    2079              : 
    2080            0 :     return InvalidBuffer;
    2081              : }
    2082              : 
    2083              : /*
    2084              :  * _bt_get_endpoint() -- Find the first or last page on a given tree level
    2085              :  *
    2086              :  * If the index is empty, we will return InvalidBuffer; any other failure
    2087              :  * condition causes ereport().  We will not return a dead page.
    2088              :  *
    2089              :  * The returned buffer is pinned and read-locked.
    2090              :  */
    2091              : Buffer
    2092        47035 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
    2093              : {
    2094              :     Buffer      buf;
    2095              :     Page        page;
    2096              :     BTPageOpaque opaque;
    2097              :     OffsetNumber offnum;
    2098              :     BlockNumber blkno;
    2099              :     IndexTuple  itup;
    2100              : 
    2101              :     /*
    2102              :      * If we are looking for a leaf page, okay to descend from fast root;
    2103              :      * otherwise better descend from true root.  (There is no point in being
    2104              :      * smarter about intermediate levels.)
    2105              :      */
    2106        47035 :     if (level == 0)
    2107        47023 :         buf = _bt_getroot(rel, NULL, BT_READ);
    2108              :     else
    2109           12 :         buf = _bt_gettrueroot(rel);
    2110              : 
    2111        47035 :     if (!BufferIsValid(buf))
    2112         3941 :         return InvalidBuffer;
    2113              : 
    2114        43094 :     page = BufferGetPage(buf);
    2115        43094 :     opaque = BTPageGetOpaque(page);
    2116              : 
    2117              :     for (;;)
    2118              :     {
    2119              :         /*
    2120              :          * If we landed on a deleted page, step right to find a live page
    2121              :          * (there must be one).  Also, if we want the rightmost page, step
    2122              :          * right if needed to get to it (this could happen if the page split
    2123              :          * since we obtained a pointer to it).
    2124              :          */
    2125        54997 :         while (P_IGNORE(opaque) ||
    2126           33 :                (rightmost && !P_RIGHTMOST(opaque)))
    2127              :         {
    2128            0 :             blkno = opaque->btpo_next;
    2129            0 :             if (blkno == P_NONE)
    2130            0 :                 elog(ERROR, "fell off the end of index \"%s\"",
    2131              :                      RelationGetRelationName(rel));
    2132            0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2133            0 :             page = BufferGetPage(buf);
    2134            0 :             opaque = BTPageGetOpaque(page);
    2135              :         }
    2136              : 
    2137              :         /* Done? */
    2138        54997 :         if (opaque->btpo_level == level)
    2139        43094 :             break;
    2140        11903 :         if (opaque->btpo_level < level)
    2141            0 :             ereport(ERROR,
    2142              :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2143              :                      errmsg_internal("btree level %u not found in index \"%s\"",
    2144              :                                      level, RelationGetRelationName(rel))));
    2145              : 
    2146              :         /* Descend to leftmost or rightmost child page */
    2147        11903 :         if (rightmost)
    2148            3 :             offnum = PageGetMaxOffsetNumber(page);
    2149              :         else
    2150        11900 :             offnum = P_FIRSTDATAKEY(opaque);
    2151              : 
    2152        11903 :         if (offnum < 1 || offnum > PageGetMaxOffsetNumber(page))
    2153            0 :             elog(PANIC, "offnum out of range");
    2154              : 
    2155        11903 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    2156        11903 :         blkno = BTreeTupleGetDownLink(itup);
    2157              : 
    2158        11903 :         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2159        11903 :         page = BufferGetPage(buf);
    2160        11903 :         opaque = BTPageGetOpaque(page);
    2161              :     }
    2162              : 
    2163        43094 :     return buf;
    2164              : }
    2165              : 
    2166              : /*
    2167              :  *  _bt_endpoint() -- Find the first or last page in the index, and scan
    2168              :  * from there to the first key satisfying all the quals.
    2169              :  *
    2170              :  * This is used by _bt_first() to set up a scan when we've determined
    2171              :  * that the scan must start at the beginning or end of the index (for
    2172              :  * a forward or backward scan respectively).
    2173              :  *
    2174              :  * Parallel scan callers must have seized the scan before calling here.
    2175              :  * Exit conditions are the same as for _bt_first().
    2176              :  */
    2177              : static bool
    2178        47023 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
    2179              : {
    2180        47023 :     Relation    rel = scan->indexRelation;
    2181        47023 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2182              :     Page        page;
    2183              :     BTPageOpaque opaque;
    2184              :     OffsetNumber start;
    2185              : 
    2186              :     Assert(!BTScanPosIsValid(so->currPos));
    2187              :     Assert(!so->needPrimScan);
    2188              : 
    2189              :     /*
    2190              :      * Scan down to the leftmost or rightmost leaf page.  This is a simplified
    2191              :      * version of _bt_search().
    2192              :      */
    2193        47023 :     so->currPos.buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
    2194              : 
    2195        47023 :     if (!BufferIsValid(so->currPos.buf))
    2196              :     {
    2197              :         /*
    2198              :          * Empty index. Lock the whole relation, as nothing finer to lock
    2199              :          * exists.
    2200              :          */
    2201         3941 :         PredicateLockRelation(rel, scan->xs_snapshot);
    2202         3941 :         _bt_parallel_done(scan);
    2203         3941 :         return false;
    2204              :     }
    2205              : 
    2206        43082 :     page = BufferGetPage(so->currPos.buf);
    2207        43082 :     opaque = BTPageGetOpaque(page);
    2208              :     Assert(P_ISLEAF(opaque));
    2209              : 
    2210        43082 :     if (ScanDirectionIsForward(dir))
    2211              :     {
    2212              :         /* There could be dead pages to the left, so not this: */
    2213              :         /* Assert(P_LEFTMOST(opaque)); */
    2214              : 
    2215        43052 :         start = P_FIRSTDATAKEY(opaque);
    2216              :     }
    2217           30 :     else if (ScanDirectionIsBackward(dir))
    2218              :     {
    2219              :         Assert(P_RIGHTMOST(opaque));
    2220              : 
    2221           30 :         start = PageGetMaxOffsetNumber(page);
    2222              :     }
    2223              :     else
    2224              :     {
    2225            0 :         elog(ERROR, "invalid scan direction: %d", (int) dir);
    2226              :         start = 0;              /* keep compiler quiet */
    2227              :     }
    2228              : 
    2229              :     /*
    2230              :      * Now load data from the first page of the scan.
    2231              :      */
    2232        43082 :     if (!_bt_readfirstpage(scan, start, dir))
    2233          894 :         return false;
    2234              : 
    2235        42188 :     _bt_returnitem(scan, so);
    2236        42188 :     return true;
    2237              : }
        

Generated by: LCOV version 2.0-1