LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsearch.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13beta1 Lines: 574 677 84.8 %
Date: 2020-06-01 08:06:25 Functions: 19 20 95.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsearch.c
       4             :  *    Search code for postgres btrees.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/nbtree/nbtsearch.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include "access/nbtree.h"
      19             : #include "access/relscan.h"
      20             : #include "miscadmin.h"
      21             : #include "pgstat.h"
      22             : #include "storage/predicate.h"
      23             : #include "utils/lsyscache.h"
      24             : #include "utils/rel.h"
      25             : 
      26             : 
      27             : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
      28             : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
      29             : static int  _bt_binsrch_posting(BTScanInsert key, Page page,
      30             :                                 OffsetNumber offnum);
      31             : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
      32             :                          OffsetNumber offnum);
      33             : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
      34             :                          OffsetNumber offnum, IndexTuple itup);
      35             : static int  _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
      36             :                                   OffsetNumber offnum, ItemPointer heapTid,
      37             :                                   IndexTuple itup);
      38             : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
      39             :                                        OffsetNumber offnum,
      40             :                                        ItemPointer heapTid, int tupleOffset);
      41             : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
      42             : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
      43             : static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
      44             :                                   ScanDirection dir);
      45             : static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
      46             : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
      47             : static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
      48             : 
      49             : 
      50             : /*
      51             :  *  _bt_drop_lock_and_maybe_pin()
      52             :  *
      53             :  * Unlock the buffer; and if it is safe to release the pin, do that, too.  It
      54             :  * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
      55             :  * This will prevent vacuum from stalling in a blocked state trying to read a
      56             :  * page when a cursor is sitting on it -- at least in many important cases.
      57             :  *
      58             :  * Set the buffer to invalid if the pin is released, since the buffer may be
      59             :  * re-used.  If we need to go back to this block (for example, to apply
      60             :  * LP_DEAD hints) we must get a fresh reference to the buffer.  Hopefully it
      61             :  * will remain in shared memory for as long as it takes to scan the index
      62             :  * buffer page.
      63             :  */
      64             : static void
      65     8178484 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
      66             : {
      67     8178484 :     LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
      68             : 
      69     8178484 :     if (IsMVCCSnapshot(scan->xs_snapshot) &&
      70     8061950 :         RelationNeedsWAL(scan->indexRelation) &&
      71     8057682 :         !scan->xs_want_itup)
      72             :     {
      73     8039024 :         ReleaseBuffer(sp->buf);
      74     8039024 :         sp->buf = InvalidBuffer;
      75             :     }
      76     8178484 : }
      77             : 
      78             : /*
      79             :  *  _bt_search() -- Search the tree for a particular scankey,
      80             :  *      or more precisely for the first leaf page it could be on.
      81             :  *
      82             :  * The passed scankey is an insertion-type scankey (see nbtree/README),
      83             :  * but it can omit the rightmost column(s) of the index.
      84             :  *
      85             :  * Return value is a stack of parent-page pointers.  *bufP is set to the
      86             :  * address of the leaf-page buffer, which is locked and pinned.  No locks
      87             :  * are held on the parent pages, however!
      88             :  *
      89             :  * If the snapshot parameter is not NULL, "old snapshot" checking will take
      90             :  * place during the descent through the tree.  This is not needed when
      91             :  * positioning for an insert or delete, so NULL is used for those cases.
      92             :  *
      93             :  * The returned buffer is locked according to access parameter.  Additionally,
      94             :  * access = BT_WRITE will allow an empty root page to be created and returned.
      95             :  * When access = BT_READ, an empty index will result in *bufP being set to
      96             :  * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
      97             :  * during the search will be finished.
      98             :  */
      99             : BTStack
     100    22555864 : _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
     101             :            Snapshot snapshot)
     102             : {
     103    22555864 :     BTStack     stack_in = NULL;
     104    22555864 :     int         page_access = BT_READ;
     105             : 
     106             :     /* Get the root page to start with */
     107    22555864 :     *bufP = _bt_getroot(rel, access);
     108             : 
     109             :     /* If index is empty and access = BT_READ, no root page is created. */
     110    22555864 :     if (!BufferIsValid(*bufP))
     111      352448 :         return (BTStack) NULL;
     112             : 
     113             :     /* Loop iterates once per level descended in the tree */
     114             :     for (;;)
     115    19141088 :     {
     116             :         Page        page;
     117             :         BTPageOpaque opaque;
     118             :         OffsetNumber offnum;
     119             :         ItemId      itemid;
     120             :         IndexTuple  itup;
     121             :         BlockNumber blkno;
     122             :         BlockNumber par_blkno;
     123             :         BTStack     new_stack;
     124             : 
     125             :         /*
     126             :          * Race -- the page we just grabbed may have split since we read its
     127             :          * pointer in the parent (or metapage).  If it has, we may need to
     128             :          * move right to its new sibling.  Do that.
     129             :          *
     130             :          * In write-mode, allow _bt_moveright to finish any incomplete splits
     131             :          * along the way.  Strictly speaking, we'd only need to finish an
     132             :          * incomplete split on the leaf page we're about to insert to, not on
     133             :          * any of the upper levels (they are taken care of in _bt_getstackbuf,
     134             :          * if the leaf page is split and we insert to the parent page).  But
     135             :          * this is a good opportunity to finish splits of internal pages too.
     136             :          */
     137    41344504 :         *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
     138             :                               page_access, snapshot);
     139             : 
     140             :         /* if this is a leaf page, we're done */
     141    41344504 :         page = BufferGetPage(*bufP);
     142    41344504 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     143    41344504 :         if (P_ISLEAF(opaque))
     144    22203416 :             break;
     145             : 
     146             :         /*
     147             :          * Find the appropriate item on the internal page, and get the child
     148             :          * page that it points to.
     149             :          */
     150    19141088 :         offnum = _bt_binsrch(rel, key, *bufP);
     151    19141088 :         itemid = PageGetItemId(page, offnum);
     152    19141088 :         itup = (IndexTuple) PageGetItem(page, itemid);
     153             :         Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
     154    19141088 :         blkno = BTreeTupleGetDownLink(itup);
     155    19141088 :         par_blkno = BufferGetBlockNumber(*bufP);
     156             : 
     157             :         /*
     158             :          * We need to save the location of the pivot tuple we chose in the
     159             :          * parent page on a stack.  If we need to split a page, we'll use the
     160             :          * stack to work back up to its parent page.  If caller ends up
     161             :          * splitting a page one level down, it usually ends up inserting a new
     162             :          * pivot tuple/downlink immediately after the location recorded here.
     163             :          */
     164    19141088 :         new_stack = (BTStack) palloc(sizeof(BTStackData));
     165    19141088 :         new_stack->bts_blkno = par_blkno;
     166    19141088 :         new_stack->bts_offset = offnum;
     167    19141088 :         new_stack->bts_parent = stack_in;
     168             : 
     169             :         /*
     170             :          * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
     171             :          * we're on the level 1 and asked to lock leaf page in write mode,
     172             :          * then lock next page in write mode, because it must be a leaf.
     173             :          */
     174    19141088 :         if (opaque->btpo.level == 1 && access == BT_WRITE)
     175     9922500 :             page_access = BT_WRITE;
     176             : 
     177             :         /* drop the read lock on the parent page, acquire one on the child */
     178    19141088 :         *bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);
     179             : 
     180             :         /* okay, all set to move down a level */
     181    19141088 :         stack_in = new_stack;
     182             :     }
     183             : 
     184             :     /*
     185             :      * If we're asked to lock leaf in write mode, but didn't manage to, then
     186             :      * relock.  This should only happen when the root page is a leaf page (and
     187             :      * the only page in the index other than the metapage).
     188             :      */
     189    22203416 :     if (access == BT_WRITE && page_access == BT_READ)
     190             :     {
     191             :         /* trade in our read lock for a write lock */
     192     1090466 :         LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
     193     1090466 :         LockBuffer(*bufP, BT_WRITE);
     194             : 
     195             :         /*
     196             :          * If the page was split between the time that we surrendered our read
     197             :          * lock and acquired our write lock, then this page may no longer be
     198             :          * the right place for the key we want to insert.  In this case, we
     199             :          * need to move right in the tree.  See Lehman and Yao for an
     200             :          * excruciatingly precise description.
     201             :          */
     202     1090466 :         *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
     203             :                               snapshot);
     204             :     }
     205             : 
     206    22203416 :     return stack_in;
     207             : }
     208             : 
     209             : /*
     210             :  *  _bt_moveright() -- move right in the btree if necessary.
     211             :  *
     212             :  * When we follow a pointer to reach a page, it is possible that
     213             :  * the page has changed in the meanwhile.  If this happens, we're
     214             :  * guaranteed that the page has "split right" -- that is, that any
     215             :  * data that appeared on the page originally is either on the page
     216             :  * or strictly to the right of it.
     217             :  *
     218             :  * This routine decides whether or not we need to move right in the
     219             :  * tree by examining the high key entry on the page.  If that entry is
     220             :  * strictly less than the scankey, or <= the scankey in the
     221             :  * key.nextkey=true case, then we followed the wrong link and we need
     222             :  * to move right.
     223             :  *
     224             :  * The passed insertion-type scankey can omit the rightmost column(s) of the
     225             :  * index. (see nbtree/README)
     226             :  *
     227             :  * When key.nextkey is false (the usual case), we are looking for the first
     228             :  * item >= key.  When key.nextkey is true, we are looking for the first item
     229             :  * strictly greater than key.
     230             :  *
     231             :  * If forupdate is true, we will attempt to finish any incomplete splits
     232             :  * that we encounter.  This is required when locking a target page for an
     233             :  * insertion, because we don't allow inserting on a page before the split
     234             :  * is completed.  'stack' is only used if forupdate is true.
     235             :  *
     236             :  * On entry, we have the buffer pinned and a lock of the type specified by
     237             :  * 'access'.  If we move right, we release the buffer and lock and acquire
     238             :  * the same on the right sibling.  Return value is the buffer we stop at.
     239             :  *
     240             :  * If the snapshot parameter is not NULL, "old snapshot" checking will take
     241             :  * place during the descent through the tree.  This is not needed when
     242             :  * positioning for an insert or delete, so NULL is used for those cases.
     243             :  */
     244             : Buffer
     245    42434970 : _bt_moveright(Relation rel,
     246             :               BTScanInsert key,
     247             :               Buffer buf,
     248             :               bool forupdate,
     249             :               BTStack stack,
     250             :               int access,
     251             :               Snapshot snapshot)
     252             : {
     253             :     Page        page;
     254             :     BTPageOpaque opaque;
     255             :     int32       cmpval;
     256             : 
     257             :     /*
     258             :      * When nextkey = false (normal case): if the scan key that brought us to
     259             :      * this page is > the high key stored on the page, then the page has split
     260             :      * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     261             :      * have some duplicates to the right as well as the left, but that's
     262             :      * something that's only ever dealt with on the leaf level, after
     263             :      * _bt_search has found an initial leaf page.)
     264             :      *
     265             :      * When nextkey = true: move right if the scan key is >= page's high key.
     266             :      * (Note that key.scantid cannot be set in this case.)
     267             :      *
     268             :      * The page could even have split more than once, so scan as far as
     269             :      * needed.
     270             :      *
     271             :      * We also have to move right if we followed a link that brought us to a
     272             :      * dead page.
     273             :      */
     274    42434970 :     cmpval = key->nextkey ? 0 : 1;
     275             : 
     276             :     for (;;)
     277             :     {
     278    42436052 :         page = BufferGetPage(buf);
     279    42436052 :         TestForOldSnapshot(snapshot, rel, page);
     280    42436052 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     281             : 
     282    42436052 :         if (P_RIGHTMOST(opaque))
     283    31715232 :             break;
     284             : 
     285             :         /*
     286             :          * Finish any incomplete splits we encounter along the way.
     287             :          */
     288    10720820 :         if (forupdate && P_INCOMPLETE_SPLIT(opaque))
     289             :         {
     290           0 :             BlockNumber blkno = BufferGetBlockNumber(buf);
     291             : 
     292             :             /* upgrade our lock if necessary */
     293           0 :             if (access == BT_READ)
     294             :             {
     295           0 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     296           0 :                 LockBuffer(buf, BT_WRITE);
     297             :             }
     298             : 
     299           0 :             if (P_INCOMPLETE_SPLIT(opaque))
     300           0 :                 _bt_finish_split(rel, buf, stack);
     301             :             else
     302           0 :                 _bt_relbuf(rel, buf);
     303             : 
     304             :             /* re-acquire the lock in the right mode, and re-check */
     305           0 :             buf = _bt_getbuf(rel, blkno, access);
     306           0 :             continue;
     307             :         }
     308             : 
     309    10720820 :         if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
     310             :         {
     311             :             /* step right one page */
     312        1082 :             buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
     313        1082 :             continue;
     314             :         }
     315             :         else
     316             :             break;
     317             :     }
     318             : 
     319    42434970 :     if (P_IGNORE(opaque))
     320           0 :         elog(ERROR, "fell off the end of index \"%s\"",
     321             :              RelationGetRelationName(rel));
     322             : 
     323    42434970 :     return buf;
     324             : }
     325             : 
     326             : /*
     327             :  *  _bt_binsrch() -- Do a binary search for a key on a particular page.
     328             :  *
     329             :  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
     330             :  * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
     331             :  * particular, this means it is possible to return a value 1 greater than the
     332             :  * number of keys on the page, if the scankey is > all keys on the page.)
     333             :  *
     334             :  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
     335             :  * of the last key < given scankey, or last key <= given scankey if nextkey
     336             :  * is true.  (Since _bt_compare treats the first data key of such a page as
     337             :  * minus infinity, there will be at least one key < scankey, so the result
     338             :  * always points at one of the keys on the page.)  This key indicates the
     339             :  * right place to descend to be sure we find all leaf keys >= given scankey
     340             :  * (or leaf keys > given scankey when nextkey is true).
     341             :  *
     342             :  * This procedure is not responsible for walking right, it just examines
     343             :  * the given page.  _bt_binsrch() has no lock or refcount side effects
     344             :  * on the buffer.
     345             :  */
     346             : static OffsetNumber
     347    29927476 : _bt_binsrch(Relation rel,
     348             :             BTScanInsert key,
     349             :             Buffer buf)
     350             : {
     351             :     Page        page;
     352             :     BTPageOpaque opaque;
     353             :     OffsetNumber low,
     354             :                 high;
     355             :     int32       result,
     356             :                 cmpval;
     357             : 
     358    29927476 :     page = BufferGetPage(buf);
     359    29927476 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     360             : 
     361             :     /* Requesting nextkey semantics while using scantid seems nonsensical */
     362             :     Assert(!key->nextkey || key->scantid == NULL);
     363             :     /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
     364             :     Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
     365             : 
     366    29927476 :     low = P_FIRSTDATAKEY(opaque);
     367    29927476 :     high = PageGetMaxOffsetNumber(page);
     368             : 
     369             :     /*
     370             :      * If there are no keys on the page, return the first available slot. Note
     371             :      * this covers two cases: the page is really empty (no keys), or it
     372             :      * contains only a high key.  The latter case is possible after vacuuming.
     373             :      * This can never happen on an internal page, however, since they are
     374             :      * never empty (an internal page must have children).
     375             :      */
     376    29927476 :     if (unlikely(high < low))
     377          90 :         return low;
     378             : 
     379             :     /*
     380             :      * Binary search to find the first key on the page >= scan key, or first
     381             :      * key > scankey when nextkey is true.
     382             :      *
     383             :      * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     384             :      * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     385             :      *
     386             :      * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     387             :      * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     388             :      *
     389             :      * We can fall out when high == low.
     390             :      */
     391    29927386 :     high++;                     /* establish the loop invariant for high */
     392             : 
     393    29927386 :     cmpval = key->nextkey ? 0 : 1;   /* select comparison value */
     394             : 
     395   190502940 :     while (high > low)
     396             :     {
     397   160575554 :         OffsetNumber mid = low + ((high - low) / 2);
     398             : 
     399             :         /* We have low <= mid < high, so mid points at a real slot */
     400             : 
     401   160575554 :         result = _bt_compare(rel, key, page, mid);
     402             : 
     403   160575554 :         if (result >= cmpval)
     404   100120310 :             low = mid + 1;
     405             :         else
     406    60455244 :             high = mid;
     407             :     }
     408             : 
     409             :     /*
     410             :      * At this point we have high == low, but be careful: they could point
     411             :      * past the last slot on the page.
     412             :      *
     413             :      * On a leaf page, we always return the first key >= scan key (resp. >
     414             :      * scan key), which could be the last slot + 1.
     415             :      */
     416    29927386 :     if (P_ISLEAF(opaque))
     417    10786298 :         return low;
     418             : 
     419             :     /*
     420             :      * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     421             :      * There must be one if _bt_compare() is playing by the rules.
     422             :      */
     423             :     Assert(low > P_FIRSTDATAKEY(opaque));
     424             : 
     425    19141088 :     return OffsetNumberPrev(low);
     426             : }
     427             : 
     428             : /*
     429             :  *
     430             :  *  _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
     431             :  *
     432             :  * Like _bt_binsrch(), but with support for caching the binary search
     433             :  * bounds.  Only used during insertion, and only on the leaf page that it
     434             :  * looks like caller will insert tuple on.  Exclusive-locked and pinned
     435             :  * leaf page is contained within insertstate.
     436             :  *
     437             :  * Caches the bounds fields in insertstate so that a subsequent call can
     438             :  * reuse the low and strict high bounds of original binary search.  Callers
     439             :  * that use these fields directly must be prepared for the case where low
     440             :  * and/or stricthigh are not on the same page (one or both exceed maxoff
     441             :  * for the page).  The case where there are no items on the page (high <
     442             :  * low) makes bounds invalid.
     443             :  *
     444             :  * Caller is responsible for invalidating bounds when it modifies the page
     445             :  * before calling here a second time, and for dealing with posting list
     446             :  * tuple matches (callers can use insertstate's postingoff field to
     447             :  * determine which existing heap TID will need to be replaced by a posting
     448             :  * list split).
     449             :  */
     450             : OffsetNumber
     451    16204044 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
     452             : {
     453    16204044 :     BTScanInsert key = insertstate->itup_key;
     454             :     Page        page;
     455             :     BTPageOpaque opaque;
     456             :     OffsetNumber low,
     457             :                 high,
     458             :                 stricthigh;
     459             :     int32       result,
     460             :                 cmpval;
     461             : 
     462    16204044 :     page = BufferGetPage(insertstate->buf);
     463    16204044 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     464             : 
     465             :     Assert(P_ISLEAF(opaque));
     466             :     Assert(!key->nextkey);
     467             :     Assert(insertstate->postingoff == 0);
     468             : 
     469    16204044 :     if (!insertstate->bounds_valid)
     470             :     {
     471             :         /* Start new binary search */
     472    11493600 :         low = P_FIRSTDATAKEY(opaque);
     473    11493600 :         high = PageGetMaxOffsetNumber(page);
     474             :     }
     475             :     else
     476             :     {
     477             :         /* Restore result of previous binary search against same page */
     478     4710444 :         low = insertstate->low;
     479     4710444 :         high = insertstate->stricthigh;
     480             :     }
     481             : 
     482             :     /* If there are no keys on the page, return the first available slot */
     483    16204044 :     if (unlikely(high < low))
     484             :     {
     485             :         /* Caller can't reuse bounds */
     486       22416 :         insertstate->low = InvalidOffsetNumber;
     487       22416 :         insertstate->stricthigh = InvalidOffsetNumber;
     488       22416 :         insertstate->bounds_valid = false;
     489       22416 :         return low;
     490             :     }
     491             : 
     492             :     /*
     493             :      * Binary search to find the first key on the page >= scan key. (nextkey
     494             :      * is always false when inserting).
     495             :      *
     496             :      * The loop invariant is: all slots before 'low' are < scan key, all slots
     497             :      * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     498             :      * maintained to save additional search effort for caller.
     499             :      *
     500             :      * We can fall out when high == low.
     501             :      */
     502    16181628 :     if (!insertstate->bounds_valid)
     503    11471184 :         high++;                 /* establish the loop invariant for high */
     504    16181628 :     stricthigh = high;          /* high initially strictly higher */
     505             : 
     506    16181628 :     cmpval = 1;                 /* !nextkey comparison value */
     507             : 
     508    97122172 :     while (high > low)
     509             :     {
     510    80940544 :         OffsetNumber mid = low + ((high - low) / 2);
     511             : 
     512             :         /* We have low <= mid < high, so mid points at a real slot */
     513             : 
     514    80940544 :         result = _bt_compare(rel, key, page, mid);
     515             : 
     516    80940544 :         if (result >= cmpval)
     517    63915084 :             low = mid + 1;
     518             :         else
     519             :         {
     520    17025460 :             high = mid;
     521    17025460 :             if (result != 0)
     522    16253034 :                 stricthigh = high;
     523             :         }
     524             : 
     525             :         /*
     526             :          * If tuple at offset located by binary search is a posting list whose
     527             :          * TID range overlaps with caller's scantid, perform posting list
     528             :          * binary search to set postingoff for caller.  Caller must split the
     529             :          * posting list when postingoff is set.  This should happen
     530             :          * infrequently.
     531             :          */
     532    80940544 :         if (unlikely(result == 0 && key->scantid != NULL))
     533      404022 :             insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
     534             :     }
     535             : 
     536             :     /*
     537             :      * On a leaf page, a binary search always returns the first key >= scan
     538             :      * key (at least in !nextkey case), which could be the last slot + 1. This
     539             :      * is also the lower bound of cached search.
     540             :      *
     541             :      * stricthigh may also be the last slot + 1, which prevents caller from
     542             :      * using bounds directly, but is still useful to us if we're called a
     543             :      * second time with cached bounds (cached low will be < stricthigh when
     544             :      * that happens).
     545             :      */
     546    16181628 :     insertstate->low = low;
     547    16181628 :     insertstate->stricthigh = stricthigh;
     548    16181628 :     insertstate->bounds_valid = true;
     549             : 
     550    16181628 :     return low;
     551             : }
     552             : 
     553             : /*----------
     554             :  *  _bt_binsrch_posting() -- posting list binary search.
     555             :  *
     556             :  * Helper routine for _bt_binsrch_insert().
     557             :  *
     558             :  * Returns offset into posting list where caller's scantid belongs.
     559             :  *----------
     560             :  */
     561             : static int
     562      404022 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
     563             : {
     564             :     IndexTuple  itup;
     565             :     ItemId      itemid;
     566             :     int         low,
     567             :                 high,
     568             :                 mid,
     569             :                 res;
     570             : 
     571             :     /*
     572             :      * If this isn't a posting tuple, then the index must be corrupt (if it is
     573             :      * an ordinary non-pivot tuple then there must be an existing tuple with a
     574             :      * heap TID that equals inserter's new heap TID/scantid).  Defensively
     575             :      * check that tuple is a posting list tuple whose posting list range
     576             :      * includes caller's scantid.
     577             :      *
     578             :      * (This is also needed because contrib/amcheck's rootdescend option needs
     579             :      * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     580             :      */
     581      404022 :     itemid = PageGetItemId(page, offnum);
     582      404022 :     itup = (IndexTuple) PageGetItem(page, itemid);
     583      404022 :     if (!BTreeTupleIsPosting(itup))
     584      400000 :         return 0;
     585             : 
     586             :     Assert(key->heapkeyspace && key->allequalimage);
     587             : 
     588             :     /*
     589             :      * In the event that posting list tuple has LP_DEAD bit set, indicate this
     590             :      * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     591             :      * second call to _bt_binsrch_insert() can take place when its caller has
     592             :      * removed the dead item.
     593             :      */
     594        4022 :     if (ItemIdIsDead(itemid))
     595           0 :         return -1;
     596             : 
     597             :     /* "high" is past end of posting list for loop invariant */
     598        4022 :     low = 0;
     599        4022 :     high = BTreeTupleGetNPosting(itup);
     600             :     Assert(high >= 2);
     601             : 
     602       31018 :     while (high > low)
     603             :     {
     604       26996 :         mid = low + ((high - low) / 2);
     605       26996 :         res = ItemPointerCompare(key->scantid,
     606             :                                  BTreeTupleGetPostingN(itup, mid));
     607             : 
     608       26996 :         if (res > 0)
     609       12048 :             low = mid + 1;
     610       14948 :         else if (res < 0)
     611       14948 :             high = mid;
     612             :         else
     613           0 :             return mid;
     614             :     }
     615             : 
     616             :     /* Exact match not found */
     617        4022 :     return low;
     618             : }
     619             : 
     620             : /*----------
     621             :  *  _bt_compare() -- Compare insertion-type scankey to tuple on a page.
     622             :  *
     623             :  *  page/offnum: location of btree item to be compared to.
     624             :  *
     625             :  *      This routine returns:
     626             :  *          <0 if scankey < tuple at offnum;
     627             :  *           0 if scankey == tuple at offnum;
     628             :  *          >0 if scankey > tuple at offnum.
     629             :  *
     630             :  * NULLs in the keys are treated as sortable values.  Therefore
     631             :  * "equality" does not necessarily mean that the item should be returned
     632             :  * to the caller as a matching key.  Similarly, an insertion scankey
     633             :  * with its scantid set is treated as equal to a posting tuple whose TID
     634             :  * range overlaps with their scantid.  There generally won't be a
     635             :  * matching TID in the posting tuple, which caller must handle
     636             :  * themselves (e.g., by splitting the posting list tuple).
     637             :  *
     638             :  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
     639             :  * "minus infinity": this routine will always claim it is less than the
     640             :  * scankey.  The actual key value stored is explicitly truncated to 0
     641             :  * attributes (explicitly minus infinity) with version 3+ indexes, but
     642             :  * that isn't relied upon.  This allows us to implement the Lehman and
     643             :  * Yao convention that the first down-link pointer is before the first
     644             :  * key.  See backend/access/nbtree/README for details.
     645             :  *----------
     646             :  */
     647             : int32
     648   262655086 : _bt_compare(Relation rel,
     649             :             BTScanInsert key,
     650             :             Page page,
     651             :             OffsetNumber offnum)
     652             : {
     653   262655086 :     TupleDesc   itupdesc = RelationGetDescr(rel);
     654   262655086 :     BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     655             :     IndexTuple  itup;
     656             :     ItemPointer heapTid;
     657             :     ScanKey     scankey;
     658             :     int         ncmpkey;
     659             :     int         ntupatts;
     660             :     int32       result;
     661             : 
     662             :     Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
     663             :     Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
     664             :     Assert(key->heapkeyspace || key->scantid == NULL);
     665             : 
     666             :     /*
     667             :      * Force result ">" if target item is first data item on an internal page
     668             :      * --- see NOTE above.
     669             :      */
     670   262655086 :     if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
     671     2525106 :         return 1;
     672             : 
     673   260129980 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     674   260129980 :     ntupatts = BTreeTupleGetNAtts(itup, rel);
     675             : 
     676             :     /*
     677             :      * The scan key is set up with the attribute number associated with each
     678             :      * term in the key.  It is important that, if the index is multi-key, the
     679             :      * scan contain the first k key attributes, and that they be in order.  If
     680             :      * you think about how multi-key ordering works, you'll understand why
     681             :      * this is.
     682             :      *
     683             :      * We don't test for violation of this condition here, however.  The
     684             :      * initial setup for the index scan had better have gotten it right (see
     685             :      * _bt_first).
     686             :      */
     687             : 
     688   260129980 :     ncmpkey = Min(ntupatts, key->keysz);
     689             :     Assert(key->heapkeyspace || ncmpkey == key->keysz);
     690             :     Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
     691   260129980 :     scankey = key->scankeys;
     692   428517956 :     for (int i = 1; i <= ncmpkey; i++)
     693             :     {
     694             :         Datum       datum;
     695             :         bool        isNull;
     696             : 
     697   387322412 :         datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
     698             : 
     699   387322412 :         if (scankey->sk_flags & SK_ISNULL)   /* key is NULL */
     700             :         {
     701      206654 :             if (isNull)
     702          84 :                 result = 0;     /* NULL "=" NULL */
     703      206570 :             else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     704         176 :                 result = -1;    /* NULL "<" NOT_NULL */
     705             :             else
     706      206394 :                 result = 1;     /* NULL ">" NOT_NULL */
     707             :         }
     708   387115758 :         else if (isNull)        /* key is NOT_NULL and item is NULL */
     709             :         {
     710         132 :             if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     711           0 :                 result = 1;     /* NOT_NULL ">" NULL */
     712             :             else
     713         132 :                 result = -1;    /* NOT_NULL "<" NULL */
     714             :         }
     715             :         else
     716             :         {
     717             :             /*
     718             :              * The sk_func needs to be passed the index value as left arg and
     719             :              * the sk_argument as right arg (they might be of different
     720             :              * types).  Since it is convenient for callers to think of
     721             :              * _bt_compare as comparing the scankey to the index item, we have
     722             :              * to flip the sign of the comparison result.  (Unless it's a DESC
     723             :              * column, in which case we *don't* flip the sign.)
     724             :              */
     725   387115626 :             result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
     726             :                                                      scankey->sk_collation,
     727             :                                                      datum,
     728             :                                                      scankey->sk_argument));
     729             : 
     730   387115626 :             if (!(scankey->sk_flags & SK_BT_DESC))
     731   387115622 :                 INVERT_COMPARE_RESULT(result);
     732             :         }
     733             : 
     734             :         /* if the keys are unequal, return the difference */
     735   387322412 :         if (result != 0)
     736   218934436 :             return result;
     737             : 
     738   168387976 :         scankey++;
     739             :     }
     740             : 
     741             :     /*
     742             :      * All non-truncated attributes (other than heap TID) were found to be
     743             :      * equal.  Treat truncated attributes as minus infinity when scankey has a
     744             :      * key attribute value that would otherwise be compared directly.
     745             :      *
     746             :      * Note: it doesn't matter if ntupatts includes non-key attributes;
     747             :      * scankey won't, so explicitly excluding non-key attributes isn't
     748             :      * necessary.
     749             :      */
     750    41195544 :     if (key->keysz > ntupatts)
     751      516328 :         return 1;
     752             : 
     753             :     /*
     754             :      * Use the heap TID attribute and scantid to try to break the tie.  The
     755             :      * rules are the same as any other key attribute -- only the
     756             :      * representation differs.
     757             :      */
     758    40679216 :     heapTid = BTreeTupleGetHeapTID(itup);
     759    40679216 :     if (key->scantid == NULL)
     760             :     {
     761             :         /*
     762             :          * Most searches have a scankey that is considered greater than a
     763             :          * truncated pivot tuple if and when the scankey has equal values for
     764             :          * attributes up to and including the least significant untruncated
     765             :          * attribute in tuple.
     766             :          *
     767             :          * For example, if an index has the minimum two attributes (single
     768             :          * user key attribute, plus heap TID attribute), and a page's high key
     769             :          * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
     770             :          * will not descend to the page to the left.  The search will descend
     771             :          * right instead.  The truncated attribute in pivot tuple means that
     772             :          * all non-pivot tuples on the page to the left are strictly < 'foo',
     773             :          * so it isn't necessary to descend left.  In other words, search
     774             :          * doesn't have to descend left because it isn't interested in a match
     775             :          * that has a heap TID value of -inf.
     776             :          *
     777             :          * However, some searches (pivotsearch searches) actually require that
     778             :          * we descend left when this happens.  -inf is treated as a possible
     779             :          * match for omitted scankey attribute(s).  This is needed by page
     780             :          * deletion, which must re-find leaf pages that are targets for
     781             :          * deletion using their high keys.
     782             :          *
     783             :          * Note: the heap TID part of the test ensures that scankey is being
     784             :          * compared to a pivot tuple with one or more truncated key
     785             :          * attributes.
     786             :          *
     787             :          * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
     788             :          * left here, since they have no heap TID attribute (and cannot have
     789             :          * any -inf key values in any case, since truncation can only remove
     790             :          * non-key attributes).  !heapkeyspace searches must always be
     791             :          * prepared to deal with matches on both sides of the pivot once the
     792             :          * leaf level is reached.
     793             :          */
     794    15519666 :         if (key->heapkeyspace && !key->pivotsearch &&
     795    15508468 :             key->keysz == ntupatts && heapTid == NULL)
     796      158942 :             return 1;
     797             : 
     798             :         /* All provided scankey arguments found to be equal */
     799    15360724 :         return 0;
     800             :     }
     801             : 
     802             :     /*
     803             :      * Treat truncated heap TID as minus infinity, since scankey has a key
     804             :      * attribute value (scantid) that would otherwise be compared directly
     805             :      */
     806             :     Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
     807    25159550 :     if (heapTid == NULL)
     808        4792 :         return 1;
     809             : 
     810             :     /*
     811             :      * Scankey must be treated as equal to a posting list tuple if its scantid
     812             :      * value falls within the range of the posting list.  In all other cases
     813             :      * there can only be a single heap TID value, which is compared directly
     814             :      * with scantid.
     815             :      */
     816             :     Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
     817    25154758 :     result = ItemPointerCompare(key->scantid, heapTid);
     818    25154758 :     if (result <= 0 || !BTreeTupleIsPosting(itup))
     819    25092104 :         return result;
     820             :     else
     821             :     {
     822       62654 :         result = ItemPointerCompare(key->scantid,
     823             :                                     BTreeTupleGetMaxHeapTID(itup));
     824       62654 :         if (result > 0)
     825       58632 :             return 1;
     826             :     }
     827             : 
     828        4022 :     return 0;
     829             : }
     830             : 
     831             : /*
     832             :  *  _bt_first() -- Find the first item in a scan.
     833             :  *
     834             :  *      We need to be clever about the direction of scan, the search
     835             :  *      conditions, and the tree ordering.  We find the first item (or,
     836             :  *      if backwards scan, the last item) in the tree that satisfies the
     837             :  *      qualifications in the scan key.  On success exit, the page containing
     838             :  *      the current index tuple is pinned but not locked, and data about
     839             :  *      the matching tuple(s) on the page has been loaded into so->currPos.
     840             :  *      scan->xs_ctup.t_self is set to the heap TID of the current tuple,
     841             :  *      and if requested, scan->xs_itup points to a copy of the index tuple.
     842             :  *
     843             :  * If there are no matching items in the index, we return false, with no
     844             :  * pins or locks held.
     845             :  *
     846             :  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
     847             :  * are both search-type scankeys (see nbtree/README for more about this).
     848             :  * Within this routine, we build a temporary insertion-type scankey to use
     849             :  * in locating the scan start position.
     850             :  */
     851             : bool
     852    11187880 : _bt_first(IndexScanDesc scan, ScanDirection dir)
     853             : {
     854    11187880 :     Relation    rel = scan->indexRelation;
     855    11187880 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     856             :     Buffer      buf;
     857             :     BTStack     stack;
     858             :     OffsetNumber offnum;
     859             :     StrategyNumber strat;
     860             :     bool        nextkey;
     861             :     bool        goback;
     862             :     BTScanInsertData inskey;
     863             :     ScanKey     startKeys[INDEX_MAX_KEYS];
     864             :     ScanKeyData notnullkeys[INDEX_MAX_KEYS];
     865    11187880 :     int         keysCount = 0;
     866             :     int         i;
     867    11187880 :     bool        status = true;
     868             :     StrategyNumber strat_total;
     869             :     BTScanPosItem *currItem;
     870             :     BlockNumber blkno;
     871             : 
     872             :     Assert(!BTScanPosIsValid(so->currPos));
     873             : 
     874    11187880 :     pgstat_count_index_scan(rel);
     875             : 
     876             :     /*
     877             :      * Examine the scan keys and eliminate any redundant keys; also mark the
     878             :      * keys that must be matched to continue the scan.
     879             :      */
     880    11187880 :     _bt_preprocess_keys(scan);
     881             : 
     882             :     /*
     883             :      * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     884             :      * never be satisfied (eg, x == 1 AND x > 2).
     885             :      */
     886    11187880 :     if (!so->qual_ok)
     887         906 :         return false;
     888             : 
     889             :     /*
     890             :      * For parallel scans, get the starting page from shared state. If the
     891             :      * scan has not started, proceed to find out first leaf page in the usual
     892             :      * way while keeping other participating processes waiting.  If the scan
     893             :      * has already begun, use the page number from the shared structure.
     894             :      */
     895    11186974 :     if (scan->parallel_scan != NULL)
     896             :     {
     897         240 :         status = _bt_parallel_seize(scan, &blkno);
     898         240 :         if (!status)
     899         192 :             return false;
     900          48 :         else if (blkno == P_NONE)
     901             :         {
     902           0 :             _bt_parallel_done(scan);
     903           0 :             return false;
     904             :         }
     905          48 :         else if (blkno != InvalidBlockNumber)
     906             :         {
     907           0 :             if (!_bt_parallel_readpage(scan, blkno, dir))
     908           0 :                 return false;
     909           0 :             goto readcomplete;
     910             :         }
     911             :     }
     912             : 
     913             :     /*----------
     914             :      * Examine the scan keys to discover where we need to start the scan.
     915             :      *
     916             :      * We want to identify the keys that can be used as starting boundaries;
     917             :      * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     918             :      * a backwards scan.  We can use keys for multiple attributes so long as
     919             :      * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     920             :      * a > or < boundary or find an attribute with no boundary (which can be
     921             :      * thought of as the same as "> -infinity"), we can't use keys for any
     922             :      * attributes to its right, because it would break our simplistic notion
     923             :      * of what initial positioning strategy to use.
     924             :      *
     925             :      * When the scan keys include cross-type operators, _bt_preprocess_keys
     926             :      * may not be able to eliminate redundant keys; in such cases we will
     927             :      * arbitrarily pick a usable one for each attribute.  This is correct
     928             :      * but possibly not optimal behavior.  (For example, with keys like
     929             :      * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     930             :      * x=5 would be more efficient.)  Since the situation only arises given
     931             :      * a poorly-worded query plus an incomplete opfamily, live with it.
     932             :      *
     933             :      * When both equality and inequality keys appear for a single attribute
     934             :      * (again, only possible when cross-type operators appear), we *must*
     935             :      * select one of the equality keys for the starting point, because
     936             :      * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     937             :      * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     938             :      * start at x=4, we will fail and stop before reaching x=10.  If multiple
     939             :      * equality quals survive preprocessing, however, it doesn't matter which
     940             :      * one we use --- by definition, they are either redundant or
     941             :      * contradictory.
     942             :      *
     943             :      * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     944             :      * If the index stores nulls at the end of the index we'll be starting
     945             :      * from, and we have no boundary key for the column (which means the key
     946             :      * we deduced NOT NULL from is an inequality key that constrains the other
     947             :      * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     948             :      * use as a boundary key.  If we didn't do this, we might find ourselves
     949             :      * traversing a lot of null entries at the start of the scan.
     950             :      *
     951             :      * In this loop, row-comparison keys are treated the same as keys on their
     952             :      * first (leftmost) columns.  We'll add on lower-order columns of the row
     953             :      * comparison below, if possible.
     954             :      *
     955             :      * The selected scan keys (at most one per index column) are remembered by
     956             :      * storing their addresses into the local startKeys[] array.
     957             :      *----------
     958             :      */
     959    11186782 :     strat_total = BTEqualStrategyNumber;
     960    11186782 :     if (so->numberOfKeys > 0)
     961             :     {
     962             :         AttrNumber  curattr;
     963             :         ScanKey     chosen;
     964             :         ScanKey     impliesNN;
     965             :         ScanKey     cur;
     966             : 
     967             :         /*
     968             :          * chosen is the so-far-chosen key for the current attribute, if any.
     969             :          * We don't cast the decision in stone until we reach keys for the
     970             :          * next attribute.
     971             :          */
     972    11181400 :         curattr = 1;
     973    11181400 :         chosen = NULL;
     974             :         /* Also remember any scankey that implies a NOT NULL constraint */
     975    11181400 :         impliesNN = NULL;
     976             : 
     977             :         /*
     978             :          * Loop iterates from 0 to numberOfKeys inclusive; we use the last
     979             :          * pass to handle after-last-key processing.  Actual exit from the
     980             :          * loop is at one of the "break" statements below.
     981             :          */
     982    11181400 :         for (cur = so->keyData, i = 0;; cur++, i++)
     983             :         {
     984    30397010 :             if (i >= so->numberOfKeys || cur->sk_attno != curattr)
     985             :             {
     986             :                 /*
     987             :                  * Done looking at keys for curattr.  If we didn't find a
     988             :                  * usable boundary key, see if we can deduce a NOT NULL key.
     989             :                  */
     990    19257702 :                 if (chosen == NULL && impliesNN != NULL &&
     991       42538 :                     ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
     992             :                      ScanDirectionIsForward(dir) :
     993             :                      ScanDirectionIsBackward(dir)))
     994             :                 {
     995             :                     /* Yes, so build the key in notnullkeys[keysCount] */
     996           4 :                     chosen = &notnullkeys[keysCount];
     997           4 :                     ScanKeyEntryInitialize(chosen,
     998             :                                            (SK_SEARCHNOTNULL | SK_ISNULL |
     999           4 :                                             (impliesNN->sk_flags &
    1000             :                                              (SK_BT_DESC | SK_BT_NULLS_FIRST))),
    1001             :                                            curattr,
    1002           4 :                                            ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1003             :                                             BTGreaterStrategyNumber :
    1004             :                                             BTLessStrategyNumber),
    1005             :                                            InvalidOid,
    1006             :                                            InvalidOid,
    1007             :                                            InvalidOid,
    1008             :                                            (Datum) 0);
    1009             :                 }
    1010             : 
    1011             :                 /*
    1012             :                  * If we still didn't find a usable boundary key, quit; else
    1013             :                  * save the boundary key pointer in startKeys.
    1014             :                  */
    1015    19215164 :                 if (chosen == NULL)
    1016       42714 :                     break;
    1017    19172450 :                 startKeys[keysCount++] = chosen;
    1018             : 
    1019             :                 /*
    1020             :                  * Adjust strat_total, and quit if we have stored a > or <
    1021             :                  * key.
    1022             :                  */
    1023    19172450 :                 strat = chosen->sk_strategy;
    1024    19172450 :                 if (strat != BTEqualStrategyNumber)
    1025             :                 {
    1026      764764 :                     strat_total = strat;
    1027      764764 :                     if (strat == BTGreaterStrategyNumber ||
    1028             :                         strat == BTLessStrategyNumber)
    1029             :                         break;
    1030             :                 }
    1031             : 
    1032             :                 /*
    1033             :                  * Done if that was the last attribute, or if next key is not
    1034             :                  * in sequence (implying no boundary key is available for the
    1035             :                  * next attribute).
    1036             :                  */
    1037    18410266 :                 if (i >= so->numberOfKeys ||
    1038     8034554 :                     cur->sk_attno != curattr + 1)
    1039             :                     break;
    1040             : 
    1041             :                 /*
    1042             :                  * Reset for next attr.
    1043             :                  */
    1044     8033764 :                 curattr = cur->sk_attno;
    1045     8033764 :                 chosen = NULL;
    1046     8033764 :                 impliesNN = NULL;
    1047             :             }
    1048             : 
    1049             :             /*
    1050             :              * Can we use this key as a starting boundary for this attr?
    1051             :              *
    1052             :              * If not, does it imply a NOT NULL constraint?  (Because
    1053             :              * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
    1054             :              * *any* inequality key works for that; we need not test.)
    1055             :              */
    1056    19215610 :             switch (cur->sk_strategy)
    1057             :             {
    1058       70954 :                 case BTLessStrategyNumber:
    1059             :                 case BTLessEqualStrategyNumber:
    1060       70954 :                     if (chosen == NULL)
    1061             :                     {
    1062       70336 :                         if (ScanDirectionIsBackward(dir))
    1063       27810 :                             chosen = cur;
    1064             :                         else
    1065       42526 :                             impliesNN = cur;
    1066             :                     }
    1067       70954 :                     break;
    1068    18407686 :                 case BTEqualStrategyNumber:
    1069             :                     /* override any non-equality choice */
    1070    18407686 :                     chosen = cur;
    1071    18407686 :                     break;
    1072      736970 :                 case BTGreaterEqualStrategyNumber:
    1073             :                 case BTGreaterStrategyNumber:
    1074      736970 :                     if (chosen == NULL)
    1075             :                     {
    1076      736970 :                         if (ScanDirectionIsForward(dir))
    1077      736950 :                             chosen = cur;
    1078             :                         else
    1079          20 :                             impliesNN = cur;
    1080             :                     }
    1081      736970 :                     break;
    1082             :             }
    1083    19215610 :         }
    1084             :     }
    1085             : 
    1086             :     /*
    1087             :      * If we found no usable boundary keys, we have to start from one end of
    1088             :      * the tree.  Walk down that edge to the first or last key, and scan from
    1089             :      * there.
    1090             :      */
    1091    11186782 :     if (keysCount == 0)
    1092             :     {
    1093             :         bool        match;
    1094             : 
    1095       47946 :         match = _bt_endpoint(scan, dir);
    1096             : 
    1097       47946 :         if (!match)
    1098             :         {
    1099             :             /* No match, so mark (parallel) scan finished */
    1100        3346 :             _bt_parallel_done(scan);
    1101             :         }
    1102             : 
    1103       47946 :         return match;
    1104             :     }
    1105             : 
    1106             :     /*
    1107             :      * We want to start the scan somewhere within the index.  Set up an
    1108             :      * insertion scankey we can use to search for the boundary point we
    1109             :      * identified above.  The insertion scankey is built using the keys
    1110             :      * identified by startKeys[].  (Remaining insertion scankey fields are
    1111             :      * initialized after initial-positioning strategy is finalized.)
    1112             :      */
    1113             :     Assert(keysCount <= INDEX_MAX_KEYS);
    1114    30311270 :     for (i = 0; i < keysCount; i++)
    1115             :     {
    1116    19172450 :         ScanKey     cur = startKeys[i];
    1117             : 
    1118             :         Assert(cur->sk_attno == i + 1);
    1119             : 
    1120    19172450 :         if (cur->sk_flags & SK_ROW_HEADER)
    1121             :         {
    1122             :             /*
    1123             :              * Row comparison header: look to the first row member instead.
    1124             :              *
    1125             :              * The member scankeys are already in insertion format (ie, they
    1126             :              * have sk_func = 3-way-comparison function), but we have to watch
    1127             :              * out for nulls, which _bt_preprocess_keys didn't check. A null
    1128             :              * in the first row member makes the condition unmatchable, just
    1129             :              * like qual_ok = false.
    1130             :              */
    1131          16 :             ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
    1132             : 
    1133             :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1134          16 :             if (subkey->sk_flags & SK_ISNULL)
    1135             :             {
    1136           0 :                 _bt_parallel_done(scan);
    1137           0 :                 return false;
    1138             :             }
    1139          16 :             memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
    1140             : 
    1141             :             /*
    1142             :              * If the row comparison is the last positioning key we accepted,
    1143             :              * try to add additional keys from the lower-order row members.
    1144             :              * (If we accepted independent conditions on additional index
    1145             :              * columns, we use those instead --- doesn't seem worth trying to
    1146             :              * determine which is more restrictive.)  Note that this is OK
    1147             :              * even if the row comparison is of ">" or "<" type, because the
    1148             :              * condition applied to all but the last row member is effectively
    1149             :              * ">=" or "<=", and so the extra keys don't break the positioning
    1150             :              * scheme.  But, by the same token, if we aren't able to use all
    1151             :              * the row members, then the part of the row comparison that we
    1152             :              * did use has to be treated as just a ">=" or "<=" condition, and
    1153             :              * so we'd better adjust strat_total accordingly.
    1154             :              */
    1155          16 :             if (i == keysCount - 1)
    1156             :             {
    1157          16 :                 bool        used_all_subkeys = false;
    1158             : 
    1159             :                 Assert(!(subkey->sk_flags & SK_ROW_END));
    1160             :                 for (;;)
    1161             :                 {
    1162           0 :                     subkey++;
    1163             :                     Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1164          16 :                     if (subkey->sk_attno != keysCount + 1)
    1165           0 :                         break;  /* out-of-sequence, can't use it */
    1166          16 :                     if (subkey->sk_strategy != cur->sk_strategy)
    1167           0 :                         break;  /* wrong direction, can't use it */
    1168          16 :                     if (subkey->sk_flags & SK_ISNULL)
    1169           0 :                         break;  /* can't use null keys */
    1170             :                     Assert(keysCount < INDEX_MAX_KEYS);
    1171          16 :                     memcpy(inskey.scankeys + keysCount, subkey,
    1172             :                            sizeof(ScanKeyData));
    1173          16 :                     keysCount++;
    1174          16 :                     if (subkey->sk_flags & SK_ROW_END)
    1175             :                     {
    1176          16 :                         used_all_subkeys = true;
    1177          16 :                         break;
    1178             :                     }
    1179             :                 }
    1180          16 :                 if (!used_all_subkeys)
    1181             :                 {
    1182           0 :                     switch (strat_total)
    1183             :                     {
    1184           0 :                         case BTLessStrategyNumber:
    1185           0 :                             strat_total = BTLessEqualStrategyNumber;
    1186           0 :                             break;
    1187           0 :                         case BTGreaterStrategyNumber:
    1188           0 :                             strat_total = BTGreaterEqualStrategyNumber;
    1189           0 :                             break;
    1190             :                     }
    1191          16 :                 }
    1192          16 :                 break;          /* done with outer loop */
    1193             :             }
    1194             :         }
    1195             :         else
    1196             :         {
    1197             :             /*
    1198             :              * Ordinary comparison key.  Transform the search-style scan key
    1199             :              * to an insertion scan key by replacing the sk_func with the
    1200             :              * appropriate btree comparison function.
    1201             :              *
    1202             :              * If scankey operator is not a cross-type comparison, we can use
    1203             :              * the cached comparison function; otherwise gotta look it up in
    1204             :              * the catalogs.  (That can't lead to infinite recursion, since no
    1205             :              * indexscan initiated by syscache lookup will use cross-data-type
    1206             :              * operators.)
    1207             :              *
    1208             :              * We support the convention that sk_subtype == InvalidOid means
    1209             :              * the opclass input type; this is a hack to simplify life for
    1210             :              * ScanKeyInit().
    1211             :              */
    1212    19172434 :             if (cur->sk_subtype == rel->rd_opcintype[i] ||
    1213    18861864 :                 cur->sk_subtype == InvalidOid)
    1214    19165804 :             {
    1215             :                 FmgrInfo   *procinfo;
    1216             : 
    1217    19165804 :                 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
    1218    38331608 :                 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
    1219             :                                                cur->sk_flags,
    1220    19165804 :                                                cur->sk_attno,
    1221             :                                                InvalidStrategy,
    1222             :                                                cur->sk_subtype,
    1223             :                                                cur->sk_collation,
    1224             :                                                procinfo,
    1225             :                                                cur->sk_argument);
    1226             :             }
    1227             :             else
    1228             :             {
    1229             :                 RegProcedure cmp_proc;
    1230             : 
    1231       13260 :                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
    1232        6630 :                                              rel->rd_opcintype[i],
    1233             :                                              cur->sk_subtype,
    1234             :                                              BTORDER_PROC);
    1235        6630 :                 if (!RegProcedureIsValid(cmp_proc))
    1236           0 :                     elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
    1237             :                          BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
    1238             :                          cur->sk_attno, RelationGetRelationName(rel));
    1239       13260 :                 ScanKeyEntryInitialize(inskey.scankeys + i,
    1240             :                                        cur->sk_flags,
    1241        6630 :                                        cur->sk_attno,
    1242             :                                        InvalidStrategy,
    1243             :                                        cur->sk_subtype,
    1244             :                                        cur->sk_collation,
    1245             :                                        cmp_proc,
    1246             :                                        cur->sk_argument);
    1247             :             }
    1248             :         }
    1249             :     }
    1250             : 
    1251             :     /*----------
    1252             :      * Examine the selected initial-positioning strategy to determine exactly
    1253             :      * where we need to start the scan, and set flag variables to control the
    1254             :      * code below.
    1255             :      *
    1256             :      * If nextkey = false, _bt_search and _bt_binsrch will locate the first
    1257             :      * item >= scan key.  If nextkey = true, they will locate the first
    1258             :      * item > scan key.
    1259             :      *
    1260             :      * If goback = true, we will then step back one item, while if
    1261             :      * goback = false, we will start the scan on the located item.
    1262             :      *----------
    1263             :      */
    1264    11138836 :     switch (strat_total)
    1265             :     {
    1266       27814 :         case BTLessStrategyNumber:
    1267             : 
    1268             :             /*
    1269             :              * Find first item >= scankey, then back up one to arrive at last
    1270             :              * item < scankey.  (Note: this positioning strategy is only used
    1271             :              * for a backward scan, so that is always the correct starting
    1272             :              * position.)
    1273             :              */
    1274       27814 :             nextkey = false;
    1275       27814 :             goback = true;
    1276       27814 :             break;
    1277             : 
    1278           0 :         case BTLessEqualStrategyNumber:
    1279             : 
    1280             :             /*
    1281             :              * Find first item > scankey, then back up one to arrive at last
    1282             :              * item <= scankey.  (Note: this positioning strategy is only used
    1283             :              * for a backward scan, so that is always the correct starting
    1284             :              * position.)
    1285             :              */
    1286           0 :             nextkey = true;
    1287           0 :             goback = true;
    1288           0 :             break;
    1289             : 
    1290    10374072 :         case BTEqualStrategyNumber:
    1291             : 
    1292             :             /*
    1293             :              * If a backward scan was specified, need to start with last equal
    1294             :              * item not first one.
    1295             :              */
    1296    10374072 :             if (ScanDirectionIsBackward(dir))
    1297             :             {
    1298             :                 /*
    1299             :                  * This is the same as the <= strategy.  We will check at the
    1300             :                  * end whether the found item is actually =.
    1301             :                  */
    1302          88 :                 nextkey = true;
    1303          88 :                 goback = true;
    1304             :             }
    1305             :             else
    1306             :             {
    1307             :                 /*
    1308             :                  * This is the same as the >= strategy.  We will check at the
    1309             :                  * end whether the found item is actually =.
    1310             :                  */
    1311    10373984 :                 nextkey = false;
    1312    10373984 :                 goback = false;
    1313             :             }
    1314    10374072 :             break;
    1315             : 
    1316        2580 :         case BTGreaterEqualStrategyNumber:
    1317             : 
    1318             :             /*
    1319             :              * Find first item >= scankey.  (This is only used for forward
    1320             :              * scans.)
    1321             :              */
    1322        2580 :             nextkey = false;
    1323        2580 :             goback = false;
    1324        2580 :             break;
    1325             : 
    1326      734370 :         case BTGreaterStrategyNumber:
    1327             : 
    1328             :             /*
    1329             :              * Find first item > scankey.  (This is only used for forward
    1330             :              * scans.)
    1331             :              */
    1332      734370 :             nextkey = true;
    1333      734370 :             goback = false;
    1334      734370 :             break;
    1335             : 
    1336           0 :         default:
    1337             :             /* can't get here, but keep compiler quiet */
    1338           0 :             elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
    1339             :             return false;
    1340             :     }
    1341             : 
    1342             :     /* Initialize remaining insertion scan key fields */
    1343    11138836 :     _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
    1344    11138836 :     inskey.anynullkeys = false; /* unused */
    1345    11138836 :     inskey.nextkey = nextkey;
    1346    11138836 :     inskey.pivotsearch = false;
    1347    11138836 :     inskey.scantid = NULL;
    1348    11138836 :     inskey.keysz = keysCount;
    1349             : 
    1350             :     /*
    1351             :      * Use the manufactured insertion scan key to descend the tree and
    1352             :      * position ourselves on the target leaf page.
    1353             :      */
    1354    11138836 :     stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
    1355             : 
    1356             :     /* don't need to keep the stack around... */
    1357    11138836 :     _bt_freestack(stack);
    1358             : 
    1359    11138836 :     if (!BufferIsValid(buf))
    1360             :     {
    1361             :         /*
    1362             :          * We only get here if the index is completely empty. Lock relation
    1363             :          * because nothing finer to lock exists.
    1364             :          */
    1365      352448 :         PredicateLockRelation(rel, scan->xs_snapshot);
    1366             : 
    1367             :         /*
    1368             :          * mark parallel scan as done, so that all the workers can finish
    1369             :          * their scan
    1370             :          */
    1371      352448 :         _bt_parallel_done(scan);
    1372      352448 :         BTScanPosInvalidate(so->currPos);
    1373             : 
    1374      352448 :         return false;
    1375             :     }
    1376             :     else
    1377    10786388 :         PredicateLockPage(rel, BufferGetBlockNumber(buf),
    1378             :                           scan->xs_snapshot);
    1379             : 
    1380    10786388 :     _bt_initialize_more_data(so, dir);
    1381             : 
    1382             :     /* position to the precise item on the page */
    1383    10786388 :     offnum = _bt_binsrch(rel, &inskey, buf);
    1384             : 
    1385             :     /*
    1386             :      * If nextkey = false, we are positioned at the first item >= scan key, or
    1387             :      * possibly at the end of a page on which all the existing items are less
    1388             :      * than the scan key and we know that everything on later pages is greater
    1389             :      * than or equal to scan key.
    1390             :      *
    1391             :      * If nextkey = true, we are positioned at the first item > scan key, or
    1392             :      * possibly at the end of a page on which all the existing items are less
    1393             :      * than or equal to the scan key and we know that everything on later
    1394             :      * pages is greater than scan key.
    1395             :      *
    1396             :      * The actually desired starting point is either this item or the prior
    1397             :      * one, or in the end-of-page case it's the first item on the next page or
    1398             :      * the last item on this page.  Adjust the starting offset if needed. (If
    1399             :      * this results in an offset before the first item or after the last one,
    1400             :      * _bt_readpage will report no items found, and then we'll step to the
    1401             :      * next page as needed.)
    1402             :      */
    1403    10786388 :     if (goback)
    1404       27894 :         offnum = OffsetNumberPrev(offnum);
    1405             : 
    1406             :     /* remember which buffer we have pinned, if any */
    1407             :     Assert(!BTScanPosIsValid(so->currPos));
    1408    10786388 :     so->currPos.buf = buf;
    1409             : 
    1410             :     /*
    1411             :      * Now load data from the first page of the scan.
    1412             :      */
    1413    10786388 :     if (!_bt_readpage(scan, dir, offnum))
    1414             :     {
    1415             :         /*
    1416             :          * There's no actually-matching data on this page.  Try to advance to
    1417             :          * the next page.  Return false if there's no matching data at all.
    1418             :          */
    1419     2670946 :         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
    1420     2670946 :         if (!_bt_steppage(scan, dir))
    1421     2670928 :             return false;
    1422             :     }
    1423             :     else
    1424             :     {
    1425             :         /* Drop the lock, and maybe the pin, on the current page */
    1426     8115442 :         _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    1427             :     }
    1428             : 
    1429     8115460 : readcomplete:
    1430             :     /* OK, itemIndex says what to return */
    1431     8115460 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    1432     8115460 :     scan->xs_heaptid = currItem->heapTid;
    1433     8115460 :     if (scan->xs_want_itup)
    1434       41078 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1435             : 
    1436     8115460 :     return true;
    1437             : }
    1438             : 
    1439             : /*
    1440             :  *  _bt_next() -- Get the next item in a scan.
    1441             :  *
    1442             :  *      On entry, so->currPos describes the current page, which may be pinned
    1443             :  *      but is not locked, and so->currPos.itemIndex identifies which item was
    1444             :  *      previously returned.
    1445             :  *
    1446             :  *      On successful exit, scan->xs_ctup.t_self is set to the TID of the
    1447             :  *      next heap tuple, and if requested, scan->xs_itup points to a copy of
    1448             :  *      the index tuple.  so->currPos is updated as needed.
    1449             :  *
    1450             :  *      On failure exit (no more tuples), we release pin and set
    1451             :  *      so->currPos.buf to InvalidBuffer.
    1452             :  */
    1453             : bool
    1454    10113916 : _bt_next(IndexScanDesc scan, ScanDirection dir)
    1455             : {
    1456    10113916 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1457             :     BTScanPosItem *currItem;
    1458             : 
    1459             :     /*
    1460             :      * Advance to next tuple on current page; or if there's no more, try to
    1461             :      * step to the next page with data.
    1462             :      */
    1463    10113916 :     if (ScanDirectionIsForward(dir))
    1464             :     {
    1465    10109162 :         if (++so->currPos.itemIndex > so->currPos.lastItem)
    1466             :         {
    1467     1197358 :             if (!_bt_steppage(scan, dir))
    1468     1178940 :                 return false;
    1469             :         }
    1470             :     }
    1471             :     else
    1472             :     {
    1473        4754 :         if (--so->currPos.itemIndex < so->currPos.firstItem)
    1474             :         {
    1475          46 :             if (!_bt_steppage(scan, dir))
    1476          40 :                 return false;
    1477             :         }
    1478             :     }
    1479             : 
    1480             :     /* OK, itemIndex says what to return */
    1481     8934936 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    1482     8934936 :     scan->xs_heaptid = currItem->heapTid;
    1483     8934936 :     if (scan->xs_want_itup)
    1484     1990250 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1485             : 
    1486     8934936 :     return true;
    1487             : }
    1488             : 
    1489             : /*
    1490             :  *  _bt_readpage() -- Load data from current index page into so->currPos
    1491             :  *
    1492             :  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
    1493             :  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
    1494             :  * they are updated as appropriate.  All other fields of so->currPos are
    1495             :  * initialized from scratch here.
    1496             :  *
    1497             :  * We scan the current page starting at offnum and moving in the indicated
    1498             :  * direction.  All items matching the scan keys are loaded into currPos.items.
    1499             :  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
    1500             :  * that there can be no more matching tuples in the current scan direction.
    1501             :  *
    1502             :  * In the case of a parallel scan, caller must have called _bt_parallel_seize
    1503             :  * prior to calling this function; this function will invoke
    1504             :  * _bt_parallel_release before returning.
    1505             :  *
    1506             :  * Returns true if any matching items found on the page, false if none.
    1507             :  */
    1508             : static bool
    1509    10849926 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
    1510             : {
    1511    10849926 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1512             :     Page        page;
    1513             :     BTPageOpaque opaque;
    1514             :     OffsetNumber minoff;
    1515             :     OffsetNumber maxoff;
    1516             :     int         itemIndex;
    1517             :     bool        continuescan;
    1518             :     int         indnatts;
    1519             : 
    1520             :     /*
    1521             :      * We must have the buffer pinned and locked, but the usual macro can't be
    1522             :      * used here; this function is what makes it good for currPos.
    1523             :      */
    1524             :     Assert(BufferIsValid(so->currPos.buf));
    1525             : 
    1526    10849926 :     page = BufferGetPage(so->currPos.buf);
    1527    10849926 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1528             : 
    1529             :     /* allow next page be processed by parallel worker */
    1530    10849926 :     if (scan->parallel_scan)
    1531             :     {
    1532         856 :         if (ScanDirectionIsForward(dir))
    1533         856 :             _bt_parallel_release(scan, opaque->btpo_next);
    1534             :         else
    1535           0 :             _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
    1536             :     }
    1537             : 
    1538    10849926 :     continuescan = true;        /* default assumption */
    1539    10849926 :     indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
    1540    10849926 :     minoff = P_FIRSTDATAKEY(opaque);
    1541    10849926 :     maxoff = PageGetMaxOffsetNumber(page);
    1542             : 
    1543             :     /*
    1544             :      * We note the buffer's block number so that we can release the pin later.
    1545             :      * This allows us to re-read the buffer if it is needed again for hinting.
    1546             :      */
    1547    10849926 :     so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
    1548             : 
    1549             :     /*
    1550             :      * We save the LSN of the page as we read it, so that we know whether it
    1551             :      * safe to apply LP_DEAD hints to the page later.  This allows us to drop
    1552             :      * the pin for MVCC scans, which allows vacuum to avoid blocking.
    1553             :      */
    1554    10849926 :     so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
    1555             : 
    1556             :     /*
    1557             :      * we must save the page's right-link while scanning it; this tells us
    1558             :      * where to step right to after we're done with these items.  There is no
    1559             :      * corresponding need for the left-link, since splits always go right.
    1560             :      */
    1561    10849926 :     so->currPos.nextPage = opaque->btpo_next;
    1562             : 
    1563             :     /* initialize tuple workspace to empty */
    1564    10849926 :     so->currPos.nextTupleOffset = 0;
    1565             : 
    1566             :     /*
    1567             :      * Now that the current page has been made consistent, the macro should be
    1568             :      * good.
    1569             :      */
    1570             :     Assert(BTScanPosIsPinned(so->currPos));
    1571             : 
    1572    10849926 :     if (ScanDirectionIsForward(dir))
    1573             :     {
    1574             :         /* load items[] in ascending order */
    1575    10821988 :         itemIndex = 0;
    1576             : 
    1577    10821988 :         offnum = Max(offnum, minoff);
    1578             : 
    1579    85210704 :         while (offnum <= maxoff)
    1580             :         {
    1581    82449824 :             ItemId      iid = PageGetItemId(page, offnum);
    1582             :             IndexTuple  itup;
    1583             : 
    1584             :             /*
    1585             :              * If the scan specifies not to return killed tuples, then we
    1586             :              * treat a killed tuple as not passing the qual
    1587             :              */
    1588    82449824 :             if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    1589             :             {
    1590     1723906 :                 offnum = OffsetNumberNext(offnum);
    1591     1723906 :                 continue;
    1592             :             }
    1593             : 
    1594    80725918 :             itup = (IndexTuple) PageGetItem(page, iid);
    1595             : 
    1596    80725918 :             if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
    1597             :             {
    1598             :                 /* tuple passes all scan key conditions */
    1599    72477580 :                 if (!BTreeTupleIsPosting(itup))
    1600             :                 {
    1601             :                     /* Remember it */
    1602    72462460 :                     _bt_saveitem(so, itemIndex, offnum, itup);
    1603    72462460 :                     itemIndex++;
    1604             :                 }
    1605             :                 else
    1606             :                 {
    1607             :                     int         tupleOffset;
    1608             : 
    1609             :                     /*
    1610             :                      * Set up state to return posting list, and remember first
    1611             :                      * TID
    1612             :                      */
    1613             :                     tupleOffset =
    1614       15120 :                         _bt_setuppostingitems(so, itemIndex, offnum,
    1615             :                                               BTreeTupleGetPostingN(itup, 0),
    1616             :                                               itup);
    1617       15120 :                     itemIndex++;
    1618             :                     /* Remember additional TIDs */
    1619      953480 :                     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
    1620             :                     {
    1621      938360 :                         _bt_savepostingitem(so, itemIndex, offnum,
    1622             :                                             BTreeTupleGetPostingN(itup, i),
    1623             :                                             tupleOffset);
    1624      938360 :                         itemIndex++;
    1625             :                     }
    1626             :                 }
    1627             :             }
    1628             :             /* When !continuescan, there can't be any more matches, so stop */
    1629    80725918 :             if (!continuescan)
    1630     8061108 :                 break;
    1631             : 
    1632    72664810 :             offnum = OffsetNumberNext(offnum);
    1633             :         }
    1634             : 
    1635             :         /*
    1636             :          * We don't need to visit page to the right when the high key
    1637             :          * indicates that no more matches will be found there.
    1638             :          *
    1639             :          * Checking the high key like this works out more often than you might
    1640             :          * think.  Leaf page splits pick a split point between the two most
    1641             :          * dissimilar tuples (this is weighed against the need to evenly share
    1642             :          * free space).  Leaf pages with high key attribute values that can
    1643             :          * only appear on non-pivot tuples on the right sibling page are
    1644             :          * common.
    1645             :          */
    1646    10821988 :         if (continuescan && !P_RIGHTMOST(opaque))
    1647             :         {
    1648      530814 :             ItemId      iid = PageGetItemId(page, P_HIKEY);
    1649      530814 :             IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);
    1650             :             int         truncatt;
    1651             : 
    1652      530814 :             truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
    1653      530814 :             _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
    1654             :         }
    1655             : 
    1656    10821988 :         if (!continuescan)
    1657     8464290 :             so->currPos.moreRight = false;
    1658             : 
    1659             :         Assert(itemIndex <= MaxTIDsPerBTreePage);
    1660    10821988 :         so->currPos.firstItem = 0;
    1661    10821988 :         so->currPos.lastItem = itemIndex - 1;
    1662    10821988 :         so->currPos.itemIndex = 0;
    1663             :     }
    1664             :     else
    1665             :     {
    1666             :         /* load items[] in descending order */
    1667       27938 :         itemIndex = MaxTIDsPerBTreePage;
    1668             : 
    1669       27938 :         offnum = Min(offnum, maxoff);
    1670             : 
    1671     4030866 :         while (offnum >= minoff)
    1672             :         {
    1673     4002980 :             ItemId      iid = PageGetItemId(page, offnum);
    1674             :             IndexTuple  itup;
    1675             :             bool        tuple_alive;
    1676             :             bool        passes_quals;
    1677             : 
    1678             :             /*
    1679             :              * If the scan specifies not to return killed tuples, then we
    1680             :              * treat a killed tuple as not passing the qual.  Most of the
    1681             :              * time, it's a win to not bother examining the tuple's index
    1682             :              * keys, but just skip to the next tuple (previous, actually,
    1683             :              * since we're scanning backwards).  However, if this is the first
    1684             :              * tuple on the page, we do check the index keys, to prevent
    1685             :              * uselessly advancing to the page to the left.  This is similar
    1686             :              * to the high key optimization used by forward scans.
    1687             :              */
    1688     4002980 :             if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    1689             :             {
    1690             :                 Assert(offnum >= P_FIRSTDATAKEY(opaque));
    1691      251560 :                 if (offnum > P_FIRSTDATAKEY(opaque))
    1692             :                 {
    1693      251222 :                     offnum = OffsetNumberPrev(offnum);
    1694      251222 :                     continue;
    1695             :                 }
    1696             : 
    1697         338 :                 tuple_alive = false;
    1698             :             }
    1699             :             else
    1700     3751420 :                 tuple_alive = true;
    1701             : 
    1702     3751758 :             itup = (IndexTuple) PageGetItem(page, iid);
    1703             : 
    1704     3751758 :             passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
    1705             :                                          &continuescan);
    1706     3751758 :             if (passes_quals && tuple_alive)
    1707             :             {
    1708             :                 /* tuple passes all scan key conditions */
    1709     3751368 :                 if (!BTreeTupleIsPosting(itup))
    1710             :                 {
    1711             :                     /* Remember it */
    1712     3751356 :                     itemIndex--;
    1713     3751356 :                     _bt_saveitem(so, itemIndex, offnum, itup);
    1714             :                 }
    1715             :                 else
    1716             :                 {
    1717             :                     int         tupleOffset;
    1718             : 
    1719             :                     /*
    1720             :                      * Set up state to return posting list, and remember first
    1721             :                      * TID.
    1722             :                      *
    1723             :                      * Note that we deliberately save/return items from
    1724             :                      * posting lists in ascending heap TID order for backwards
    1725             :                      * scans.  This allows _bt_killitems() to make a
    1726             :                      * consistent assumption about the order of items
    1727             :                      * associated with the same posting list tuple.
    1728             :                      */
    1729          12 :                     itemIndex--;
    1730             :                     tupleOffset =
    1731          12 :                         _bt_setuppostingitems(so, itemIndex, offnum,
    1732             :                                               BTreeTupleGetPostingN(itup, 0),
    1733             :                                               itup);
    1734             :                     /* Remember additional TIDs */
    1735          24 :                     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
    1736             :                     {
    1737          12 :                         itemIndex--;
    1738          12 :                         _bt_savepostingitem(so, itemIndex, offnum,
    1739             :                                             BTreeTupleGetPostingN(itup, i),
    1740             :                                             tupleOffset);
    1741             :                     }
    1742             :                 }
    1743             :             }
    1744     3751758 :             if (!continuescan)
    1745             :             {
    1746             :                 /* there can't be any more matches, so stop */
    1747          52 :                 so->currPos.moreLeft = false;
    1748          52 :                 break;
    1749             :             }
    1750             : 
    1751     3751706 :             offnum = OffsetNumberPrev(offnum);
    1752             :         }
    1753             : 
    1754             :         Assert(itemIndex >= 0);
    1755       27938 :         so->currPos.firstItem = itemIndex;
    1756       27938 :         so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
    1757       27938 :         so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
    1758             :     }
    1759             : 
    1760    10849926 :     return (so->currPos.firstItem <= so->currPos.lastItem);
    1761             : }
    1762             : 
    1763             : /* Save an index item into so->currPos.items[itemIndex] */
    1764             : static void
    1765    76213816 : _bt_saveitem(BTScanOpaque so, int itemIndex,
    1766             :              OffsetNumber offnum, IndexTuple itup)
    1767             : {
    1768    76213816 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1769             : 
    1770             :     Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
    1771             : 
    1772    76213816 :     currItem->heapTid = itup->t_tid;
    1773    76213816 :     currItem->indexOffset = offnum;
    1774    76213816 :     if (so->currTuples)
    1775             :     {
    1776    14214498 :         Size        itupsz = IndexTupleSize(itup);
    1777             : 
    1778    14214498 :         currItem->tupleOffset = so->currPos.nextTupleOffset;
    1779    14214498 :         memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
    1780    14214498 :         so->currPos.nextTupleOffset += MAXALIGN(itupsz);
    1781             :     }
    1782    76213816 : }
    1783             : 
    1784             : /*
    1785             :  * Setup state to save TIDs/items from a single posting list tuple.
    1786             :  *
    1787             :  * Saves an index item into so->currPos.items[itemIndex] for TID that is
    1788             :  * returned to scan first.  Second or subsequent TIDs for posting list should
    1789             :  * be saved by calling _bt_savepostingitem().
    1790             :  *
    1791             :  * Returns an offset into tuple storage space that main tuple is stored at if
    1792             :  * needed.
    1793             :  */
    1794             : static int
    1795       15132 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
    1796             :                       ItemPointer heapTid, IndexTuple itup)
    1797             : {
    1798       15132 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1799             : 
    1800             :     Assert(BTreeTupleIsPosting(itup));
    1801             : 
    1802       15132 :     currItem->heapTid = *heapTid;
    1803       15132 :     currItem->indexOffset = offnum;
    1804       15132 :     if (so->currTuples)
    1805             :     {
    1806             :         /* Save base IndexTuple (truncate posting list) */
    1807             :         IndexTuple  base;
    1808        4836 :         Size        itupsz = BTreeTupleGetPostingOffset(itup);
    1809             : 
    1810        4836 :         itupsz = MAXALIGN(itupsz);
    1811        4836 :         currItem->tupleOffset = so->currPos.nextTupleOffset;
    1812        4836 :         base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
    1813        4836 :         memcpy(base, itup, itupsz);
    1814             :         /* Defensively reduce work area index tuple header size */
    1815        4836 :         base->t_info &= ~INDEX_SIZE_MASK;
    1816        4836 :         base->t_info |= itupsz;
    1817        4836 :         so->currPos.nextTupleOffset += itupsz;
    1818             : 
    1819        4836 :         return currItem->tupleOffset;
    1820             :     }
    1821             : 
    1822       10296 :     return 0;
    1823             : }
    1824             : 
    1825             : /*
    1826             :  * Save an index item into so->currPos.items[itemIndex] for current posting
    1827             :  * tuple.
    1828             :  *
    1829             :  * Assumes that _bt_setuppostingitems() has already been called for current
    1830             :  * posting list tuple.  Caller passes its return value as tupleOffset.
    1831             :  */
    1832             : static inline void
    1833      938372 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
    1834             :                     ItemPointer heapTid, int tupleOffset)
    1835             : {
    1836      938372 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1837             : 
    1838      938372 :     currItem->heapTid = *heapTid;
    1839      938372 :     currItem->indexOffset = offnum;
    1840             : 
    1841             :     /*
    1842             :      * Have index-only scans return the same base IndexTuple for every TID
    1843             :      * that originates from the same posting list
    1844             :      */
    1845      938372 :     if (so->currTuples)
    1846      346244 :         currItem->tupleOffset = tupleOffset;
    1847      938372 : }
    1848             : 
    1849             : /*
    1850             :  *  _bt_steppage() -- Step to next page containing valid data for scan
    1851             :  *
    1852             :  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
    1853             :  * if pinned, we'll drop the pin before moving to next page.  The buffer is
    1854             :  * not locked on entry.
    1855             :  *
    1856             :  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
    1857             :  * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
    1858             :  * to InvalidBuffer.  We return true to indicate success.
    1859             :  */
    1860             : static bool
    1861     3868450 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
    1862             : {
    1863     3868450 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1864     3868450 :     BlockNumber blkno = InvalidBlockNumber;
    1865     3868450 :     bool        status = true;
    1866             : 
    1867             :     Assert(BTScanPosIsValid(so->currPos));
    1868             : 
    1869             :     /* Before leaving current page, deal with any killed items */
    1870     3868450 :     if (so->numKilled > 0)
    1871       43186 :         _bt_killitems(scan);
    1872             : 
    1873             :     /*
    1874             :      * Before we modify currPos, make a copy of the page data if there was a
    1875             :      * mark position that needs it.
    1876             :      */
    1877     3868450 :     if (so->markItemIndex >= 0)
    1878             :     {
    1879             :         /* bump pin on current buffer for assignment to mark buffer */
    1880         246 :         if (BTScanPosIsPinned(so->currPos))
    1881         236 :             IncrBufferRefCount(so->currPos.buf);
    1882         246 :         memcpy(&so->markPos, &so->currPos,
    1883             :                offsetof(BTScanPosData, items[1]) +
    1884         246 :                so->currPos.lastItem * sizeof(BTScanPosItem));
    1885         246 :         if (so->markTuples)
    1886         236 :             memcpy(so->markTuples, so->currTuples,
    1887         236 :                    so->currPos.nextTupleOffset);
    1888         246 :         so->markPos.itemIndex = so->markItemIndex;
    1889         246 :         so->markItemIndex = -1;
    1890             :     }
    1891             : 
    1892     3868450 :     if (ScanDirectionIsForward(dir))
    1893             :     {
    1894             :         /* Walk right to the next page with data */
    1895     3868396 :         if (scan->parallel_scan != NULL)
    1896             :         {
    1897             :             /*
    1898             :              * Seize the scan to get the next block number; if the scan has
    1899             :              * ended already, bail out.
    1900             :              */
    1901         856 :             status = _bt_parallel_seize(scan, &blkno);
    1902         856 :             if (!status)
    1903             :             {
    1904             :                 /* release the previous buffer, if pinned */
    1905           0 :                 BTScanPosUnpinIfPinned(so->currPos);
    1906           0 :                 BTScanPosInvalidate(so->currPos);
    1907           0 :                 return false;
    1908             :             }
    1909             :         }
    1910             :         else
    1911             :         {
    1912             :             /* Not parallel, so use the previously-saved nextPage link. */
    1913     3867540 :             blkno = so->currPos.nextPage;
    1914             :         }
    1915             : 
    1916             :         /* Remember we left a page with data */
    1917     3868396 :         so->currPos.moreLeft = true;
    1918             : 
    1919             :         /* release the previous buffer, if pinned */
    1920     3868396 :         BTScanPosUnpinIfPinned(so->currPos);
    1921             :     }
    1922             :     else
    1923             :     {
    1924             :         /* Remember we left a page with data */
    1925          54 :         so->currPos.moreRight = true;
    1926             : 
    1927          54 :         if (scan->parallel_scan != NULL)
    1928             :         {
    1929             :             /*
    1930             :              * Seize the scan to get the current block number; if the scan has
    1931             :              * ended already, bail out.
    1932             :              */
    1933           0 :             status = _bt_parallel_seize(scan, &blkno);
    1934           0 :             BTScanPosUnpinIfPinned(so->currPos);
    1935           0 :             if (!status)
    1936             :             {
    1937           0 :                 BTScanPosInvalidate(so->currPos);
    1938           0 :                 return false;
    1939             :             }
    1940             :         }
    1941             :         else
    1942             :         {
    1943             :             /* Not parallel, so just use our own notion of the current page */
    1944          54 :             blkno = so->currPos.currPage;
    1945             :         }
    1946             :     }
    1947             : 
    1948     3868450 :     if (!_bt_readnextpage(scan, blkno, dir))
    1949     3849978 :         return false;
    1950             : 
    1951             :     /* Drop the lock, and maybe the pin, on the current page */
    1952       18472 :     _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    1953             : 
    1954       18472 :     return true;
    1955             : }
    1956             : 
    1957             : /*
    1958             :  *  _bt_readnextpage() -- Read next page containing valid data for scan
    1959             :  *
    1960             :  * On success exit, so->currPos is updated to contain data from the next
    1961             :  * interesting page.  Caller is responsible to release lock and pin on
    1962             :  * buffer on success.  We return true to indicate success.
    1963             :  *
    1964             :  * If there are no more matching records in the given direction, we drop all
    1965             :  * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
    1966             :  */
    1967             : static bool
    1968     3868450 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
    1969             : {
    1970     3868450 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1971             :     Relation    rel;
    1972             :     Page        page;
    1973             :     BTPageOpaque opaque;
    1974     3868450 :     bool        status = true;
    1975             : 
    1976     3868450 :     rel = scan->indexRelation;
    1977             : 
    1978     3868450 :     if (ScanDirectionIsForward(dir))
    1979             :     {
    1980             :         for (;;)
    1981             :         {
    1982             :             /*
    1983             :              * if we're at end of scan, give up and mark parallel scan as
    1984             :              * done, so that all the workers can finish their scan
    1985             :              */
    1986     3868790 :             if (blkno == P_NONE || !so->currPos.moreRight)
    1987             :             {
    1988     3849934 :                 _bt_parallel_done(scan);
    1989     3849934 :                 BTScanPosInvalidate(so->currPos);
    1990     3849934 :                 return false;
    1991             :             }
    1992             :             /* check for interrupts while we're not holding any buffer lock */
    1993       18856 :             CHECK_FOR_INTERRUPTS();
    1994             :             /* step right one page */
    1995       18856 :             so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    1996       18856 :             page = BufferGetPage(so->currPos.buf);
    1997       18856 :             TestForOldSnapshot(scan->xs_snapshot, rel, page);
    1998       18856 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1999             :             /* check for deleted page */
    2000       18856 :             if (!P_IGNORE(opaque))
    2001             :             {
    2002       18856 :                 PredicateLockPage(rel, blkno, scan->xs_snapshot);
    2003             :                 /* see if there are any matches on this page */
    2004             :                 /* note that this will clear moreRight if we can stop */
    2005       18856 :                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
    2006       18462 :                     break;
    2007             :             }
    2008           0 :             else if (scan->parallel_scan != NULL)
    2009             :             {
    2010             :                 /* allow next page be processed by parallel worker */
    2011           0 :                 _bt_parallel_release(scan, opaque->btpo_next);
    2012             :             }
    2013             : 
    2014             :             /* nope, keep going */
    2015         394 :             if (scan->parallel_scan != NULL)
    2016             :             {
    2017           0 :                 _bt_relbuf(rel, so->currPos.buf);
    2018           0 :                 status = _bt_parallel_seize(scan, &blkno);
    2019           0 :                 if (!status)
    2020             :                 {
    2021           0 :                     BTScanPosInvalidate(so->currPos);
    2022           0 :                     return false;
    2023             :                 }
    2024             :             }
    2025             :             else
    2026             :             {
    2027         394 :                 blkno = opaque->btpo_next;
    2028         394 :                 _bt_relbuf(rel, so->currPos.buf);
    2029             :             }
    2030             :         }
    2031             :     }
    2032             :     else
    2033             :     {
    2034             :         /*
    2035             :          * Should only happen in parallel cases, when some other backend
    2036             :          * advanced the scan.
    2037             :          */
    2038          54 :         if (so->currPos.currPage != blkno)
    2039             :         {
    2040           0 :             BTScanPosUnpinIfPinned(so->currPos);
    2041           0 :             so->currPos.currPage = blkno;
    2042             :         }
    2043             : 
    2044             :         /*
    2045             :          * Walk left to the next page with data.  This is much more complex
    2046             :          * than the walk-right case because of the possibility that the page
    2047             :          * to our left splits while we are in flight to it, plus the
    2048             :          * possibility that the page we were on gets deleted after we leave
    2049             :          * it.  See nbtree/README for details.
    2050             :          *
    2051             :          * It might be possible to rearrange this code to have less overhead
    2052             :          * in pinning and locking, but that would require capturing the left
    2053             :          * pointer when the page is initially read, and using it here, along
    2054             :          * with big changes to _bt_walk_left() and the code below.  It is not
    2055             :          * clear whether this would be a win, since if the page immediately to
    2056             :          * the left splits after we read this page and before we step left, we
    2057             :          * would need to visit more pages than with the current code.
    2058             :          *
    2059             :          * Note that if we change the code so that we drop the pin for a scan
    2060             :          * which uses a non-MVCC snapshot, we will need to modify the code for
    2061             :          * walking left, to allow for the possibility that a referenced page
    2062             :          * has been deleted.  As long as the buffer is pinned or the snapshot
    2063             :          * is MVCC the page cannot move past the half-dead state to fully
    2064             :          * deleted.
    2065             :          */
    2066          54 :         if (BTScanPosIsPinned(so->currPos))
    2067          26 :             LockBuffer(so->currPos.buf, BT_READ);
    2068             :         else
    2069          28 :             so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
    2070             : 
    2071             :         for (;;)
    2072             :         {
    2073             :             /* Done if we know there are no matching keys to the left */
    2074          56 :             if (!so->currPos.moreLeft)
    2075             :             {
    2076          28 :                 _bt_relbuf(rel, so->currPos.buf);
    2077          28 :                 _bt_parallel_done(scan);
    2078          28 :                 BTScanPosInvalidate(so->currPos);
    2079          28 :                 return false;
    2080             :             }
    2081             : 
    2082             :             /* Step to next physical page */
    2083          28 :             so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
    2084             :                                             scan->xs_snapshot);
    2085             : 
    2086             :             /* if we're physically at end of index, return failure */
    2087          28 :             if (so->currPos.buf == InvalidBuffer)
    2088             :             {
    2089          16 :                 _bt_parallel_done(scan);
    2090          16 :                 BTScanPosInvalidate(so->currPos);
    2091          16 :                 return false;
    2092             :             }
    2093             : 
    2094             :             /*
    2095             :              * Okay, we managed to move left to a non-deleted page. Done if
    2096             :              * it's not half-dead and contains matching tuples. Else loop back
    2097             :              * and do it all again.
    2098             :              */
    2099          12 :             page = BufferGetPage(so->currPos.buf);
    2100          12 :             TestForOldSnapshot(scan->xs_snapshot, rel, page);
    2101          12 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2102          12 :             if (!P_IGNORE(opaque))
    2103             :             {
    2104          12 :                 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
    2105             :                 /* see if there are any matches on this page */
    2106             :                 /* note that this will clear moreLeft if we can stop */
    2107          12 :                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
    2108          10 :                     break;
    2109             :             }
    2110           0 :             else if (scan->parallel_scan != NULL)
    2111             :             {
    2112             :                 /* allow next page be processed by parallel worker */
    2113           0 :                 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
    2114             :             }
    2115             : 
    2116             :             /*
    2117             :              * For parallel scans, get the last page scanned as it is quite
    2118             :              * possible that by the time we try to seize the scan, some other
    2119             :              * worker has already advanced the scan to a different page.  We
    2120             :              * must continue based on the latest page scanned by any worker.
    2121             :              */
    2122           2 :             if (scan->parallel_scan != NULL)
    2123             :             {
    2124           0 :                 _bt_relbuf(rel, so->currPos.buf);
    2125           0 :                 status = _bt_parallel_seize(scan, &blkno);
    2126           0 :                 if (!status)
    2127             :                 {
    2128           0 :                     BTScanPosInvalidate(so->currPos);
    2129           0 :                     return false;
    2130             :                 }
    2131           0 :                 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
    2132             :             }
    2133             :         }
    2134             :     }
    2135             : 
    2136       18472 :     return true;
    2137             : }
    2138             : 
    2139             : /*
    2140             :  *  _bt_parallel_readpage() -- Read current page containing valid data for scan
    2141             :  *
    2142             :  * On success, release lock and maybe pin on buffer.  We return true to
    2143             :  * indicate success.
    2144             :  */
    2145             : static bool
    2146           0 : _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
    2147             : {
    2148           0 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2149             : 
    2150           0 :     _bt_initialize_more_data(so, dir);
    2151             : 
    2152           0 :     if (!_bt_readnextpage(scan, blkno, dir))
    2153           0 :         return false;
    2154             : 
    2155             :     /* Drop the lock, and maybe the pin, on the current page */
    2156           0 :     _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    2157             : 
    2158           0 :     return true;
    2159             : }
    2160             : 
    2161             : /*
    2162             :  * _bt_walk_left() -- step left one page, if possible
    2163             :  *
    2164             :  * The given buffer must be pinned and read-locked.  This will be dropped
    2165             :  * before stepping left.  On return, we have pin and read lock on the
    2166             :  * returned page, instead.
    2167             :  *
    2168             :  * Returns InvalidBuffer if there is no page to the left (no lock is held
    2169             :  * in that case).
    2170             :  *
    2171             :  * When working on a non-leaf level, it is possible for the returned page
    2172             :  * to be half-dead; the caller should check that condition and step left
    2173             :  * again if it's important.
    2174             :  */
    2175             : static Buffer
    2176          28 : _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
    2177             : {
    2178             :     Page        page;
    2179             :     BTPageOpaque opaque;
    2180             : 
    2181          28 :     page = BufferGetPage(buf);
    2182          28 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2183             : 
    2184             :     for (;;)
    2185           0 :     {
    2186             :         BlockNumber obknum;
    2187             :         BlockNumber lblkno;
    2188             :         BlockNumber blkno;
    2189             :         int         tries;
    2190             : 
    2191             :         /* if we're at end of tree, release buf and return failure */
    2192          28 :         if (P_LEFTMOST(opaque))
    2193             :         {
    2194          16 :             _bt_relbuf(rel, buf);
    2195          16 :             break;
    2196             :         }
    2197             :         /* remember original page we are stepping left from */
    2198          12 :         obknum = BufferGetBlockNumber(buf);
    2199             :         /* step left */
    2200          12 :         blkno = lblkno = opaque->btpo_prev;
    2201          12 :         _bt_relbuf(rel, buf);
    2202             :         /* check for interrupts while we're not holding any buffer lock */
    2203          12 :         CHECK_FOR_INTERRUPTS();
    2204          12 :         buf = _bt_getbuf(rel, blkno, BT_READ);
    2205          12 :         page = BufferGetPage(buf);
    2206          12 :         TestForOldSnapshot(snapshot, rel, page);
    2207          12 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2208             : 
    2209             :         /*
    2210             :          * If this isn't the page we want, walk right till we find what we
    2211             :          * want --- but go no more than four hops (an arbitrary limit). If we
    2212             :          * don't find the correct page by then, the most likely bet is that
    2213             :          * the original page got deleted and isn't in the sibling chain at all
    2214             :          * anymore, not that its left sibling got split more than four times.
    2215             :          *
    2216             :          * Note that it is correct to test P_ISDELETED not P_IGNORE here,
    2217             :          * because half-dead pages are still in the sibling chain.  Caller
    2218             :          * must reject half-dead pages if wanted.
    2219             :          */
    2220          12 :         tries = 0;
    2221             :         for (;;)
    2222             :         {
    2223          12 :             if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
    2224             :             {
    2225             :                 /* Found desired page, return it */
    2226          12 :                 return buf;
    2227             :             }
    2228           0 :             if (P_RIGHTMOST(opaque) || ++tries > 4)
    2229             :                 break;
    2230           0 :             blkno = opaque->btpo_next;
    2231           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2232           0 :             page = BufferGetPage(buf);
    2233           0 :             TestForOldSnapshot(snapshot, rel, page);
    2234           0 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2235             :         }
    2236             : 
    2237             :         /* Return to the original page to see what's up */
    2238           0 :         buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
    2239           0 :         page = BufferGetPage(buf);
    2240           0 :         TestForOldSnapshot(snapshot, rel, page);
    2241           0 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2242           0 :         if (P_ISDELETED(opaque))
    2243             :         {
    2244             :             /*
    2245             :              * It was deleted.  Move right to first nondeleted page (there
    2246             :              * must be one); that is the page that has acquired the deleted
    2247             :              * one's keyspace, so stepping left from it will take us where we
    2248             :              * want to be.
    2249             :              */
    2250             :             for (;;)
    2251             :             {
    2252           0 :                 if (P_RIGHTMOST(opaque))
    2253           0 :                     elog(ERROR, "fell off the end of index \"%s\"",
    2254             :                          RelationGetRelationName(rel));
    2255           0 :                 blkno = opaque->btpo_next;
    2256           0 :                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2257           0 :                 page = BufferGetPage(buf);
    2258           0 :                 TestForOldSnapshot(snapshot, rel, page);
    2259           0 :                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2260           0 :                 if (!P_ISDELETED(opaque))
    2261           0 :                     break;
    2262             :             }
    2263             : 
    2264             :             /*
    2265             :              * Now return to top of loop, resetting obknum to point to this
    2266             :              * nondeleted page, and try again.
    2267             :              */
    2268             :         }
    2269             :         else
    2270             :         {
    2271             :             /*
    2272             :              * It wasn't deleted; the explanation had better be that the page
    2273             :              * to the left got split or deleted. Without this check, we'd go
    2274             :              * into an infinite loop if there's anything wrong.
    2275             :              */
    2276           0 :             if (opaque->btpo_prev == lblkno)
    2277           0 :                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
    2278             :                      obknum, RelationGetRelationName(rel));
    2279             :             /* Okay to try again with new lblkno value */
    2280             :         }
    2281             :     }
    2282             : 
    2283          16 :     return InvalidBuffer;
    2284             : }
    2285             : 
    2286             : /*
    2287             :  * _bt_get_endpoint() -- Find the first or last page on a given tree level
    2288             :  *
    2289             :  * If the index is empty, we will return InvalidBuffer; any other failure
    2290             :  * condition causes ereport().  We will not return a dead page.
    2291             :  *
    2292             :  * The returned buffer is pinned and read-locked.
    2293             :  */
    2294             : Buffer
    2295       47952 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
    2296             :                  Snapshot snapshot)
    2297             : {
    2298             :     Buffer      buf;
    2299             :     Page        page;
    2300             :     BTPageOpaque opaque;
    2301             :     OffsetNumber offnum;
    2302             :     BlockNumber blkno;
    2303             :     IndexTuple  itup;
    2304             : 
    2305             :     /*
    2306             :      * If we are looking for a leaf page, okay to descend from fast root;
    2307             :      * otherwise better descend from true root.  (There is no point in being
    2308             :      * smarter about intermediate levels.)
    2309             :      */
    2310       47952 :     if (level == 0)
    2311       47946 :         buf = _bt_getroot(rel, BT_READ);
    2312             :     else
    2313           6 :         buf = _bt_gettrueroot(rel);
    2314             : 
    2315       47952 :     if (!BufferIsValid(buf))
    2316        3276 :         return InvalidBuffer;
    2317             : 
    2318       44676 :     page = BufferGetPage(buf);
    2319       44676 :     TestForOldSnapshot(snapshot, rel, page);
    2320       44676 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2321             : 
    2322             :     for (;;)
    2323             :     {
    2324             :         /*
    2325             :          * If we landed on a deleted page, step right to find a live page
    2326             :          * (there must be one).  Also, if we want the rightmost page, step
    2327             :          * right if needed to get to it (this could happen if the page split
    2328             :          * since we obtained a pointer to it).
    2329             :          */
    2330       71508 :         while (P_IGNORE(opaque) ||
    2331          36 :                (rightmost && !P_RIGHTMOST(opaque)))
    2332             :         {
    2333           0 :             blkno = opaque->btpo_next;
    2334           0 :             if (blkno == P_NONE)
    2335           0 :                 elog(ERROR, "fell off the end of index \"%s\"",
    2336             :                      RelationGetRelationName(rel));
    2337           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2338           0 :             page = BufferGetPage(buf);
    2339           0 :             TestForOldSnapshot(snapshot, rel, page);
    2340           0 :             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2341             :         }
    2342             : 
    2343             :         /* Done? */
    2344       71508 :         if (opaque->btpo.level == level)
    2345       44676 :             break;
    2346       26832 :         if (opaque->btpo.level < level)
    2347           0 :             ereport(ERROR,
    2348             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2349             :                      errmsg_internal("btree level %u not found in index \"%s\"",
    2350             :                                      level, RelationGetRelationName(rel))));
    2351             : 
    2352             :         /* Descend to leftmost or rightmost child page */
    2353       26832 :         if (rightmost)
    2354           4 :             offnum = PageGetMaxOffsetNumber(page);
    2355             :         else
    2356       26828 :             offnum = P_FIRSTDATAKEY(opaque);
    2357             : 
    2358       26832 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    2359       26832 :         blkno = BTreeTupleGetDownLink(itup);
    2360             : 
    2361       26832 :         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2362       26832 :         page = BufferGetPage(buf);
    2363       26832 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2364             :     }
    2365             : 
    2366       44676 :     return buf;
    2367             : }
    2368             : 
    2369             : /*
    2370             :  *  _bt_endpoint() -- Find the first or last page in the index, and scan
    2371             :  * from there to the first key satisfying all the quals.
    2372             :  *
    2373             :  * This is used by _bt_first() to set up a scan when we've determined
    2374             :  * that the scan must start at the beginning or end of the index (for
    2375             :  * a forward or backward scan respectively).  Exit conditions are the
    2376             :  * same as for _bt_first().
    2377             :  */
    2378             : static bool
    2379       47946 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
    2380             : {
    2381       47946 :     Relation    rel = scan->indexRelation;
    2382       47946 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2383             :     Buffer      buf;
    2384             :     Page        page;
    2385             :     BTPageOpaque opaque;
    2386             :     OffsetNumber start;
    2387             :     BTScanPosItem *currItem;
    2388             : 
    2389             :     /*
    2390             :      * Scan down to the leftmost or rightmost leaf page.  This is a simplified
    2391             :      * version of _bt_search().  We don't maintain a stack since we know we
    2392             :      * won't need it.
    2393             :      */
    2394       47946 :     buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
    2395             : 
    2396       47946 :     if (!BufferIsValid(buf))
    2397             :     {
    2398             :         /*
    2399             :          * Empty index. Lock the whole relation, as nothing finer to lock
    2400             :          * exists.
    2401             :          */
    2402        3276 :         PredicateLockRelation(rel, scan->xs_snapshot);
    2403        3276 :         BTScanPosInvalidate(so->currPos);
    2404        3276 :         return false;
    2405             :     }
    2406             : 
    2407       44670 :     PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
    2408       44670 :     page = BufferGetPage(buf);
    2409       44670 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    2410             :     Assert(P_ISLEAF(opaque));
    2411             : 
    2412       44670 :     if (ScanDirectionIsForward(dir))
    2413             :     {
    2414             :         /* There could be dead pages to the left, so not this: */
    2415             :         /* Assert(P_LEFTMOST(opaque)); */
    2416             : 
    2417       44638 :         start = P_FIRSTDATAKEY(opaque);
    2418             :     }
    2419          32 :     else if (ScanDirectionIsBackward(dir))
    2420             :     {
    2421             :         Assert(P_RIGHTMOST(opaque));
    2422             : 
    2423          32 :         start = PageGetMaxOffsetNumber(page);
    2424             :     }
    2425             :     else
    2426             :     {
    2427           0 :         elog(ERROR, "invalid scan direction: %d", (int) dir);
    2428             :         start = 0;              /* keep compiler quiet */
    2429             :     }
    2430             : 
    2431             :     /* remember which buffer we have pinned */
    2432       44670 :     so->currPos.buf = buf;
    2433             : 
    2434       44670 :     _bt_initialize_more_data(so, dir);
    2435             : 
    2436             :     /*
    2437             :      * Now load data from the first page of the scan.
    2438             :      */
    2439       44670 :     if (!_bt_readpage(scan, dir, start))
    2440             :     {
    2441             :         /*
    2442             :          * There's no actually-matching data on this page.  Try to advance to
    2443             :          * the next page.  Return false if there's no matching data at all.
    2444             :          */
    2445         100 :         LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
    2446         100 :         if (!_bt_steppage(scan, dir))
    2447          70 :             return false;
    2448             :     }
    2449             :     else
    2450             :     {
    2451             :         /* Drop the lock, and maybe the pin, on the current page */
    2452       44570 :         _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    2453             :     }
    2454             : 
    2455             :     /* OK, itemIndex says what to return */
    2456       44600 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    2457       44600 :     scan->xs_heaptid = currItem->heapTid;
    2458       44600 :     if (scan->xs_want_itup)
    2459       42294 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    2460             : 
    2461       44600 :     return true;
    2462             : }
    2463             : 
    2464             : /*
    2465             :  * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
    2466             :  * for scan direction
    2467             :  */
    2468             : static inline void
    2469    10831058 : _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
    2470             : {
    2471             :     /* initialize moreLeft/moreRight appropriately for scan direction */
    2472    10831058 :     if (ScanDirectionIsForward(dir))
    2473             :     {
    2474    10803132 :         so->currPos.moreLeft = false;
    2475    10803132 :         so->currPos.moreRight = true;
    2476             :     }
    2477             :     else
    2478             :     {
    2479       27926 :         so->currPos.moreLeft = true;
    2480       27926 :         so->currPos.moreRight = false;
    2481             :     }
    2482    10831058 :     so->numKilled = 0;           /* just paranoia */
    2483    10831058 :     so->markItemIndex = -1;      /* ditto */
    2484    10831058 : }

Generated by: LCOV version 1.13