LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtree.c
Test: PostgreSQL 18devel          Date: 2025-04-22 08:15:34
Coverage:  Lines: 480 / 556 (86.3 %)    Functions: 26 / 28 (92.9 %)

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtree.c
       4             :  *    Implementation of Lehman and Yao's btree management algorithm for
       5             :  *    Postgres.
       6             :  *
       7             :  * NOTES
       8             :  *    This file contains only the public interface routines.
       9             :  *
      10             :  *
      11             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      12             :  * Portions Copyright (c) 1994, Regents of the University of California
      13             :  *
      14             :  * IDENTIFICATION
      15             :  *    src/backend/access/nbtree/nbtree.c
      16             :  *
      17             :  *-------------------------------------------------------------------------
      18             :  */
      19             : #include "postgres.h"
      20             : 
      21             : #include "access/nbtree.h"
      22             : #include "access/relscan.h"
      23             : #include "access/stratnum.h"
      24             : #include "commands/progress.h"
      25             : #include "commands/vacuum.h"
      26             : #include "nodes/execnodes.h"
      27             : #include "pgstat.h"
      28             : #include "storage/bulk_write.h"
      29             : #include "storage/condition_variable.h"
      30             : #include "storage/indexfsm.h"
      31             : #include "storage/ipc.h"
      32             : #include "storage/lmgr.h"
      33             : #include "storage/read_stream.h"
      34             : #include "utils/datum.h"
      35             : #include "utils/fmgrprotos.h"
      36             : #include "utils/index_selfuncs.h"
      37             : #include "utils/memutils.h"
      38             : 
      39             : 
      40             : /*
      41             :  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
      42             :  *
      43             :  * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
      44             :  * scan to advance it via another call to _bt_first.
      45             :  *
      46             :  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
      47             :  * a new page; others must wait.
      48             :  *
      49             :  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
      50             :  * to a new page; some process can start doing that.
      51             :  *
      52             :  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
      53             :  */
      54             : typedef enum
      55             : {
      56             :     BTPARALLEL_NOT_INITIALIZED,
      57             :     BTPARALLEL_NEED_PRIMSCAN,
      58             :     BTPARALLEL_ADVANCING,
      59             :     BTPARALLEL_IDLE,
      60             :     BTPARALLEL_DONE,
      61             : } BTPS_State;
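/*
 * Illustration (not part of nbtree.c): the state transitions implied by
 * the comments above and by _bt_parallel_seize/_bt_parallel_release
 * further down, approximately:
 *
 *   BTPARALLEL_NOT_INITIALIZED
 *       |  seized via _bt_parallel_seize()
 *       v
 *   BTPARALLEL_ADVANCING  <---------------------------------+
 *       |  _bt_parallel_release()                            |
 *       v                                                    |
 *   BTPARALLEL_IDLE  -- seized by any backend ---------------+
 *       |  _bt_parallel_primscan_schedule()                  |
 *       v                                                    |
 *   BTPARALLEL_NEED_PRIMSCAN  -- seized from _bt_first ------+
 *
 *   BTPARALLEL_DONE is reached via _bt_parallel_done().
 */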
      62             : 
      63             : /*
      64             :  * BTParallelScanDescData contains btree specific shared information required
      65             :  * for parallel scan.
      66             :  */
      67             : typedef struct BTParallelScanDescData
      68             : {
      69             :     BlockNumber btps_nextScanPage;  /* next page to be scanned */
      70             :     BlockNumber btps_lastCurrPage;  /* page whose sibling link was copied into
      71             :                                      * btps_nextScanPage */
      72             :     BTPS_State  btps_pageStatus;    /* indicates whether next page is
      73             :                                      * available for scan. see above for
      74             :                                      * possible states of parallel scan. */
      75             :     LWLock      btps_lock;      /* protects shared parallel state */
      76             :     ConditionVariable btps_cv;  /* used to synchronize parallel scan */
      77             : 
      78             :     /*
      79             :      * btps_arrElems is used when scans need to schedule another primitive
      80             :      * index scan with one or more SAOP arrays.  Holds BTArrayKeyInfo.cur_elem
      81             :      * offsets for each = scan key associated with a ScalarArrayOp array.
      82             :      */
      83             :     int         btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
      84             : 
      85             :     /*
      86             :      * Additional space (at the end of the struct) is used when scans need to
      87             :      * schedule another primitive index scan with one or more skip arrays.
      88             :      * Holds a flattened datum representation for each = scan key associated
      89             :      * with a skip array.
      90             :      */
      91             : }           BTParallelScanDescData;
      92             : 
      93             : typedef struct BTParallelScanDescData *BTParallelScanDesc;
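/*
 * Illustration (not part of nbtree.c): the shared state described above
 * is a fixed struct followed by variable-length trailers, roughly:
 *
 *   +---------------------------------------------------------+
 *   | fixed fields (btps_nextScanPage .. btps_cv)              |
 *   +---------------------------------------------------------+
 *   | btps_arrElems[0 .. numArrayKeys - 1]                     |
 *   |   (one int cur_elem offset per SAOP array's scan key)    |
 *   +---------------------------------------------------------+
 *   | per skip array: int sk_flags, then usually a             |
 *   | datumSerialize()d copy of the key's sk_argument          |
 *   +---------------------------------------------------------+
 *
 * _bt_parallel_serialize_arrays and _bt_parallel_restore_arrays below
 * walk the trailing space through their local "datumshared" pointer.
 */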
      94             : 
      95             : 
      96             : static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
      97             :                                           BTScanOpaque so);
      98             : static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
      99             :                                         BTScanOpaque so);
     100             : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     101             :                          IndexBulkDeleteCallback callback, void *callback_state,
     102             :                          BTCycleId cycleid);
     103             : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
     104             : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
     105             :                                           IndexTuple posting,
     106             :                                           OffsetNumber updatedoffset,
     107             :                                           int *nremaining);
     108             : 
     109             : 
     110             : /*
     111             :  * Btree handler function: return IndexAmRoutine with access method parameters
     112             :  * and callbacks.
     113             :  */
     114             : Datum
     115     3389988 : bthandler(PG_FUNCTION_ARGS)
     116             : {
     117     3389988 :     IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
     118             : 
     119     3389988 :     amroutine->amstrategies = BTMaxStrategyNumber;
     120     3389988 :     amroutine->amsupport = BTNProcs;
     121     3389988 :     amroutine->amoptsprocnum = BTOPTIONS_PROC;
     122     3389988 :     amroutine->amcanorder = true;
     123     3389988 :     amroutine->amcanorderbyop = false;
     124     3389988 :     amroutine->amcanhash = false;
     125     3389988 :     amroutine->amconsistentequality = true;
     126     3389988 :     amroutine->amconsistentordering = true;
     127     3389988 :     amroutine->amcanbackward = true;
     128     3389988 :     amroutine->amcanunique = true;
     129     3389988 :     amroutine->amcanmulticol = true;
     130     3389988 :     amroutine->amoptionalkey = true;
     131     3389988 :     amroutine->amsearcharray = true;
     132     3389988 :     amroutine->amsearchnulls = true;
     133     3389988 :     amroutine->amstorage = false;
     134     3389988 :     amroutine->amclusterable = true;
     135     3389988 :     amroutine->ampredlocks = true;
     136     3389988 :     amroutine->amcanparallel = true;
     137     3389988 :     amroutine->amcanbuildparallel = true;
     138     3389988 :     amroutine->amcaninclude = true;
     139     3389988 :     amroutine->amusemaintenanceworkmem = false;
     140     3389988 :     amroutine->amsummarizing = false;
     141     3389988 :     amroutine->amparallelvacuumoptions =
     142             :         VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
     143     3389988 :     amroutine->amkeytype = InvalidOid;
     144             : 
     145     3389988 :     amroutine->ambuild = btbuild;
     146     3389988 :     amroutine->ambuildempty = btbuildempty;
     147     3389988 :     amroutine->aminsert = btinsert;
     148     3389988 :     amroutine->aminsertcleanup = NULL;
     149     3389988 :     amroutine->ambulkdelete = btbulkdelete;
     150     3389988 :     amroutine->amvacuumcleanup = btvacuumcleanup;
     151     3389988 :     amroutine->amcanreturn = btcanreturn;
     152     3389988 :     amroutine->amcostestimate = btcostestimate;
     153     3389988 :     amroutine->amgettreeheight = btgettreeheight;
     154     3389988 :     amroutine->amoptions = btoptions;
     155     3389988 :     amroutine->amproperty = btproperty;
     156     3389988 :     amroutine->ambuildphasename = btbuildphasename;
     157     3389988 :     amroutine->amvalidate = btvalidate;
     158     3389988 :     amroutine->amadjustmembers = btadjustmembers;
     159     3389988 :     amroutine->ambeginscan = btbeginscan;
     160     3389988 :     amroutine->amrescan = btrescan;
     161     3389988 :     amroutine->amgettuple = btgettuple;
     162     3389988 :     amroutine->amgetbitmap = btgetbitmap;
     163     3389988 :     amroutine->amendscan = btendscan;
     164     3389988 :     amroutine->ammarkpos = btmarkpos;
     165     3389988 :     amroutine->amrestrpos = btrestrpos;
     166     3389988 :     amroutine->amestimateparallelscan = btestimateparallelscan;
     167     3389988 :     amroutine->aminitparallelscan = btinitparallelscan;
     168     3389988 :     amroutine->amparallelrescan = btparallelrescan;
     169     3389988 :     amroutine->amtranslatestrategy = bttranslatestrategy;
     170     3389988 :     amroutine->amtranslatecmptype = bttranslatecmptype;
     171             : 
     172     3389988 :     PG_RETURN_POINTER(amroutine);
     173             : }
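/*
 * Context (not part of nbtree.c): bthandler is reached through the
 * amhandler column of pg_am.  A minimal sketch of the lookup core code
 * performs, using GetIndexAmRoutine() from access/amapi.h; the OID
 * variable here is hypothetical:
 *
 *   Oid             btreeHandlerOid = ...;    from pg_am.amhandler
 *   IndexAmRoutine *amroutine;
 *
 *   amroutine = GetIndexAmRoutine(btreeHandlerOid);  calls bthandler()
 *   Assert(amroutine->amcanorder);   btree supports ordered scans
 */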
     174             : 
     175             : /*
     176             :  *  btbuildempty() -- build an empty btree index in the initialization fork
     177             :  */
     178             : void
     179         164 : btbuildempty(Relation index)
     180             : {
     181         164 :     bool        allequalimage = _bt_allequalimage(index, false);
     182             :     BulkWriteState *bulkstate;
     183             :     BulkWriteBuffer metabuf;
     184             : 
     185         164 :     bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
     186             : 
     187             :     /* Construct metapage. */
     188         164 :     metabuf = smgr_bulk_get_buf(bulkstate);
     189         164 :     _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
     190         164 :     smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
     191             : 
     192         164 :     smgr_bulk_finish(bulkstate);
     193         164 : }
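/*
 * Context note (not part of nbtree.c): INIT_FORKNUM is the init fork of
 * an unlogged index.  After a crash, the init fork's contents replace
 * the main fork, so an empty-but-valid metapage is all that must be
 * written here; the bulk-write machinery (page_std=true) handles
 * checksums and any required WAL-logging.
 */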
     194             : 
     195             : /*
     196             :  *  btinsert() -- insert an index tuple into a btree.
     197             :  *
     198             :  *      Descend the tree recursively, find the appropriate location for our
     199             :  *      new tuple, and put it there.
     200             :  */
     201             : bool
     202     7213298 : btinsert(Relation rel, Datum *values, bool *isnull,
     203             :          ItemPointer ht_ctid, Relation heapRel,
     204             :          IndexUniqueCheck checkUnique,
     205             :          bool indexUnchanged,
     206             :          IndexInfo *indexInfo)
     207             : {
     208             :     bool        result;
     209             :     IndexTuple  itup;
     210             : 
     211             :     /* generate an index tuple */
     212     7213298 :     itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
     213     7213298 :     itup->t_tid = *ht_ctid;
     214             : 
     215     7213298 :     result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
     216             : 
     217     7212786 :     pfree(itup);
     218             : 
     219     7212786 :     return result;
     220             : }
     221             : 
     222             : /*
     223             :  *  btgettuple() -- Get the next tuple in the scan.
     224             :  */
     225             : bool
     226    34875996 : btgettuple(IndexScanDesc scan, ScanDirection dir)
     227             : {
     228    34875996 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     229             :     bool        res;
     230             : 
     231             :     /* btree indexes are never lossy */
     232    34875996 :     scan->xs_recheck = false;
     233             : 
     234             :     /* Each loop iteration performs another primitive index scan */
     235             :     do
     236             :     {
     237             :         /*
     238             :          * If we've already initialized this scan, we can just advance it in
     239             :          * the appropriate direction.  If we haven't done so yet, we call
     240             :          * _bt_first() to get the first item in the scan.
     241             :          */
     242    34892896 :         if (!BTScanPosIsValid(so->currPos))
     243    15390880 :             res = _bt_first(scan, dir);
     244             :         else
     245             :         {
     246             :             /*
     247             :              * Check to see if we should kill the previously-fetched tuple.
     248             :              */
     249    19502016 :             if (scan->kill_prior_tuple)
     250             :             {
     251             :                 /*
     252             :                  * Yes, remember it for later. (We'll deal with all such
     253             :                  * tuples at once right before leaving the index page.)  The
     254             :                  * test for numKilled overrun is not just paranoia: if the
     255             :                  * caller reverses direction in the indexscan then the same
     256             :                  * item might get entered multiple times. It's not worth
     257             :                  * trying to optimize that, so we don't detect it, but instead
     258             :                  * just forget any excess entries.
     259             :                  */
     260      489706 :                 if (so->killedItems == NULL)
     261      170344 :                     so->killedItems = (int *)
     262      170344 :                         palloc(MaxTIDsPerBTreePage * sizeof(int));
     263      489706 :                 if (so->numKilled < MaxTIDsPerBTreePage)
     264      489706 :                     so->killedItems[so->numKilled++] = so->currPos.itemIndex;
     265             :             }
     266             : 
     267             :             /*
     268             :              * Now continue the scan.
     269             :              */
     270    19502016 :             res = _bt_next(scan, dir);
     271             :         }
     272             : 
     273             :         /* If we have a tuple, return it ... */
     274    34892896 :         if (res)
     275    27907180 :             break;
     276             :         /* ... otherwise see if we need another primitive index scan */
     277     6985716 :     } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
     278             : 
     279    34875996 :     return res;
     280             : }
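/*
 * Context (not part of nbtree.c): btgettuple is normally reached through
 * the generic wrapper index_getnext_tid(), which dispatches through
 * amgettuple.  A hedged sketch of the caller's loop, with "scan" set up
 * elsewhere via index_beginscan/index_rescan:
 *
 *   while (index_getnext_tid(scan, ForwardScanDirection) != NULL)
 *   {
 *       scan->xs_heaptid now identifies the matching heap tuple;
 *       scan->kill_prior_tuple may be set before the next call
 *   }
 */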
     281             : 
     282             : /*
     283             :  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
     284             :  */
     285             : int64
     286       13552 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     287             : {
     288       13552 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     289       13552 :     int64       ntids = 0;
     290             :     ItemPointer heapTid;
     291             : 
     292             :     /* Each loop iteration performs another primitive index scan */
     293             :     do
     294             :     {
     295             :         /* Fetch the first page & tuple */
     296       14612 :         if (_bt_first(scan, ForwardScanDirection))
     297             :         {
     298             :             /* Save tuple ID, and continue scanning */
     299       10484 :             heapTid = &scan->xs_heaptid;
     300       10484 :             tbm_add_tuples(tbm, heapTid, 1, false);
     301       10484 :             ntids++;
     302             : 
     303             :             for (;;)
     304             :             {
     305             :                 /*
     306             :                  * Advance to next tuple within page.  This is the same as the
     307             :                  * easy case in _bt_next().
     308             :                  */
     309     2044622 :                 if (++so->currPos.itemIndex > so->currPos.lastItem)
     310             :                 {
     311             :                     /* let _bt_next do the heavy lifting */
     312       16162 :                     if (!_bt_next(scan, ForwardScanDirection))
     313       10484 :                         break;
     314             :                 }
     315             : 
     316             :                 /* Save tuple ID, and continue scanning */
     317     2034138 :                 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
     318     2034138 :                 tbm_add_tuples(tbm, heapTid, 1, false);
     319     2034138 :                 ntids++;
     320             :             }
     321             :         }
     322             :         /* Now see if we need another primitive index scan */
     323       14612 :     } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
     324             : 
     325       13552 :     return ntids;
     326             : }
     327             : 
     328             : /*
     329             :  *  btbeginscan() -- start a scan on a btree index
     330             :  */
     331             : IndexScanDesc
     332    14685860 : btbeginscan(Relation rel, int nkeys, int norderbys)
     333             : {
     334             :     IndexScanDesc scan;
     335             :     BTScanOpaque so;
     336             : 
     337             :     /* no order by operators allowed */
     338             :     Assert(norderbys == 0);
     339             : 
     340             :     /* get the scan */
     341    14685860 :     scan = RelationGetIndexScan(rel, nkeys, norderbys);
     342             : 
     343             :     /* allocate private workspace */
     344    14685860 :     so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
     345    14685860 :     BTScanPosInvalidate(so->currPos);
     346    14685860 :     BTScanPosInvalidate(so->markPos);
     347    14685860 :     if (scan->numberOfKeys > 0)
     348    14672714 :         so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
     349             :     else
     350       13146 :         so->keyData = NULL;
     351             : 
     352    14685860 :     so->skipScan = false;
     353    14685860 :     so->needPrimScan = false;
     354    14685860 :     so->scanBehind = false;
     355    14685860 :     so->oppositeDirCheck = false;
     356    14685860 :     so->arrayKeys = NULL;
     357    14685860 :     so->orderProcs = NULL;
     358    14685860 :     so->arrayContext = NULL;
     359             : 
     360    14685860 :     so->killedItems = NULL;      /* until needed */
     361    14685860 :     so->numKilled = 0;
     362             : 
     363             :     /*
     364             :      * We don't know yet whether the scan will be index-only, so we do not
     365             :      * allocate the tuple workspace arrays until btrescan.  However, we set up
     366             :      * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     367             :      */
     368    14685860 :     so->currTuples = so->markTuples = NULL;
     369             : 
     370    14685860 :     scan->xs_itupdesc = RelationGetDescr(rel);
     371             : 
     372    14685860 :     scan->opaque = so;
     373             : 
     374    14685860 :     return scan;
     375             : }
     376             : 
     377             : /*
     378             :  *  btrescan() -- rescan an index relation
     379             :  */
     380             : void
     381    15388814 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
     382             :          ScanKey orderbys, int norderbys)
     383             : {
     384    15388814 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     385             : 
     386             :     /* we aren't holding any read locks, but gotta drop the pins */
     387    15388814 :     if (BTScanPosIsValid(so->currPos))
     388             :     {
     389             :         /* Before leaving current page, deal with any killed items */
     390       93290 :         if (so->numKilled > 0)
     391        1286 :             _bt_killitems(scan);
     392       93290 :         BTScanPosUnpinIfPinned(so->currPos);
     393       93290 :         BTScanPosInvalidate(so->currPos);
     394             :     }
     395             : 
     396    15388814 :     so->markItemIndex = -1;
     397    15388814 :     so->needPrimScan = false;
     398    15388814 :     so->scanBehind = false;
     399    15388814 :     so->oppositeDirCheck = false;
     400    15388814 :     BTScanPosUnpinIfPinned(so->markPos);
     401    15388814 :     BTScanPosInvalidate(so->markPos);
     402             : 
     403             :     /*
     404             :      * Allocate tuple workspace arrays, if needed for an index-only scan and
     405             :      * not already done in a previous rescan call.  To save on palloc
     406             :      * overhead, both workspaces are allocated as one palloc block; only this
     407             :      * function and btendscan know that.
     408             :      *
     409             :      * NOTE: this data structure also makes it safe to return data from a
     410             :      * "name" column, even though btree name_ops uses an underlying storage
     411             :      * datatype of cstring.  The risk there is that "name" is supposed to be
     412             :      * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
     413             :      * However, since we only return data out of tuples sitting in the
     414             :      * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
     415             :      * data out of the markTuples array --- running off the end of memory for
     416             :      * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
     417             :      * adding special-case treatment for name_ops elsewhere.
     418             :      */
     419    15388814 :     if (scan->xs_want_itup && so->currTuples == NULL)
     420             :     {
     421      133320 :         so->currTuples = (char *) palloc(BLCKSZ * 2);
     422      133320 :         so->markTuples = so->currTuples + BLCKSZ;
     423             :     }
     424             : 
     425             :     /*
     426             :      * Reset the scan keys
     427             :      */
     428    15388814 :     if (scankey && scan->numberOfKeys > 0)
     429    15375474 :         memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
     430    15388814 :     so->numberOfKeys = 0;        /* until _bt_preprocess_keys sets it */
     431    15388814 :     so->numArrayKeys = 0;        /* ditto */
     432    15388814 : }
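/*
 * Illustration (not part of nbtree.c): the single-palloc layout set up
 * above, which only btrescan and btendscan know about:
 *
 *   so->currTuples                    so->markTuples
 *   |                                 |
 *   v                                 v
 *   +--------- BLCKSZ bytes ---------+--------- BLCKSZ bytes ---------+
 *   | copies of currPos index tuples | copies of markPos index tuples |
 *   +--------------------------------+--------------------------------+
 *
 * Hence btendscan frees so->currTuples only, never so->markTuples.
 */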
     433             : 
     434             : /*
     435             :  *  btendscan() -- close down a scan
     436             :  */
     437             : void
     438    14684310 : btendscan(IndexScanDesc scan)
     439             : {
     440    14684310 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     441             : 
     442             :     /* we aren't holding any read locks, but gotta drop the pins */
     443    14684310 :     if (BTScanPosIsValid(so->currPos))
     444             :     {
     445             :         /* Before leaving current page, deal with any killed items */
     446     8310762 :         if (so->numKilled > 0)
     447       91246 :             _bt_killitems(scan);
     448     8310762 :         BTScanPosUnpinIfPinned(so->currPos);
     449             :     }
     450             : 
     451    14684310 :     so->markItemIndex = -1;
     452    14684310 :     BTScanPosUnpinIfPinned(so->markPos);
     453             : 
     454             :     /* No need to invalidate positions, the RAM is about to be freed. */
     455             : 
     456             :     /* Release storage */
     457    14684310 :     if (so->keyData != NULL)
     458    14671194 :         pfree(so->keyData);
     459             :     /* so->arrayKeys and so->orderProcs are in arrayContext */
     460    14684310 :     if (so->arrayContext != NULL)
     461        4750 :         MemoryContextDelete(so->arrayContext);
     462    14684310 :     if (so->killedItems != NULL)
     463      170288 :         pfree(so->killedItems);
     464    14684310 :     if (so->currTuples != NULL)
     465      133276 :         pfree(so->currTuples);
     466             :     /* so->markTuples should not be pfree'd, see btrescan */
     467    14684310 :     pfree(so);
     468    14684310 : }
     469             : 
     470             : /*
     471             :  *  btmarkpos() -- save current scan position
     472             :  */
     473             : void
     474      130074 : btmarkpos(IndexScanDesc scan)
     475             : {
     476      130074 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     477             : 
     478             :     /* There may be an old mark with a pin (but no lock). */
     479      130074 :     BTScanPosUnpinIfPinned(so->markPos);
     480             : 
     481             :     /*
     482             :      * Just record the current itemIndex.  If we later step to next page
     483             :      * before releasing the marked position, _bt_steppage makes a full copy of
     484             :      * the currPos struct in markPos.  If (as often happens) the mark is moved
     485             :      * before we leave the page, we don't have to do that work.
     486             :      */
     487      130074 :     if (BTScanPosIsValid(so->currPos))
     488      130074 :         so->markItemIndex = so->currPos.itemIndex;
     489             :     else
     490             :     {
     491           0 :         BTScanPosInvalidate(so->markPos);
     492           0 :         so->markItemIndex = -1;
     493             :     }
     494      130074 : }
     495             : 
     496             : /*
     497             :  *  btrestrpos() -- restore scan to last saved position
     498             :  */
     499             : void
     500       54018 : btrestrpos(IndexScanDesc scan)
     501             : {
     502       54018 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     503             : 
     504       54018 :     if (so->markItemIndex >= 0)
     505             :     {
     506             :         /*
     507             :          * The scan has never moved to a new page since the last mark.  Just
     508             :          * restore the itemIndex.
     509             :          *
     510             :          * NB: In this case we can't count on anything in so->markPos to be
     511             :          * accurate.
     512             :          */
     513       53910 :         so->currPos.itemIndex = so->markItemIndex;
     514             :     }
     515             :     else
     516             :     {
     517             :         /*
     518             :          * The scan moved to a new page after last mark or restore, and we are
     519             :          * now restoring to the marked page.  We aren't holding any read
     520             :          * locks, but if we're still holding the pin for the current position,
     521             :          * we must drop it.
     522             :          */
     523         108 :         if (BTScanPosIsValid(so->currPos))
     524             :         {
     525             :             /* Before leaving current page, deal with any killed items */
     526         108 :             if (so->numKilled > 0)
     527           0 :                 _bt_killitems(scan);
     528         108 :             BTScanPosUnpinIfPinned(so->currPos);
     529             :         }
     530             : 
     531         108 :         if (BTScanPosIsValid(so->markPos))
     532             :         {
     533             :             /* bump pin on mark buffer for assignment to current buffer */
     534         108 :             if (BTScanPosIsPinned(so->markPos))
     535           0 :                 IncrBufferRefCount(so->markPos.buf);
     536         108 :             memcpy(&so->currPos, &so->markPos,
     537             :                    offsetof(BTScanPosData, items[1]) +
     538         108 :                    so->markPos.lastItem * sizeof(BTScanPosItem));
     539         108 :             if (so->currTuples)
     540           0 :                 memcpy(so->currTuples, so->markTuples,
     541           0 :                        so->markPos.nextTupleOffset);
     542             :             /* Reset the scan's array keys (see _bt_steppage for why) */
     543         108 :             if (so->numArrayKeys)
     544             :             {
     545           0 :                 _bt_start_array_keys(scan, so->currPos.dir);
     546           0 :                 so->needPrimScan = false;
     547             :             }
     548             :         }
     549             :         else
     550           0 :             BTScanPosInvalidate(so->currPos);
     551             :     }
     552       54018 : }
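/*
 * Context (not part of nbtree.c): mark/restore is driven by merge join
 * through the generic wrappers index_markpos() and index_restrpos(),
 * roughly:
 *
 *   index_markpos(scan);     invokes btmarkpos
 *   ... zero or more index_getnext_tid() calls ...
 *   index_restrpos(scan);    invokes btrestrpos; the scan resumes at
 *                            the marked item
 *
 * The markItemIndex >= 0 fast path above covers the common case where
 * the scan never stepped off the marked page between those calls.
 */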
     553             : 
     554             : /*
     555             :  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
     556             :  */
     557             : Size
     558          64 : btestimateparallelscan(Relation rel, int nkeys, int norderbys)
     559             : {
     560          64 :     int16       nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     561             :     Size        estnbtreeshared,
     562             :                 genericattrspace;
     563             : 
     564             :     /*
     565             :      * Pessimistically assume that every input scan key will be output with
     566             :      * its own SAOP array
     567             :      */
     568          64 :     estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
     569             :         sizeof(int) * nkeys;
     570             : 
     571             :     /* Single column indexes cannot possibly use a skip array */
     572          64 :     if (nkeyatts == 1)
     573          46 :         return estnbtreeshared;
     574             : 
     575             :     /*
     576             :      * Pessimistically assume that all attributes prior to the least
     577             :      * significant attribute require a skip array (and an associated key)
     578             :      */
     579          18 :     genericattrspace = datumEstimateSpace((Datum) 0, false, true,
     580             :                                           sizeof(Datum));
     581          36 :     for (int attnum = 1; attnum < nkeyatts; attnum++)
     582             :     {
     583             :         CompactAttribute *attr;
     584             : 
     585             :         /*
     586             :          * We make the conservative assumption that every index column will
     587             :          * also require a skip array.
     588             :          *
     589             :          * Every skip array must have space to store its scan key's sk_flags.
     590             :          */
     591          18 :         estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
     592             : 
     593             :         /* Consider space required to store a datum of opclass input type */
     594          18 :         attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
     595          18 :         if (attr->attbyval)
     596             :         {
     597             :             /* This index attribute stores pass-by-value datums */
     598          18 :             Size        estfixed = datumEstimateSpace((Datum) 0, false,
     599          18 :                                                       true, attr->attlen);
     600             : 
     601          18 :             estnbtreeshared = add_size(estnbtreeshared, estfixed);
     602          18 :             continue;
     603             :         }
     604             : 
     605             :         /*
     606             :          * This index attribute stores pass-by-reference datums.
     607             :          *
     608             :          * Assume that serializing this array will use just as much space as a
     609             :          * pass-by-value datum, in addition to space for the largest possible
     610             :          * whole index tuple (this is not just a per-datum portion of the
     611             :          * largest possible tuple because that'd be almost as large anyway).
     612             :          *
     613             :          * This is quite conservative, but it's not clear how we could do much
     614             :          * better.  The executor requires an up-front storage request size
     615             :          * that reliably covers the scan's high watermark memory usage.  We
     616             :          * can't be sure of the real high watermark until the scan is over.
     617             :          */
     618           0 :         estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
     619           0 :         estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
     620             :     }
     621             : 
     622          18 :     return estnbtreeshared;
     623             : }
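/*
 * Worked example (illustrative; struct offsets are platform-dependent):
 * for a single-column index scanned with nkeys = 2, the code above
 * returns
 *
 *   offsetof(BTParallelScanDescData, btps_arrElems) + 2 * sizeof(int)
 *
 * i.e. the fixed struct plus one int slot per possible SAOP array.
 * Only multicolumn indexes also pay the per-attribute skip-array
 * charge: sizeof(int) for sk_flags plus the estimated datum space.
 */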
     624             : 
     625             : /*
     626             :  * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
     627             :  *
     628             :  * Caller must have exclusively locked btscan->btps_lock when called.
     629             :  */
     630             : static void
     631          36 : _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
     632             :                               BTScanOpaque so)
     633             : {
     634             :     char       *datumshared;
     635             : 
     636             :     /* Space for serialized datums begins immediately after btps_arrElems[] */
     637          36 :     datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
     638          72 :     for (int i = 0; i < so->numArrayKeys; i++)
     639             :     {
     640          36 :         BTArrayKeyInfo *array = &so->arrayKeys[i];
     641          36 :         ScanKey     skey = &so->keyData[array->scan_key];
     642             : 
     643          36 :         if (array->num_elems != -1)
     644             :         {
     645             :             /* Save SAOP array's cur_elem (no need to copy key/datum) */
     646             :             Assert(!(skey->sk_flags & SK_BT_SKIP));
     647          36 :             btscan->btps_arrElems[i] = array->cur_elem;
     648          36 :             continue;
     649             :         }
     650             : 
     651             :         /* Save all mutable state associated with skip array's key */
     652             :         Assert(skey->sk_flags & SK_BT_SKIP);
     653           0 :         memcpy(datumshared, &skey->sk_flags, sizeof(int));
     654           0 :         datumshared += sizeof(int);
     655             : 
     656           0 :         if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
     657             :         {
     658             :             /* No sk_argument datum to serialize */
     659             :             Assert(skey->sk_argument == 0);
     660           0 :             continue;
     661             :         }
     662             : 
     663           0 :         datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
     664           0 :                        array->attbyval, array->attlen, &datumshared);
     665             :     }
     666          36 : }
     667             : 
     668             : /*
     669             :  * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
     670             :  *
     671             :  * Caller must have exclusively locked btscan->btps_lock when called.
     672             :  */
     673             : static void
     674          36 : _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
     675             :                             BTScanOpaque so)
     676             : {
     677             :     char       *datumshared;
     678             : 
     679             :     /* Space for serialized datums begins immediately after btps_arrElems[] */
     680          36 :     datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
     681          72 :     for (int i = 0; i < so->numArrayKeys; i++)
     682             :     {
     683          36 :         BTArrayKeyInfo *array = &so->arrayKeys[i];
     684          36 :         ScanKey     skey = &so->keyData[array->scan_key];
     685             :         bool        isnull;
     686             : 
     687          36 :         if (array->num_elems != -1)
     688             :         {
     689             :             /* Restore SAOP array using its saved cur_elem */
     690             :             Assert(!(skey->sk_flags & SK_BT_SKIP));
     691          36 :             array->cur_elem = btscan->btps_arrElems[i];
     692          36 :             skey->sk_argument = array->elem_values[array->cur_elem];
     693          36 :             continue;
     694             :         }
     695             : 
     696             :         /* Restore skip array by restoring its key directly */
     697           0 :         if (!array->attbyval && skey->sk_argument)
     698           0 :             pfree(DatumGetPointer(skey->sk_argument));
     699           0 :         skey->sk_argument = (Datum) 0;
     700           0 :         memcpy(&skey->sk_flags, datumshared, sizeof(int));
     701           0 :         datumshared += sizeof(int);
     702             : 
     703             :         Assert(skey->sk_flags & SK_BT_SKIP);
     704             : 
     705           0 :         if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
     706             :         {
     707             :             /* No sk_argument datum to restore */
     708           0 :             continue;
     709             :         }
     710             : 
     711           0 :         skey->sk_argument = datumRestore(&datumshared, &isnull);
     712           0 :         if (isnull)
     713             :         {
     714             :             Assert(skey->sk_argument == 0);
     715             :             Assert(skey->sk_flags & SK_SEARCHNULL);
     716             :             Assert(skey->sk_flags & SK_ISNULL);
     717             :         }
     718             :     }
     719          36 : }
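/*
 * Note (illustration): the restore function reads back exactly what
 * _bt_parallel_serialize_arrays wrote, in the same order -- per SAOP
 * array just the saved cur_elem, and per skip array an int sk_flags
 * followed by a datumRestore()d sk_argument, except when
 * SK_BT_MINVAL/SK_BT_MAXVAL means there was no datum to save at all.
 */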
     720             : 
     721             : /*
     722             :  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
     723             :  */
     724             : void
     725          64 : btinitparallelscan(void *target)
     726             : {
     727          64 :     BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
     728             : 
     729          64 :     LWLockInitialize(&bt_target->btps_lock,
     730             :                      LWTRANCHE_PARALLEL_BTREE_SCAN);
     731          64 :     bt_target->btps_nextScanPage = InvalidBlockNumber;
     732          64 :     bt_target->btps_lastCurrPage = InvalidBlockNumber;
     733          64 :     bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     734          64 :     ConditionVariableInit(&bt_target->btps_cv);
     735          64 : }
     736             : 
     737             : /*
     738             :  *  btparallelrescan() -- reset parallel scan
     739             :  */
     740             : void
     741          24 : btparallelrescan(IndexScanDesc scan)
     742             : {
     743             :     BTParallelScanDesc btscan;
     744          24 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     745             : 
     746             :     Assert(parallel_scan);
     747             : 
     748          24 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
     749             :                                                   parallel_scan->ps_offset_am);
     750             : 
     751             :     /*
     752             :      * In theory, we don't need to acquire the LWLock here, because there
     753             :      * shouldn't be any other workers running at this point, but we do so for
     754             :      * consistency.
     755             :      */
     756          24 :     LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
     757          24 :     btscan->btps_nextScanPage = InvalidBlockNumber;
     758          24 :     btscan->btps_lastCurrPage = InvalidBlockNumber;
     759          24 :     btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     760          24 :     LWLockRelease(&btscan->btps_lock);
     761          24 : }
     762             : 
     763             : /*
     764             :  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
     765             :  *      page.  Other scans must wait until we call _bt_parallel_release()
     766             :  *      or _bt_parallel_done().
     767             :  *
     768             :  * The return value is true if we successfully seized the scan and false
     769             :  * if we did not.  The latter case occurs when no pages remain, or when
     770             :  * another primitive index scan is scheduled that caller's backend cannot
     771             :  * start just yet (only backends that call from _bt_first are capable of
     772             :  * starting primitive index scans, which they indicate by passing first=true).
     773             :  *
     774             :  * If the return value is true, *next_scan_page returns the next page of the
     775             :  * scan, and *last_curr_page returns the page that *next_scan_page came from.
     776             :  * An invalid *next_scan_page means the scan hasn't yet started, or that
     777             :  * caller needs to start the next primitive index scan (if it's the latter
      778             :  * case we'll set so->needPrimScan).
     779             :  *
     780             :  * Callers should ignore the value of *next_scan_page and *last_curr_page if
     781             :  * the return value is false.
     782             :  */
     783             : bool
     784        1658 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
     785             :                    BlockNumber *last_curr_page, bool first)
     786             : {
     787        1658 :     Relation    rel = scan->indexRelation;
     788        1658 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     789        1658 :     bool        exit_loop = false,
     790        1658 :                 status = true,
     791        1658 :                 endscan = false;
     792        1658 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     793             :     BTParallelScanDesc btscan;
     794             : 
     795        1658 :     *next_scan_page = InvalidBlockNumber;
     796        1658 :     *last_curr_page = InvalidBlockNumber;
     797             : 
     798             :     /*
     799             :      * Reset so->currPos, and initialize moreLeft/moreRight such that the next
     800             :      * call to _bt_readnextpage treats this backend similarly to a serial
     801             :      * backend that steps from *last_curr_page to *next_scan_page (unless this
     802             :      * backend's so->currPos is initialized by _bt_readfirstpage before then).
     803             :      */
     804        1658 :     BTScanPosInvalidate(so->currPos);
     805        1658 :     so->currPos.moreLeft = so->currPos.moreRight = true;
     806             : 
     807        1658 :     if (first)
     808             :     {
     809             :         /*
     810             :          * Initialize array related state when called from _bt_first, assuming
     811             :          * that this will be the first primitive index scan for the scan
     812             :          */
     813         446 :         so->needPrimScan = false;
     814         446 :         so->scanBehind = false;
     815         446 :         so->oppositeDirCheck = false;
     816             :     }
     817             :     else
     818             :     {
     819             :         /*
     820             :          * Don't attempt to seize the scan when it requires another primitive
     821             :          * index scan, since caller's backend cannot start it right now
     822             :          */
     823        1212 :         if (so->needPrimScan)
     824           0 :             return false;
     825             :     }
     826             : 
     827        1658 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
     828             :                                                   parallel_scan->ps_offset_am);
     829             : 
     830             :     while (1)
     831             :     {
     832        1658 :         LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
     833             : 
     834        1658 :         if (btscan->btps_pageStatus == BTPARALLEL_DONE)
     835             :         {
     836             :             /* We're done with this parallel index scan */
     837         314 :             status = false;
     838             :         }
     839        1344 :         else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
     840        1220 :                  btscan->btps_nextScanPage == P_NONE)
     841             :         {
     842             :             /* End this parallel index scan */
     843           8 :             status = false;
     844           8 :             endscan = true;
     845             :         }
     846        1336 :         else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
     847             :         {
     848             :             Assert(so->numArrayKeys);
     849             : 
     850          36 :             if (first)
     851             :             {
     852             :                 /* Can start scheduled primitive scan right away, so do so */
     853          36 :                 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
     854             : 
     855             :                 /* Restore scan's array keys from serialized values */
     856          36 :                 _bt_parallel_restore_arrays(rel, btscan, so);
     857          36 :                 exit_loop = true;
     858             :             }
     859             :             else
     860             :             {
     861             :                 /*
     862             :                  * Don't attempt to seize the scan when it requires another
     863             :                  * primitive index scan, since caller's backend cannot start
     864             :                  * it right now
     865             :                  */
     866           0 :                 status = false;
     867             :             }
     868             : 
     869             :             /*
     870             :              * Either way, update backend local state to indicate that a
     871             :              * pending primitive scan is required
     872             :              */
     873          36 :             so->needPrimScan = true;
     874          36 :             so->scanBehind = false;
     875          36 :             so->oppositeDirCheck = false;
     876             :         }
     877        1300 :         else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
     878             :         {
     879             :             /*
     880             :              * We have successfully seized control of the scan for the purpose
     881             :              * of advancing it to a new page!
     882             :              */
     883        1300 :             btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
     884             :             Assert(btscan->btps_nextScanPage != P_NONE);
     885        1300 :             *next_scan_page = btscan->btps_nextScanPage;
     886        1300 :             *last_curr_page = btscan->btps_lastCurrPage;
     887        1300 :             exit_loop = true;
     888             :         }
     889        1658 :         LWLockRelease(&btscan->btps_lock);
     890        1658 :         if (exit_loop || !status)
     891             :             break;
     892           0 :         ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
     893             :     }
     894        1658 :     ConditionVariableCancelSleep();
     895             : 
     896             :     /* When the scan has reached the rightmost (or leftmost) page, end it */
     897        1658 :     if (endscan)
     898           8 :         _bt_parallel_done(scan);
     899             : 
     900        1658 :     return status;
     901             : }
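/*
 * A hedged sketch of the seize/release cycle (the real callers live in
 * nbtsearch.c; page handling is simplified here):
 *
 *   BlockNumber next_scan_page,
 *               last_curr_page;
 *
 *   while (_bt_parallel_seize(scan, &next_scan_page, &last_curr_page,
 *                             false))
 *   {
 *       read the page at next_scan_page, note its sibling link, then:
 *       _bt_parallel_release(scan, sibling_link, next_scan_page);
 *       ... process the page's matching tuples ...
 *   }
 *   a false return means the scan is done, or so->needPrimScan was set
 *   and only a backend calling from _bt_first may restart it
 */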
     902             : 
     903             : /*
     904             :  * _bt_parallel_release() -- Complete the process of advancing the scan to a
     905             :  *      new page.  We now have the new value btps_nextScanPage; another backend
     906             :  *      can now begin advancing the scan.
     907             :  *
     908             :  * Callers whose scan uses array keys must save their curr_page argument so
     909             :  * that it can be passed to _bt_parallel_primscan_schedule, should caller
     910             :  * determine that another primitive index scan is required.
     911             :  *
     912             :  * If caller's next_scan_page is P_NONE, the scan has reached the index's
     913             :  * rightmost/leftmost page.  This is treated as reaching the end of the scan
     914             :  * within _bt_parallel_seize.
     915             :  *
     916             :  * Note: unlike the serial case, parallel scans don't need to remember both
     917             :  * sibling links.  next_scan_page is whichever link is next given the scan's
     918             :  * direction.  That's all we'll ever need, since the direction of a parallel
     919             :  * scan can never change.
     920             :  */
     921             : void
     922        1336 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
     923             :                      BlockNumber curr_page)
     924             : {
     925        1336 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     926             :     BTParallelScanDesc btscan;
     927             : 
     928             :     Assert(BlockNumberIsValid(next_scan_page));
     929             : 
     930        1336 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
     931             :                                                   parallel_scan->ps_offset_am);
     932             : 
     933        1336 :     LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
     934        1336 :     btscan->btps_nextScanPage = next_scan_page;
     935        1336 :     btscan->btps_lastCurrPage = curr_page;
     936        1336 :     btscan->btps_pageStatus = BTPARALLEL_IDLE;
     937        1336 :     LWLockRelease(&btscan->btps_lock);
     938        1336 :     ConditionVariableSignal(&btscan->btps_cv);
     939        1336 : }
     940             : 
     941             : /*
     942             :  * _bt_parallel_done() -- Mark the parallel scan as complete.
     943             :  *
     944             :  * When there are no pages left to scan, this function should be called to
     945             :  * notify other workers.  Otherwise, they might wait forever for the scan to
     946             :  * advance to the next page.
     947             :  */
     948             : void
     949     7000044 : _bt_parallel_done(IndexScanDesc scan)
     950             : {
     951     7000044 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     952     7000044 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     953             :     BTParallelScanDesc btscan;
     954     7000044 :     bool        status_changed = false;
     955             : 
     956             :     Assert(!BTScanPosIsValid(so->currPos));
     957             : 
     958             :     /* Do nothing, for non-parallel scans */
     959     7000044 :     if (parallel_scan == NULL)
     960     6999888 :         return;
     961             : 
     962             :     /*
     963             :      * Should not mark parallel scan done when there's still a pending
     964             :      * primitive index scan
     965             :      */
     966         156 :     if (so->needPrimScan)
     967          36 :         return;
     968             : 
     969         120 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
     970             :                                                   parallel_scan->ps_offset_am);
     971             : 
     972             :     /*
     973             :      * Mark the parallel scan as done, unless some other process did so
     974             :      * already
     975             :      */
     976         120 :     LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
     977             :     Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
     978         120 :     if (btscan->btps_pageStatus != BTPARALLEL_DONE)
     979             :     {
     980          88 :         btscan->btps_pageStatus = BTPARALLEL_DONE;
     981          88 :         status_changed = true;
     982             :     }
     983         120 :     LWLockRelease(&btscan->btps_lock);
     984             : 
     985             :     /* wake up all the workers associated with this parallel scan */
     986         120 :     if (status_changed)
     987          88 :         ConditionVariableBroadcast(&btscan->btps_cv);
     988             : }
     989             : 
     990             : /*
     991             :  * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
     992             :  *
     993             :  * Caller passes the curr_page most recently passed to _bt_parallel_release
     994             :  * by its backend.  Caller successfully schedules the next primitive index scan
     995             :  * if the shared parallel state hasn't been seized since caller's backend last
     996             :  * advanced the scan.
     997             :  */
     998             : void
     999          36 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
    1000             : {
    1001          36 :     Relation    rel = scan->indexRelation;
    1002          36 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1003          36 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    1004             :     BTParallelScanDesc btscan;
    1005             : 
    1006             :     Assert(so->numArrayKeys);
    1007             : 
    1008          36 :     btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
    1009             :                                                   parallel_scan->ps_offset_am);
    1010             : 
    1011          36 :     LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    1012          36 :     if (btscan->btps_lastCurrPage == curr_page &&
    1013          36 :         btscan->btps_pageStatus == BTPARALLEL_IDLE)
    1014             :     {
    1015          36 :         btscan->btps_nextScanPage = InvalidBlockNumber;
    1016          36 :         btscan->btps_lastCurrPage = InvalidBlockNumber;
    1017          36 :         btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
    1018             : 
    1019             :         /* Serialize scan's current array keys */
    1020          36 :         _bt_parallel_serialize_arrays(rel, btscan, so);
    1021             :     }
    1022          36 :     LWLockRelease(&btscan->btps_lock);
    1023          36 : }
    1024             : 
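/*
 * Side note: the guarded update in _bt_parallel_primscan_schedule is an
 * optimistic "schedule only if nothing moved" check.  A hedged sketch of
 * just that check follows; PrimScanShared and try_schedule_primscan are
 * illustration-only names.
 */
#include <pthread.h>
#include <stdbool.h>

typedef struct PrimScanShared
{
    pthread_mutex_t lock;
    unsigned int last_curr_page;    /* page most recently released */
    bool        idle;               /* no backend currently advancing scan */
    bool        need_primscan;
} PrimScanShared;

static void
try_schedule_primscan(PrimScanShared *s, unsigned int my_curr_page)
{
    pthread_mutex_lock(&s->lock);
    if (s->last_curr_page == my_curr_page && s->idle)
    {
        /* state unseized since we last advanced the scan: request wins */
        s->idle = false;
        s->need_primscan = true;
    }
    pthread_mutex_unlock(&s->lock);
    /* otherwise the request is dropped silently, as in the real code */
}
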
    1025             : /*
    1026             :  * Bulk deletion of all index entries pointing to a set of heap tuples.
    1027             :  * The set of target tuples is specified via a callback routine that tells
    1028             :  * whether any given heap tuple (identified by ItemPointer) is being deleted.
    1029             :  *
    1030             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
    1031             :  */
    1032             : IndexBulkDeleteResult *
    1033        2774 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
    1034             :              IndexBulkDeleteCallback callback, void *callback_state)
    1035             : {
    1036        2774 :     Relation    rel = info->index;
    1037             :     BTCycleId   cycleid;
    1038             : 
    1039             :     /* allocate stats if first time through, else re-use existing struct */
    1040        2774 :     if (stats == NULL)
    1041        2774 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    1042             : 
    1043             :     /* Establish the vacuum cycle ID to use for this scan */
    1044             :     /* The ENSURE stuff ensures we clean up shared memory on failure */
    1045        2774 :     PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    1046             :     {
    1047        2774 :         cycleid = _bt_start_vacuum(rel);
    1048             : 
    1049        2774 :         btvacuumscan(info, stats, callback, callback_state, cycleid);
    1050             :     }
    1051        2774 :     PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    1052        2774 :     _bt_end_vacuum(rel);
    1053             : 
    1054        2774 :     return stats;
    1055             : }
    1056             : 
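/*
 * Side note: PG_ENSURE_ERROR_CLEANUP/PG_END_ENSURE_ERROR_CLEANUP arrange
 * for the same teardown (_bt_end_vacuum here) to run whether the guarded
 * block exits normally or via error.  A rough, self-contained analogue
 * built on setjmp/longjmp; guarded_vacuum, do_scan, and cleanup are
 * hypothetical, and real elog/ereport handling is far more involved.
 */
#include <setjmp.h>

static jmp_buf vacuum_error_jmp;

static void
guarded_vacuum(void (*do_scan) (void), void (*cleanup) (void))
{
    if (setjmp(vacuum_error_jmp) == 0)
    {
        do_scan();              /* may longjmp(vacuum_error_jmp, 1) */
        cleanup();              /* normal exit */
    }
    else
    {
        cleanup();              /* error exit: same teardown runs, then
                                 * the error would be re-raised */
    }
}
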
    1057             : /*
    1058             :  * Post-VACUUM cleanup.
    1059             :  *
    1060             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
    1061             :  */
    1062             : IndexBulkDeleteResult *
    1063      172792 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
    1064             : {
    1065             :     BlockNumber num_delpages;
    1066             : 
    1067             :     /* No-op in ANALYZE ONLY mode */
    1068      172792 :     if (info->analyze_only)
    1069       17320 :         return stats;
    1070             : 
    1071             :     /*
    1072             :      * If btbulkdelete was called, we need not do anything (we just maintain
    1073             :      * the information used within _bt_vacuum_needs_cleanup() by calling
    1074             :      * _bt_set_cleanup_info() below).
    1075             :      *
    1076             :      * If btbulkdelete was _not_ called, then we have a choice to make: we
    1077             :      * must decide whether or not a btvacuumscan() call is needed now (i.e.
    1078             :      * whether the ongoing VACUUM operation can entirely avoid a physical scan
    1079             :      * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
    1080             :      * now.
    1081             :      */
    1082      155472 :     if (stats == NULL)
    1083             :     {
    1084             :         /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
    1085      153310 :         if (!_bt_vacuum_needs_cleanup(info->index))
    1086      153294 :             return NULL;
    1087             : 
    1088             :         /*
    1089             :          * Since we aren't going to actually delete any leaf items, there's no
    1090             :          * need to go through all the vacuum-cycle-ID pushups here.
    1091             :          *
    1092             :          * Posting list tuples are a source of inaccuracy for cleanup-only
    1093             :          * scans.  btvacuumscan() will assume that the number of index tuples
    1094             :          * from each page can be used as num_index_tuples, even though
    1095             :          * num_index_tuples is supposed to represent the number of TIDs in the
    1096             :          * index.  This naive approach can underestimate the number of tuples
    1097             :          * in the index significantly.
    1098             :          *
    1099             :          * We handle the problem by making num_index_tuples an estimate in
    1100             :          * cleanup-only case.
    1101             :          */
    1102          16 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    1103          16 :         btvacuumscan(info, stats, NULL, NULL, 0);
    1104          16 :         stats->estimated_count = true;
    1105             :     }
    1106             : 
    1107             :     /*
    1108             :      * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
    1109             :      *
    1110             :      * num_delpages is the number of deleted pages now in the index that were
    1111             :      * not safe to place in the FSM to be recycled just yet.  num_delpages is
    1112             :      * greater than 0 only when _bt_pagedel() actually deleted pages during
    1113             :      * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
    1114             :      * have failed to place any newly deleted pages in the FSM just moments
    1115             :      * ago.  (Actually, there are edge cases where recycling of the current
    1116             :      * VACUUM's newly deleted pages does not even become safe by the time the
    1117             :      * next VACUUM comes around.  See nbtree/README.)
    1118             :      */
    1119             :     Assert(stats->pages_deleted >= stats->pages_free);
    1120        2178 :     num_delpages = stats->pages_deleted - stats->pages_free;
    1121        2178 :     _bt_set_cleanup_info(info->index, num_delpages);
    1122             : 
    1123             :     /*
    1124             :      * It's quite possible for us to be fooled by concurrent page splits into
    1125             :      * double-counting some index tuples, so disbelieve any total that exceeds
    1126             :      * the underlying heap's count ... if we know that accurately.  Otherwise
    1127             :      * this might just make matters worse.
    1128             :      */
    1129        2178 :     if (!info->estimated_count)
    1130             :     {
    1131        2132 :         if (stats->num_index_tuples > info->num_heap_tuples)
    1132           0 :             stats->num_index_tuples = info->num_heap_tuples;
    1133             :     }
    1134             : 
    1135        2178 :     return stats;
    1136             : }
    1137             : 
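/*
 * Side note: a worked example of the bookkeeping above, using made-up
 * numbers.  With pages_deleted = 10 and pages_free = 7, three deleted
 * pages could not be returned to the FSM yet, so num_delpages = 3 is
 * stored in the metapage for the next _bt_vacuum_needs_cleanup() check;
 * and an accurately counted heap caps the index tuple total.
 */
#include <assert.h>

static void
cleanup_bookkeeping_example(void)
{
    unsigned    pages_deleted = 10;
    unsigned    pages_free = 7;
    double      num_index_tuples = 1050.0;  /* inflated by concurrent splits */
    double      num_heap_tuples = 1000.0;   /* known accurately */

    assert(pages_deleted >= pages_free);
    unsigned    num_delpages = pages_deleted - pages_free;  /* 3 */

    /* disbelieve any index total that exceeds the trusted heap count */
    if (num_index_tuples > num_heap_tuples)
        num_index_tuples = num_heap_tuples;                 /* 1000.0 */

    (void) num_delpages;
    (void) num_index_tuples;
}
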
    1138             : /*
    1139             :  * btvacuumscan --- scan the index for VACUUMing purposes
    1140             :  *
    1141             :  * This combines the functions of looking for leaf tuples that are deletable
    1142             :  * according to the vacuum callback, looking for empty pages that can be
    1143             :  * deleted, and looking for old deleted pages that can be recycled.  Both
    1144             :  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
    1145             :  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
    1146             :  *
    1147             :  * The caller is responsible for initially allocating/zeroing a stats struct
    1148             :  * and for obtaining a vacuum cycle ID if necessary.
    1149             :  */
    1150             : static void
    1151        2790 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
    1152             :              IndexBulkDeleteCallback callback, void *callback_state,
    1153             :              BTCycleId cycleid)
    1154             : {
    1155        2790 :     Relation    rel = info->index;
    1156             :     BTVacState  vstate;
    1157             :     BlockNumber num_pages;
    1158             :     bool        needLock;
    1159             :     BlockRangeReadStreamPrivate p;
    1160        2790 :     ReadStream *stream = NULL;
    1161             : 
    1162             :     /*
    1163             :      * Reset fields that track information about the entire index now.  This
    1164             :      * avoids double-counting in the case where a single VACUUM command
    1165             :      * requires multiple scans of the index.
    1166             :      *
    1167             :      * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
    1168             :      * since they track information about the VACUUM command, and so must last
    1169             :      * across each call to btvacuumscan().
    1170             :      *
    1171             :      * (Note that pages_free is treated as state about the whole index, not
    1172             :      * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
    1173             :      * calls are idempotent, and get repeated for the same deleted pages in
    1174             :      * some scenarios.  The point for us is to track the number of recyclable
    1175             :      * pages in the index at the end of the VACUUM command.)
    1176             :      */
    1177        2790 :     stats->num_pages = 0;
    1178        2790 :     stats->num_index_tuples = 0;
    1179        2790 :     stats->pages_deleted = 0;
    1180        2790 :     stats->pages_free = 0;
    1181             : 
    1182             :     /* Set up info to pass down to btvacuumpage */
    1183        2790 :     vstate.info = info;
    1184        2790 :     vstate.stats = stats;
    1185        2790 :     vstate.callback = callback;
    1186        2790 :     vstate.callback_state = callback_state;
    1187        2790 :     vstate.cycleid = cycleid;
    1188             : 
    1189             :     /* Create a temporary memory context to run _bt_pagedel in */
    1190        2790 :     vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
    1191             :                                                   "_bt_pagedel",
    1192             :                                                   ALLOCSET_DEFAULT_SIZES);
    1193             : 
    1194             :     /* Initialize vstate fields used by _bt_pendingfsm_finalize */
    1195        2790 :     vstate.bufsize = 0;
    1196        2790 :     vstate.maxbufsize = 0;
    1197        2790 :     vstate.pendingpages = NULL;
    1198        2790 :     vstate.npendingpages = 0;
    1199             :     /* Consider applying _bt_pendingfsm_finalize optimization */
    1200        2790 :     _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
    1201             : 
    1202             :     /*
    1203             :      * The outer loop iterates over all index pages except the metapage, in
    1204             :      * physical order (we hope the kernel will cooperate in providing
    1205             :      * read-ahead for speed).  It is critical that we visit all leaf pages,
    1206             :      * including ones added after we start the scan, else we might fail to
    1207             :      * delete some deletable tuples.  Hence, we must repeatedly check the
    1208             :      * relation length.  We must acquire the relation-extension lock while
    1209             :      * doing so to avoid a race condition: if someone else is extending the
    1210             :      * relation, there is a window where bufmgr/smgr have created a new
    1211             :      * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
    1212             :      * we manage to scan such a page here, we'll improperly assume it can be
    1213             :      * recycled.  Taking the lock synchronizes things enough to prevent a
    1214             :      * problem: either num_pages won't include the new page, or _bt_getbuf
    1215             :      * already has write lock on the buffer and it will be fully initialized
    1216             :      * before we can examine it.  Also, we need not worry if a page is added
    1217             :      * immediately after we look; the page splitting code already has
    1218             :      * write-lock on the left page before it adds a right page, so we must
    1219             :      * already have processed any tuples due to be moved into such a page.
    1220             :      *
    1221             :      * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
    1222             :      * think the use of the extension lock is still required.
    1223             :      *
    1224             :      * We can skip locking for new or temp relations, however, since no one
    1225             :      * else could be accessing them.
    1226             :      */
    1227        2790 :     needLock = !RELATION_IS_LOCAL(rel);
    1228             : 
    1229        2790 :     p.current_blocknum = BTREE_METAPAGE + 1;
    1230             : 
    1231             :     /*
    1232             :      * It is safe to use batchmode as block_range_read_stream_cb takes no
    1233             :      * locks.
    1234             :      */
    1235        2790 :     stream = read_stream_begin_relation(READ_STREAM_FULL |
    1236             :                                         READ_STREAM_USE_BATCHING,
    1237             :                                         info->strategy,
    1238             :                                         rel,
    1239             :                                         MAIN_FORKNUM,
    1240             :                                         block_range_read_stream_cb,
    1241             :                                         &p,
    1242             :                                         0);
    1243             :     for (;;)
    1244             :     {
    1245             :         /* Get the current relation length */
    1246        5306 :         if (needLock)
    1247        5302 :             LockRelationForExtension(rel, ExclusiveLock);
    1248        5306 :         num_pages = RelationGetNumberOfBlocks(rel);
    1249        5306 :         if (needLock)
    1250        5302 :             UnlockRelationForExtension(rel, ExclusiveLock);
    1251             : 
    1252        5306 :         if (info->report_progress)
    1253         950 :             pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
    1254             :                                          num_pages);
    1255             : 
    1256             :         /* Quit if we've scanned the whole relation */
    1257        5306 :         if (p.current_blocknum >= num_pages)
    1258        2790 :             break;
    1259             : 
    1260        2516 :         p.last_exclusive = num_pages;
    1261             : 
    1262             :         /* Iterate over pages, then loop back to recheck relation length */
    1263             :         while (true)
    1264       23426 :         {
    1265             :             BlockNumber current_block;
    1266             :             Buffer      buf;
    1267             : 
    1268             :             /* call vacuum_delay_point while not holding any buffer lock */
    1269       25942 :             vacuum_delay_point(false);
    1270             : 
    1271       25942 :             buf = read_stream_next_buffer(stream, NULL);
    1272             : 
    1273       25942 :             if (!BufferIsValid(buf))
    1274        2516 :                 break;
    1275             : 
    1276       23426 :             current_block = btvacuumpage(&vstate, buf);
    1277             : 
    1278       23426 :             if (info->report_progress)
    1279         980 :                 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
    1280             :                                              current_block);
    1281             :         }
    1282             : 
    1283             :         /*
    1284             :          * We have to reset the read stream to use it again. After returning
    1285             :          * InvalidBuffer, the read stream API won't invoke our callback again
    1286             :          * until the stream has been reset.
    1287             :          */
    1288        2516 :         read_stream_reset(stream);
    1289             :     }
    1290             : 
    1291        2790 :     read_stream_end(stream);
    1292             : 
    1293             :     /* Set statistics num_pages field to final size of index */
    1294        2790 :     stats->num_pages = num_pages;
    1295             : 
    1296        2790 :     MemoryContextDelete(vstate.pagedelcontext);
    1297             : 
    1298             :     /*
    1299             :      * If there were any calls to _bt_pagedel() during scan of the index then
    1300             :      * see if any of the resulting pages can be placed in the FSM now.  When
    1301             :      * it's not safe we'll have to leave it up to a future VACUUM operation.
    1302             :      *
    1303             :      * Finally, if we placed any pages in the FSM (either just now or during
    1304             :      * the scan), forcibly update the upper-level FSM pages to ensure that
    1305             :      * searchers can find them.
    1306             :      */
    1307        2790 :     _bt_pendingfsm_finalize(rel, &vstate);
    1308        2790 :     if (stats->pages_free > 0)
    1309          30 :         IndexFreeSpaceMapVacuum(rel);
    1310        2790 : }
    1311             : 
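/*
 * Side note: stripped of the read stream and extension-lock machinery,
 * btvacuumscan's outer loop reduces to "scan in physical order, then
 * re-check the relation length until it stops growing".  A minimal sketch
 * follows; get_num_blocks and vacuum_one_block are hypothetical stand-ins.
 */
typedef unsigned int blockno_t;

extern blockno_t get_num_blocks(void);
extern void vacuum_one_block(blockno_t blkno);

static void
physical_order_scan(void)
{
    blockno_t   next = 1;       /* skip block 0, the metapage */

    for (;;)
    {
        blockno_t   num_blocks = get_num_blocks();  /* index may have grown */

        if (next >= num_blocks)
            break;              /* nothing new appeared: we are done */
        for (; next < num_blocks; next++)
            vacuum_one_block(next);     /* then loop back and re-check */
    }
}
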
    1312             : /*
    1313             :  * btvacuumpage --- VACUUM one page
    1314             :  *
    1315             :  * This processes a single page for btvacuumscan().  In some cases we must
    1316             :  * backtrack to re-examine and VACUUM pages holding tuples that were on
    1317             :  * buf's page earlier in the scan.  We handle page splits (that happened
    1318             :  * after our cycleid was acquired) whose right half page happened to reuse
    1319             :  * a block that we might have processed at some point before it was
    1320             :  * recycled (i.e. before the page split).
    1321             :  *
    1322             :  * Returns BlockNumber of a scanned page (not backtracked).
    1323             :  */
    1324             : static BlockNumber
    1325       23426 : btvacuumpage(BTVacState *vstate, Buffer buf)
    1326             : {
    1327       23426 :     IndexVacuumInfo *info = vstate->info;
    1328       23426 :     IndexBulkDeleteResult *stats = vstate->stats;
    1329       23426 :     IndexBulkDeleteCallback callback = vstate->callback;
    1330       23426 :     void       *callback_state = vstate->callback_state;
    1331       23426 :     Relation    rel = info->index;
    1332       23426 :     Relation    heaprel = info->heaprel;
    1333             :     bool        attempt_pagedel;
    1334             :     BlockNumber blkno,
    1335             :                 backtrack_to;
    1336       23426 :     BlockNumber scanblkno = BufferGetBlockNumber(buf);
    1337             :     Page        page;
    1338             :     BTPageOpaque opaque;
    1339             : 
    1340       23426 :     blkno = scanblkno;
    1341             : 
    1342       23426 : backtrack:
    1343             : 
    1344       23426 :     attempt_pagedel = false;
    1345       23426 :     backtrack_to = P_NONE;
    1346             : 
    1347       23426 :     _bt_lockbuf(rel, buf, BT_READ);
    1348       23426 :     page = BufferGetPage(buf);
    1349       23426 :     opaque = NULL;
    1350       23426 :     if (!PageIsNew(page))
    1351             :     {
    1352       23426 :         _bt_checkpage(rel, buf);
    1353       23426 :         opaque = BTPageGetOpaque(page);
    1354             :     }
    1355             : 
    1356             :     Assert(blkno <= scanblkno);
    1357       23426 :     if (blkno != scanblkno)
    1358             :     {
    1359             :         /*
    1360             :          * We're backtracking.
    1361             :          *
    1362             :          * We followed a right link to a sibling leaf page (a page that
    1363             :          * happens to be from a block located before scanblkno).  The only
    1364             :          * case we want to do anything with is a live leaf page having the
    1365             :          * current vacuum cycle ID.
    1366             :          *
    1367             :          * The page had better be in a state that's consistent with what we
    1368             :          * expect.  Check for conditions that imply corruption in passing.  It
    1369             :          * can't be half-dead because only an interrupted VACUUM process can
    1370             :          * leave pages in that state, so we'd definitely have dealt with it
    1371             :          * back when the page was the scanblkno page (half-dead pages are
    1372             :          * always marked fully deleted by _bt_pagedel(), barring corruption).
    1373             :          */
    1374           0 :         if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
    1375             :         {
    1376             :             Assert(false);
    1377           0 :             ereport(LOG,
    1378             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    1379             :                      errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
    1380             :                                      blkno, scanblkno, RelationGetRelationName(rel))));
    1381           0 :             _bt_relbuf(rel, buf);
    1382           0 :             return scanblkno;
    1383             :         }
    1384             : 
    1385             :         /*
    1386             :          * We may have already processed the page in an earlier call, when the
    1387             :          * page was scanblkno.  This happens when the leaf page split occurred
    1388             :          * after the scan began, but before the right sibling page became the
    1389             :          * scanblkno.
    1390             :          *
    1391             :          * Page may also have been deleted by current btvacuumpage() call,
    1392             :          * since _bt_pagedel() sometimes deletes the right sibling page of
    1393             :          * scanblkno in passing (it does so after we decided where to
    1394             :          * backtrack to).  We don't need to process this page as a deleted
    1395             :          * page a second time now (in fact, it would be wrong to count it as a
    1396             :          * deleted page in the bulk delete statistics a second time).
    1397             :          */
    1398           0 :         if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
    1399             :         {
    1400             :             /* Done with current scanblkno (and all lower split pages) */
    1401           0 :             _bt_relbuf(rel, buf);
    1402           0 :             return scanblkno;
    1403             :         }
    1404             :     }
    1405             : 
    1406       23426 :     if (!opaque || BTPageIsRecyclable(page, heaprel))
    1407             :     {
    1408             :         /* Okay to recycle this page (which could be leaf or internal) */
    1409        1160 :         RecordFreeIndexPage(rel, blkno);
    1410        1160 :         stats->pages_deleted++;
    1411        1160 :         stats->pages_free++;
    1412             :     }
    1413       22266 :     else if (P_ISDELETED(opaque))
    1414             :     {
    1415             :         /*
    1416             :          * Already deleted page (which could be leaf or internal).  Can't
    1417             :          * recycle yet.
    1418             :          */
    1419         208 :         stats->pages_deleted++;
    1420             :     }
    1421       22058 :     else if (P_ISHALFDEAD(opaque))
    1422             :     {
    1423             :         /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
    1424           0 :         attempt_pagedel = true;
    1425             : 
    1426             :         /*
    1427             :          * _bt_pagedel() will increment both pages_newly_deleted and
    1428             :          * pages_deleted stats in all cases (barring corruption)
    1429             :          */
    1430             :     }
    1431       22058 :     else if (P_ISLEAF(opaque))
    1432             :     {
    1433             :         OffsetNumber deletable[MaxIndexTuplesPerPage];
    1434             :         int         ndeletable;
    1435             :         BTVacuumPosting updatable[MaxIndexTuplesPerPage];
    1436             :         int         nupdatable;
    1437             :         OffsetNumber offnum,
    1438             :                     minoff,
    1439             :                     maxoff;
    1440             :         int         nhtidsdead,
    1441             :                     nhtidslive;
    1442             : 
    1443             :         /*
    1444             :          * Trade in the initial read lock for a full cleanup lock on this
    1445             :          * page.  We must get such a lock on every leaf page over the course
    1446             :          * of the vacuum scan, whether or not it actually contains any
    1447             :          * deletable tuples --- see nbtree/README.
    1448             :          */
    1449       20522 :         _bt_upgradelockbufcleanup(rel, buf);
    1450             : 
    1451             :         /*
    1452             :          * Check whether we need to backtrack to earlier pages.  What we are
    1453             :          * concerned about is a page split that happened since we started the
    1454             :          * vacuum scan.  If the split moved tuples on the right half of the
    1455             :          * split (i.e. the tuples that sort high) to a block that we already
    1456             :          * passed over, then we might have missed the tuples.  We need to
    1457             :          * backtrack now.  (Must do this before possibly clearing btpo_cycleid
    1458             :          * or deleting scanblkno page below!)
    1459             :          */
    1460       20522 :         if (vstate->cycleid != 0 &&
    1461       20318 :             opaque->btpo_cycleid == vstate->cycleid &&
    1462           0 :             !(opaque->btpo_flags & BTP_SPLIT_END) &&
    1463           0 :             !P_RIGHTMOST(opaque) &&
    1464           0 :             opaque->btpo_next < scanblkno)
    1465           0 :             backtrack_to = opaque->btpo_next;
    1466             : 
    1467       20522 :         ndeletable = 0;
    1468       20522 :         nupdatable = 0;
    1469       20522 :         minoff = P_FIRSTDATAKEY(opaque);
    1470       20522 :         maxoff = PageGetMaxOffsetNumber(page);
    1471       20522 :         nhtidsdead = 0;
    1472       20522 :         nhtidslive = 0;
    1473       20522 :         if (callback)
    1474             :         {
    1475             :             /* btbulkdelete callback tells us what to delete (or update) */
    1476     4008568 :             for (offnum = minoff;
    1477             :                  offnum <= maxoff;
    1478     3988250 :                  offnum = OffsetNumberNext(offnum))
    1479             :             {
    1480             :                 IndexTuple  itup;
    1481             : 
    1482     3988250 :                 itup = (IndexTuple) PageGetItem(page,
    1483     3988250 :                                                 PageGetItemId(page, offnum));
    1484             : 
    1485             :                 Assert(!BTreeTupleIsPivot(itup));
    1486     3988250 :                 if (!BTreeTupleIsPosting(itup))
    1487             :                 {
    1488             :                     /* Regular tuple, standard table TID representation */
    1489     3872114 :                     if (callback(&itup->t_tid, callback_state))
    1490             :                     {
    1491     1392800 :                         deletable[ndeletable++] = offnum;
    1492     1392800 :                         nhtidsdead++;
    1493             :                     }
    1494             :                     else
    1495     2479314 :                         nhtidslive++;
    1496             :                 }
    1497             :                 else
    1498             :                 {
    1499             :                     BTVacuumPosting vacposting;
    1500             :                     int         nremaining;
    1501             : 
    1502             :                     /* Posting list tuple */
    1503      116136 :                     vacposting = btreevacuumposting(vstate, itup, offnum,
    1504             :                                                     &nremaining);
    1505      116136 :                     if (vacposting == NULL)
    1506             :                     {
    1507             :                         /*
    1508             :                          * All table TIDs from the posting tuple remain, so no
    1509             :                          * delete or update required
    1510             :                          */
    1511             :                         Assert(nremaining == BTreeTupleGetNPosting(itup));
    1512             :                     }
    1513       70930 :                     else if (nremaining > 0)
    1514             :                     {
    1515             : 
    1516             :                         /*
    1517             :                          * Store metadata about posting list tuple in
    1518             :                          * updatable array for entire page.  Existing tuple
    1519             :                          * will be updated during the later call to
    1520             :                          * _bt_delitems_vacuum().
    1521             :                          */
    1522             :                         Assert(nremaining < BTreeTupleGetNPosting(itup));
    1523       36266 :                         updatable[nupdatable++] = vacposting;
    1524       36266 :                         nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
    1525             :                     }
    1526             :                     else
    1527             :                     {
    1528             :                         /*
    1529             :                          * All table TIDs from the posting list must be
    1530             :                          * deleted.  We'll delete the index tuple completely
    1531             :                          * (no update required).
    1532             :                          */
    1533             :                         Assert(nremaining == 0);
    1534       34664 :                         deletable[ndeletable++] = offnum;
    1535       34664 :                         nhtidsdead += BTreeTupleGetNPosting(itup);
    1536       34664 :                         pfree(vacposting);
    1537             :                     }
    1538             : 
    1539      116136 :                     nhtidslive += nremaining;
    1540             :                 }
    1541             :             }
    1542             :         }
    1543             : 
    1544             :         /*
    1545             :          * Apply any needed deletes or updates.  We issue just one
    1546             :          * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
    1547             :          */
    1548       20522 :         if (ndeletable > 0 || nupdatable > 0)
    1549             :         {
    1550             :             Assert(nhtidsdead >= ndeletable + nupdatable);
    1551       13180 :             _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
    1552             :                                 nupdatable);
    1553             : 
    1554       13180 :             stats->tuples_removed += nhtidsdead;
    1555             :             /* must recompute maxoff */
    1556       13180 :             maxoff = PageGetMaxOffsetNumber(page);
    1557             : 
    1558             :             /* can't leak memory here */
    1559       49446 :             for (int i = 0; i < nupdatable; i++)
    1560       36266 :                 pfree(updatable[i]);
    1561             :         }
    1562             :         else
    1563             :         {
    1564             :             /*
    1565             :              * If the leaf page has been split during this vacuum cycle, it
    1566             :              * seems worth expending a write to clear btpo_cycleid even if we
    1567             :              * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
    1568             :              * takes care of this.)  This ensures we won't process the page
    1569             :              * again.
    1570             :              *
    1571             :              * We treat this like a hint-bit update because there's no need to
    1572             :              * WAL-log it.
    1573             :              */
    1574             :             Assert(nhtidsdead == 0);
    1575        7342 :             if (vstate->cycleid != 0 &&
    1576        7138 :                 opaque->btpo_cycleid == vstate->cycleid)
    1577             :             {
    1578           0 :                 opaque->btpo_cycleid = 0;
    1579           0 :                 MarkBufferDirtyHint(buf, true);
    1580             :             }
    1581             :         }
    1582             : 
    1583             :         /*
    1584             :          * If the leaf page is now empty, try to delete it; else count the
    1585             :          * live tuples (live table TIDs in posting lists are counted as
    1586             :          * separate live tuples).  We don't delete when backtracking, though,
    1587             :          * since that would require teaching _bt_pagedel() about backtracking
    1588             :          * (doesn't seem worth adding more complexity to deal with that).
    1589             :          *
    1590             :          * We don't count the number of live TIDs during cleanup-only calls to
    1591             :          * btvacuumscan (i.e. when callback is not set).  We count the number
    1592             :          * of index tuples directly instead.  This avoids the expense of
    1593             :          * directly examining all of the tuples on each page.  VACUUM will
    1594             :          * treat num_index_tuples as an estimate in cleanup-only case, so it
    1595             :          * doesn't matter that this underestimates num_index_tuples
    1596             :          * significantly in some cases.
    1597             :          */
    1598       20522 :         if (minoff > maxoff)
    1599        5602 :             attempt_pagedel = (blkno == scanblkno);
    1600       14920 :         else if (callback)
    1601       14724 :             stats->num_index_tuples += nhtidslive;
    1602             :         else
    1603         196 :             stats->num_index_tuples += maxoff - minoff + 1;
    1604             : 
    1605             :         Assert(!attempt_pagedel || nhtidslive == 0);
    1606             :     }
    1607             : 
    1608       23426 :     if (attempt_pagedel)
    1609             :     {
    1610             :         MemoryContext oldcontext;
    1611             : 
    1612             :         /* Run pagedel in a temp context to avoid memory leakage */
    1613        5602 :         MemoryContextReset(vstate->pagedelcontext);
    1614        5602 :         oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
    1615             : 
    1616             :         /*
    1617             :          * _bt_pagedel maintains the bulk delete stats on our behalf;
    1618             :          * pages_newly_deleted and pages_deleted are likely to be incremented
    1619             :          * during call
    1620             :          */
    1621             :         Assert(blkno == scanblkno);
    1622        5602 :         _bt_pagedel(rel, buf, vstate);
    1623             : 
    1624        5602 :         MemoryContextSwitchTo(oldcontext);
    1625             :         /* pagedel released buffer, so we shouldn't */
    1626             :     }
    1627             :     else
    1628       17824 :         _bt_relbuf(rel, buf);
    1629             : 
    1630       23426 :     if (backtrack_to != P_NONE)
    1631             :     {
    1632           0 :         blkno = backtrack_to;
    1633             : 
    1634             :         /* check for vacuum delay while not holding any buffer lock */
    1635           0 :         vacuum_delay_point(false);
    1636             : 
    1637             :         /*
    1638             :          * We can't use _bt_getbuf() here because it always applies
    1639             :          * _bt_checkpage(), which will barf on an all-zero page. We want to
    1640             :          * recycle all-zero pages, not fail.  Also, we want to use a
    1641             :          * nondefault buffer access strategy.
    1642             :          */
    1643           0 :         buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
    1644             :                                  info->strategy);
    1645           0 :         goto backtrack;
    1646             :     }
    1647             : 
    1648       23426 :     return scanblkno;
    1649             : }
    1650             : 
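/*
 * Side note: the backtrack goto above is a loop in disguise: process a
 * block, and if it reports an earlier right sibling that still needs a
 * visit, process that block before moving on.  A hedged sketch follows;
 * NO_BACKTRACK and process_one_page are illustration-only names.
 */
#define NO_BACKTRACK ((unsigned int) 0xFFFFFFFF)    /* cf. P_NONE */

extern unsigned int process_one_page(unsigned int blkno);

static void
vacuum_with_backtrack(unsigned int scanblkno)
{
    unsigned int blkno = scanblkno;

    do
    {
        /* returns an earlier block to revisit, or NO_BACKTRACK */
        blkno = process_one_page(blkno);
    } while (blkno != NO_BACKTRACK);
}
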
    1651             : /*
    1652             :  * btreevacuumposting --- determine TIDs still needed in posting list
    1653             :  *
    1654             :  * Returns metadata describing how to build replacement tuple without the TIDs
    1655             :  * that VACUUM needs to delete.  Returned value is NULL in the common case
    1656             :  * where no changes are needed to caller's posting list tuple (we avoid
    1657             :  * allocating memory here as an optimization).
    1658             :  *
    1659             :  * The number of TIDs that should remain in the posting list tuple is set for
    1660             :  * caller in *nremaining.
    1661             :  */
    1662             : static BTVacuumPosting
    1663      116136 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
    1664             :                    OffsetNumber updatedoffset, int *nremaining)
    1665             : {
    1666      116136 :     int         live = 0;
    1667      116136 :     int         nitem = BTreeTupleGetNPosting(posting);
    1668      116136 :     ItemPointer items = BTreeTupleGetPosting(posting);
    1669      116136 :     BTVacuumPosting vacposting = NULL;
    1670             : 
    1671      667870 :     for (int i = 0; i < nitem; i++)
    1672             :     {
    1673      551734 :         if (!vstate->callback(items + i, vstate->callback_state))
    1674             :         {
    1675             :             /* Live table TID */
    1676      311820 :             live++;
    1677             :         }
    1678      239914 :         else if (vacposting == NULL)
    1679             :         {
    1680             :             /*
    1681             :              * First dead table TID encountered.
    1682             :              *
    1683             :              * It's now clear that we need to delete one or more dead table
    1684             :              * TIDs, so start maintaining metadata describing how to update
    1685             :              * existing posting list tuple.
    1686             :              */
    1687       70930 :             vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
    1688             :                                 nitem * sizeof(uint16));
    1689             : 
    1690       70930 :             vacposting->itup = posting;
    1691       70930 :             vacposting->updatedoffset = updatedoffset;
    1692       70930 :             vacposting->ndeletedtids = 0;
    1693       70930 :             vacposting->deletetids[vacposting->ndeletedtids++] = i;
    1694             :         }
    1695             :         else
    1696             :         {
    1697             :             /* Second or subsequent dead table TID */
    1698      168984 :             vacposting->deletetids[vacposting->ndeletedtids++] = i;
    1699             :         }
    1700             :     }
    1701             : 
    1702      116136 :     *nremaining = live;
    1703      116136 :     return vacposting;
    1704             : }
    1705             : 
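/*
 * Side note: btreevacuumposting's "allocate only once a dead TID appears"
 * shape is worth isolating.  A self-contained analogue over a plain array
 * follows; tid_t, collect_dead_indexes, and is_live are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint64_t tid_t;

static uint16_t *
collect_dead_indexes(const tid_t *items, int nitem,
                     bool (*is_live) (tid_t), int *ndead)
{
    uint16_t   *dead = NULL;

    *ndead = 0;
    for (int i = 0; i < nitem; i++)
    {
        if (is_live(items[i]))
            continue;
        if (dead == NULL)
        {
            /*
             * First dead entry: only now pay for the allocation, since the
             * common case is an all-live posting list.
             */
            dead = malloc(nitem * sizeof(uint16_t));
        }
        dead[(*ndead)++] = (uint16_t) i;
    }
    return dead;                /* NULL means "no delete or update needed" */
}
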
    1706             : /*
    1707             :  *  btcanreturn() -- Check whether btree indexes support index-only scans.
    1708             :  *
    1709             :  * btrees always do, so this is trivial.
    1710             :  */
    1711             : bool
    1712     1128926 : btcanreturn(Relation index, int attno)
    1713             : {
    1714     1128926 :     return true;
    1715             : }
    1716             : 
    1717             : /*
    1718             :  * btgettreeheight() -- Compute tree height for use by btcostestimate().
    1719             :  */
    1720             : int
    1721      720490 : btgettreeheight(Relation rel)
    1722             : {
    1723      720490 :     return _bt_getrootheight(rel);
    1724             : }
    1725             : 
    1726             : CompareType
    1727           0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
    1728             : {
    1729           0 :     switch (strategy)
    1730             :     {
    1731           0 :         case BTLessStrategyNumber:
    1732           0 :             return COMPARE_LT;
    1733           0 :         case BTLessEqualStrategyNumber:
    1734           0 :             return COMPARE_LE;
    1735           0 :         case BTEqualStrategyNumber:
    1736           0 :             return COMPARE_EQ;
    1737           0 :         case BTGreaterEqualStrategyNumber:
    1738           0 :             return COMPARE_GE;
    1739           0 :         case BTGreaterStrategyNumber:
    1740           0 :             return COMPARE_GT;
    1741           0 :         default:
    1742           0 :             return COMPARE_INVALID;
    1743             :     }
    1744             : }
    1745             : 
    1746             : StrategyNumber
    1747           0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
    1748             : {
    1749           0 :     switch (cmptype)
    1750             :     {
    1751           0 :         case COMPARE_LT:
    1752           0 :             return BTLessStrategyNumber;
    1753           0 :         case COMPARE_LE:
    1754           0 :             return BTLessEqualStrategyNumber;
    1755           0 :         case COMPARE_EQ:
    1756           0 :             return BTEqualStrategyNumber;
    1757           0 :         case COMPARE_GE:
    1758           0 :             return BTGreaterEqualStrategyNumber;
    1759           0 :         case COMPARE_GT:
    1760           0 :             return BTGreaterStrategyNumber;
    1761           0 :         default:
    1762           0 :             return InvalidStrategy;
    1763             :     }
    1764             : }
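
/*
 * Side note: over the five btree strategies the two translations above are
 * exact inverses, e.g. BTLessStrategyNumber -> COMPARE_LT ->
 * BTLessStrategyNumber.  A hedged sanity check against the two functions
 * in this file (kept under NOT_USED since nothing calls it):
 */
#ifdef NOT_USED
static void
check_strategy_roundtrip(void)
{
    for (StrategyNumber strat = BTLessStrategyNumber;
         strat <= BTGreaterStrategyNumber;
         strat++)
        Assert(bttranslatecmptype(bttranslatestrategy(strat, InvalidOid),
                                  InvalidOid) == strat);
}
#endif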

Generated by: LCOV version 1.14