LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtree.c
Test: PostgreSQL 13beta1            Lines:     411 hit / 448 total    91.7 %
Date: 2020-06-01 10:07:15           Functions:  23 hit /  24 total    95.8 %

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtree.c
       4             :  *    Implementation of Lehman and Yao's btree management algorithm for
       5             :  *    Postgres.
       6             :  *
       7             :  * NOTES
       8             :  *    This file contains only the public interface routines.
       9             :  *
      10             :  *
      11             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
      12             :  * Portions Copyright (c) 1994, Regents of the University of California
      13             :  *
      14             :  * IDENTIFICATION
      15             :  *    src/backend/access/nbtree/nbtree.c
      16             :  *
      17             :  *-------------------------------------------------------------------------
      18             :  */
      19             : #include "postgres.h"
      20             : 
      21             : #include "access/nbtree.h"
      22             : #include "access/nbtxlog.h"
      23             : #include "access/relscan.h"
      24             : #include "access/xlog.h"
      25             : #include "commands/progress.h"
      26             : #include "commands/vacuum.h"
      27             : #include "miscadmin.h"
      28             : #include "nodes/execnodes.h"
      29             : #include "pgstat.h"
      30             : #include "postmaster/autovacuum.h"
      31             : #include "storage/condition_variable.h"
      32             : #include "storage/indexfsm.h"
      33             : #include "storage/ipc.h"
      34             : #include "storage/lmgr.h"
      35             : #include "storage/smgr.h"
      36             : #include "utils/builtins.h"
      37             : #include "utils/index_selfuncs.h"
      38             : #include "utils/memutils.h"
      39             : 
      40             : 
      41             : /* Working state needed by btvacuumpage */
      42             : typedef struct
      43             : {
      44             :     IndexVacuumInfo *info;
      45             :     IndexBulkDeleteResult *stats;
      46             :     IndexBulkDeleteCallback callback;
      47             :     void       *callback_state;
      48             :     BTCycleId   cycleid;
      49             :     BlockNumber totFreePages;   /* true total # of free pages */
      50             :     TransactionId oldestBtpoXact;
      51             :     MemoryContext pagedelcontext;
      52             : } BTVacState;
      53             : 
      54             : /*
      55             :  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
      56             :  *
      57             :  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
      58             :  * a new page; others must wait.
      59             :  *
      60             :  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
      61             :  * to a new page; some process can start doing that.
      62             :  *
      63             :  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
      64             :  * We reach this state once for every distinct combination of array keys.
      65             :  */
      66             : typedef enum
      67             : {
      68             :     BTPARALLEL_NOT_INITIALIZED,
      69             :     BTPARALLEL_ADVANCING,
      70             :     BTPARALLEL_IDLE,
      71             :     BTPARALLEL_DONE
      72             : } BTPS_State;
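
/*
 * A minimal sketch (not part of nbtree.c) of the state transitions above,
 * as driven by the parallel-scan functions later in this file:
 *
 *   NOT_INITIALIZED -> ADVANCING        first _bt_parallel_seize()
 *   ADVANCING       -> IDLE             _bt_parallel_release()
 *   IDLE            -> ADVANCING        later _bt_parallel_seize() calls
 *   ADVANCING       -> DONE             _bt_parallel_done()
 *   DONE            -> NOT_INITIALIZED  _bt_parallel_advance_array_keys()
 *                                       (moves on to the next set of array
 *                                        scan keys, if any)
 */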
      73             : 
      74             : /*
      75             :  * BTParallelScanDescData contains btree specific shared information required
      76             :  * for parallel scan.
      77             :  */
      78             : typedef struct BTParallelScanDescData
      79             : {
      80             :     BlockNumber btps_scanPage;  /* latest or next page to be scanned */
      81             :     BTPS_State  btps_pageStatus;    /* indicates whether next page is
      82             :                                      * available for scan. see above for
      83             :                                      * possible states of parallel scan. */
      84             :     int         btps_arrayKeyCount; /* count indicating number of array scan
      85             :                                      * keys processed by parallel scan */
      86             :     slock_t     btps_mutex;     /* protects above variables */
      87             :     ConditionVariable btps_cv;  /* used to synchronize parallel scan */
      88             : }           BTParallelScanDescData;
      89             : 
      90             : typedef struct BTParallelScanDescData *BTParallelScanDesc;
      91             : 
      92             : 
      93             : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
      94             :                          IndexBulkDeleteCallback callback, void *callback_state,
      95             :                          BTCycleId cycleid);
      96             : static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
      97             : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
      98             :                                           IndexTuple posting,
      99             :                                           OffsetNumber updatedoffset,
     100             :                                           int *nremaining);
     101             : 
     102             : 
     103             : /*
     104             :  * Btree handler function: return IndexAmRoutine with access method parameters
     105             :  * and callbacks.
     106             :  */
     107             : Datum
     108     1090098 : bthandler(PG_FUNCTION_ARGS)
     109             : {
     110     1090098 :     IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
     111             : 
     112     1090098 :     amroutine->amstrategies = BTMaxStrategyNumber;
     113     1090098 :     amroutine->amsupport = BTNProcs;
     114     1090098 :     amroutine->amoptsprocnum = BTOPTIONS_PROC;
     115     1090098 :     amroutine->amcanorder = true;
     116     1090098 :     amroutine->amcanorderbyop = false;
     117     1090098 :     amroutine->amcanbackward = true;
     118     1090098 :     amroutine->amcanunique = true;
     119     1090098 :     amroutine->amcanmulticol = true;
     120     1090098 :     amroutine->amoptionalkey = true;
     121     1090098 :     amroutine->amsearcharray = true;
     122     1090098 :     amroutine->amsearchnulls = true;
     123     1090098 :     amroutine->amstorage = false;
     124     1090098 :     amroutine->amclusterable = true;
     125     1090098 :     amroutine->ampredlocks = true;
     126     1090098 :     amroutine->amcanparallel = true;
     127     1090098 :     amroutine->amcaninclude = true;
     128     1090098 :     amroutine->amusemaintenanceworkmem = false;
     129     1090098 :     amroutine->amparallelvacuumoptions =
     130             :         VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
     131     1090098 :     amroutine->amkeytype = InvalidOid;
     132             : 
     133     1090098 :     amroutine->ambuild = btbuild;
     134     1090098 :     amroutine->ambuildempty = btbuildempty;
     135     1090098 :     amroutine->aminsert = btinsert;
     136     1090098 :     amroutine->ambulkdelete = btbulkdelete;
     137     1090098 :     amroutine->amvacuumcleanup = btvacuumcleanup;
     138     1090098 :     amroutine->amcanreturn = btcanreturn;
     139     1090098 :     amroutine->amcostestimate = btcostestimate;
     140     1090098 :     amroutine->amoptions = btoptions;
     141     1090098 :     amroutine->amproperty = btproperty;
     142     1090098 :     amroutine->ambuildphasename = btbuildphasename;
     143     1090098 :     amroutine->amvalidate = btvalidate;
     144     1090098 :     amroutine->ambeginscan = btbeginscan;
     145     1090098 :     amroutine->amrescan = btrescan;
     146     1090098 :     amroutine->amgettuple = btgettuple;
     147     1090098 :     amroutine->amgetbitmap = btgetbitmap;
     148     1090098 :     amroutine->amendscan = btendscan;
     149     1090098 :     amroutine->ammarkpos = btmarkpos;
     150     1090098 :     amroutine->amrestrpos = btrestrpos;
     151     1090098 :     amroutine->amestimateparallelscan = btestimateparallelscan;
     152     1090098 :     amroutine->aminitparallelscan = btinitparallelscan;
     153     1090098 :     amroutine->amparallelrescan = btparallelrescan;
     154             : 
     155     1090098 :     PG_RETURN_POINTER(amroutine);
     156             : }
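
/*
 * A minimal sketch (not from nbtree.c) of how the core consumes the handler
 * above; it mirrors GetIndexAmRoutine() in src/backend/access/index/amapi.c.
 * The wrapper name is hypothetical; treat this as illustrative only.
 */
static IndexAmRoutine *
get_index_am_routine_sketch(Oid amhandler)
{
    /* invoke the handler (e.g. bthandler) registered in pg_am.amhandler */
    Datum       datum = OidFunctionCall0(amhandler);
    IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);

    if (routine == NULL || !IsA(routine, IndexAmRoutine))
        elog(ERROR, "index AM handler %u did not return an IndexAmRoutine",
             amhandler);

    return routine;
}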
     157             : 
     158             : /*
     159             :  *  btbuildempty() -- build an empty btree index in the initialization fork
     160             :  */
     161             : void
     162          86 : btbuildempty(Relation index)
     163             : {
     164             :     Page        metapage;
     165             : 
     166             :     /* Construct metapage. */
     167          86 :     metapage = (Page) palloc(BLCKSZ);
     168          86 :     _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
     169             : 
     170             :     /*
     171             :      * Write the page and log it.  It might seem that an immediate sync would
     172             :      * be sufficient to guarantee that the file exists on disk, but recovery
     173             :      * itself might remove it while replaying, for example, an
     174             :      * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
     175             :      * this even when wal_level=minimal.
     176             :      */
     177          86 :     PageSetChecksumInplace(metapage, BTREE_METAPAGE);
     178          86 :     smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
     179             :               (char *) metapage, true);
     180          86 :     log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
     181             :                 BTREE_METAPAGE, metapage, true);
     182             : 
     183             :     /*
     184             :      * An immediate sync is required even if we xlog'd the page, because the
     185             :      * write did not go through shared_buffers and therefore a concurrent
     186             :      * checkpoint may have moved the redo pointer past our xlog record.
     187             :      */
     188          86 :     smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
     189          86 : }
     190             : 
     191             : /*
     192             :  *  btinsert() -- insert an index tuple into a btree.
     193             :  *
     194             :  *      Descend the tree recursively, find the appropriate location for our
     195             :  *      new tuple, and put it there.
     196             :  */
     197             : bool
     198    11077580 : btinsert(Relation rel, Datum *values, bool *isnull,
     199             :          ItemPointer ht_ctid, Relation heapRel,
     200             :          IndexUniqueCheck checkUnique,
     201             :          IndexInfo *indexInfo)
     202             : {
     203             :     bool        result;
     204             :     IndexTuple  itup;
     205             : 
     206             :     /* generate an index tuple */
     207    11077580 :     itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
     208    11077580 :     itup->t_tid = *ht_ctid;
     209             : 
     210    11077580 :     result = _bt_doinsert(rel, itup, checkUnique, heapRel);
     211             : 
     212    11077234 :     pfree(itup);
     213             : 
     214    11077234 :     return result;
     215             : }
     216             : 
     217             : /*
     218             :  *  btgettuple() -- Get the next tuple in the scan.
     219             :  */
     220             : bool
     221    21268908 : btgettuple(IndexScanDesc scan, ScanDirection dir)
     222             : {
     223    21268908 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     224             :     bool        res;
     225             : 
     226             :     /* btree indexes are never lossy */
     227    21268908 :     scan->xs_recheck = false;
     228             : 
     229             :     /*
     230             :      * If we have any array keys, initialize them during first call for a
     231             :      * scan.  We can't do this in btrescan because we don't know the scan
     232             :      * direction at that time.
     233             :      */
     234    21268908 :     if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
     235             :     {
     236             :         /* punt if we have any unsatisfiable array keys */
     237         108 :         if (so->numArrayKeys < 0)
     238           0 :             return false;
     239             : 
     240         108 :         _bt_start_array_keys(scan, dir);
     241             :     }
     242             : 
     243             :     /* This loop handles advancing to the next array elements, if any */
     244             :     do
     245             :     {
     246             :         /*
     247             :          * If we've already initialized this scan, we can just advance it in
     248             :          * the appropriate direction.  If we haven't done so yet, we call
     249             :          * _bt_first() to get the first item in the scan.
     250             :          */
     251    21269276 :         if (!BTScanPosIsValid(so->currPos))
     252    11178474 :             res = _bt_first(scan, dir);
     253             :         else
     254             :         {
     255             :             /*
     256             :              * Check to see if we should kill the previously-fetched tuple.
     257             :              */
     258    10090802 :             if (scan->kill_prior_tuple)
     259             :             {
     260             :                 /*
     261             :                  * Yes, remember it for later. (We'll deal with all such
     262             :                  * tuples at once right before leaving the index page.)  The
     263             :                  * test for numKilled overrun is not just paranoia: if the
     264             :                  * caller reverses direction in the indexscan then the same
     265             :                  * item might get entered multiple times. It's not worth
     266             :                  * trying to optimize that, so we don't detect it, but instead
     267             :                  * just forget any excess entries.
     268             :                  */
     269      131828 :                 if (so->killedItems == NULL)
     270       98556 :                     so->killedItems = (int *)
     271       98556 :                         palloc(MaxTIDsPerBTreePage * sizeof(int));
     272      131828 :                 if (so->numKilled < MaxTIDsPerBTreePage)
     273      131828 :                     so->killedItems[so->numKilled++] = so->currPos.itemIndex;
     274             :             }
     275             : 
     276             :             /*
     277             :              * Now continue the scan.
     278             :              */
     279    10090802 :             res = _bt_next(scan, dir);
     280             :         }
     281             : 
     282             :         /* If we have a tuple, return it ... */
     283    21269276 :         if (res)
     284    17079956 :             break;
     285             :         /* ... otherwise see if we have more array keys to deal with */
     286     4189320 :     } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
     287             : 
     288    21268908 :     return res;
     289             : }
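
/*
 * A worked illustration (not part of the original file): for a qual such as
 * "WHERE x = ANY ('{1,3,5}')", so->numArrayKeys is positive and the loop in
 * btgettuple() effectively runs three mini-scans, one per array element.
 * Whenever _bt_first()/_bt_next() report no further matches,
 * _bt_advance_array_keys() installs the next element and the loop retries,
 * so the executor sees a single continuous scan.
 */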
     290             : 
     291             : /*
     292             :  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
     293             :  */
     294             : int64
     295       15562 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     296             : {
     297       15562 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     298       15562 :     int64       ntids = 0;
     299             :     ItemPointer heapTid;
     300             : 
     301             :     /*
     302             :      * If we have any array keys, initialize them.
     303             :      */
     304       15562 :     if (so->numArrayKeys)
     305             :     {
     306             :         /* punt if we have any unsatisfiable array keys */
     307         282 :         if (so->numArrayKeys < 0)
     308           0 :             return ntids;
     309             : 
     310         282 :         _bt_start_array_keys(scan, ForwardScanDirection);
     311             :     }
     312             : 
     313             :     /* This loop handles advancing to the next array elements, if any */
     314             :     do
     315             :     {
     316             :         /* Fetch the first page & tuple */
     317       17160 :         if (_bt_first(scan, ForwardScanDirection))
     318             :         {
     319             :             /* Save tuple ID, and continue scanning */
     320        7380 :             heapTid = &scan->xs_heaptid;
     321        7380 :             tbm_add_tuples(tbm, heapTid, 1, false);
     322        7380 :             ntids++;
     323             : 
     324             :             for (;;)
     325             :             {
     326             :                 /*
     327             :                  * Advance to next tuple within page.  This is the same as the
     328             :                  * easy case in _bt_next().
     329             :                  */
     330     1034486 :                 if (++so->currPos.itemIndex > so->currPos.lastItem)
     331             :                 {
     332             :                     /* let _bt_next do the heavy lifting */
     333        9826 :                     if (!_bt_next(scan, ForwardScanDirection))
     334        7380 :                         break;
     335             :                 }
     336             : 
     337             :                 /* Save tuple ID, and continue scanning */
     338     1027106 :                 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
     339     1027106 :                 tbm_add_tuples(tbm, heapTid, 1, false);
     340     1027106 :                 ntids++;
     341             :             }
     342             :         }
     343             :         /* Now see if we have more array keys to deal with */
     344       17160 :     } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
     345             : 
     346       15562 :     return ntids;
     347             : }
     348             : 
     349             : /*
     350             :  *  btbeginscan() -- start a scan on a btree index
     351             :  */
     352             : IndexScanDesc
     353    11006796 : btbeginscan(Relation rel, int nkeys, int norderbys)
     354             : {
     355             :     IndexScanDesc scan;
     356             :     BTScanOpaque so;
     357             : 
     358             :     /* no order by operators allowed */
     359             :     Assert(norderbys == 0);
     360             : 
     361             :     /* get the scan */
     362    11006796 :     scan = RelationGetIndexScan(rel, nkeys, norderbys);
     363             : 
     364             :     /* allocate private workspace */
     365    11006796 :     so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
     366    11006796 :     BTScanPosInvalidate(so->currPos);
     367    11006796 :     BTScanPosInvalidate(so->markPos);
     368    11006796 :     if (scan->numberOfKeys > 0)
     369    11001412 :         so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
     370             :     else
     371        5384 :         so->keyData = NULL;
     372             : 
     373    11006796 :     so->arrayKeyData = NULL; /* assume no array keys for now */
     374    11006796 :     so->numArrayKeys = 0;
     375    11006796 :     so->arrayKeys = NULL;
     376    11006796 :     so->arrayContext = NULL;
     377             : 
     378    11006796 :     so->killedItems = NULL;      /* until needed */
     379    11006796 :     so->numKilled = 0;
     380             : 
     381             :     /*
     382             :      * We don't know yet whether the scan will be index-only, so we do not
     383             :      * allocate the tuple workspace arrays until btrescan.  However, we set up
     384             :      * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     385             :      */
     386    11006796 :     so->currTuples = so->markTuples = NULL;
     387             : 
     388    11006796 :     scan->xs_itupdesc = RelationGetDescr(rel);
     389             : 
     390    11006796 :     scan->opaque = so;
     391             : 
     392    11006796 :     return scan;
     393             : }
     394             : 
     395             : /*
     396             :  *  btrescan() -- rescan an index relation
     397             :  */
     398             : void
     399    11200862 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
     400             :          ScanKey orderbys, int norderbys)
     401             : {
     402    11200862 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     403             : 
     404             :     /* we aren't holding any read locks, but gotta drop the pins */
     405    11200862 :     if (BTScanPosIsValid(so->currPos))
     406             :     {
     407             :         /* Before leaving current page, deal with any killed items */
     408       33336 :         if (so->numKilled > 0)
     409         162 :             _bt_killitems(scan);
     410       33336 :         BTScanPosUnpinIfPinned(so->currPos);
     411       33336 :         BTScanPosInvalidate(so->currPos);
     412             :     }
     413             : 
     414    11200862 :     so->markItemIndex = -1;
     415    11200862 :     so->arrayKeyCount = 0;
     416    11200862 :     BTScanPosUnpinIfPinned(so->markPos);
     417    11200862 :     BTScanPosInvalidate(so->markPos);
     418             : 
     419             :     /*
     420             :      * Allocate tuple workspace arrays, if needed for an index-only scan and
     421             :      * not already done in a previous rescan call.  To save on palloc
     422             :      * overhead, both workspaces are allocated as one palloc block; only this
     423             :      * function and btendscan know that.
     424             :      *
     425             :      * NOTE: this data structure also makes it safe to return data from a
     426             :      * "name" column, even though btree name_ops uses an underlying storage
     427             :      * datatype of cstring.  The risk there is that "name" is supposed to be
     428             :      * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
     429             :      * However, since we only return data out of tuples sitting in the
     430             :      * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
     431             :      * data out of the markTuples array --- running off the end of memory for
     432             :      * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
     433             :      * adding special-case treatment for name_ops elsewhere.
     434             :      */
     435    11200862 :     if (scan->xs_want_itup && so->currTuples == NULL)
     436             :     {
     437       78346 :         so->currTuples = (char *) palloc(BLCKSZ * 2);
     438       78346 :         so->markTuples = so->currTuples + BLCKSZ;
     439             :     }
     440             : 
     441             :     /*
      442             :      * Reset the scan keys.  Note that the key-ordering work moved to _bt_first.
     443             :      * - vadim 05/05/97
     444             :      */
     445    11200862 :     if (scankey && scan->numberOfKeys > 0)
     446    11195478 :         memmove(scan->keyData,
     447             :                 scankey,
     448    11195478 :                 scan->numberOfKeys * sizeof(ScanKeyData));
     449    11200862 :     so->numberOfKeys = 0;        /* until _bt_preprocess_keys sets it */
     450             : 
     451             :     /* If any keys are SK_SEARCHARRAY type, set up array-key info */
     452    11200862 :     _bt_preprocess_array_keys(scan);
     453    11200862 : }
     454             : 
     455             : /*
     456             :  *  btendscan() -- close down a scan
     457             :  */
     458             : void
     459    11006040 : btendscan(IndexScanDesc scan)
     460             : {
     461    11006040 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     462             : 
     463             :     /* we aren't holding any read locks, but gotta drop the pins */
     464    11006040 :     if (BTScanPosIsValid(so->currPos))
     465             :     {
     466             :         /* Before leaving current page, deal with any killed items */
     467     6955370 :         if (so->numKilled > 0)
     468       56080 :             _bt_killitems(scan);
     469     6955370 :         BTScanPosUnpinIfPinned(so->currPos);
     470             :     }
     471             : 
     472    11006040 :     so->markItemIndex = -1;
     473    11006040 :     BTScanPosUnpinIfPinned(so->markPos);
     474             : 
     475             :     /* No need to invalidate positions, the RAM is about to be freed. */
     476             : 
     477             :     /* Release storage */
     478    11006040 :     if (so->keyData != NULL)
     479    11000714 :         pfree(so->keyData);
     480             :     /* so->arrayKeyData and so->arrayKeys are in arrayContext */
     481    11006040 :     if (so->arrayContext != NULL)
     482         396 :         MemoryContextDelete(so->arrayContext);
     483    11006040 :     if (so->killedItems != NULL)
     484       98528 :         pfree(so->killedItems);
     485    11006040 :     if (so->currTuples != NULL)
     486       78280 :         pfree(so->currTuples);
     487             :     /* so->markTuples should not be pfree'd, see btrescan */
     488    11006040 :     pfree(so);
     489    11006040 : }
     490             : 
     491             : /*
     492             :  *  btmarkpos() -- save current scan position
     493             :  */
     494             : void
     495       88036 : btmarkpos(IndexScanDesc scan)
     496             : {
     497       88036 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     498             : 
     499             :     /* There may be an old mark with a pin (but no lock). */
     500       88036 :     BTScanPosUnpinIfPinned(so->markPos);
     501             : 
     502             :     /*
     503             :      * Just record the current itemIndex.  If we later step to next page
     504             :      * before releasing the marked position, _bt_steppage makes a full copy of
     505             :      * the currPos struct in markPos.  If (as often happens) the mark is moved
     506             :      * before we leave the page, we don't have to do that work.
     507             :      */
     508       88036 :     if (BTScanPosIsValid(so->currPos))
     509       88036 :         so->markItemIndex = so->currPos.itemIndex;
     510             :     else
     511             :     {
     512           0 :         BTScanPosInvalidate(so->markPos);
     513           0 :         so->markItemIndex = -1;
     514             :     }
     515             : 
     516             :     /* Also record the current positions of any array keys */
     517       88036 :     if (so->numArrayKeys)
     518           4 :         _bt_mark_array_keys(scan);
     519       88036 : }
     520             : 
     521             : /*
     522             :  *  btrestrpos() -- restore scan to last saved position
     523             :  */
     524             : void
     525       36012 : btrestrpos(IndexScanDesc scan)
     526             : {
     527       36012 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     528             : 
     529             :     /* Restore the marked positions of any array keys */
     530       36012 :     if (so->numArrayKeys)
     531           4 :         _bt_restore_array_keys(scan);
     532             : 
     533       36012 :     if (so->markItemIndex >= 0)
     534             :     {
     535             :         /*
     536             :          * The scan has never moved to a new page since the last mark.  Just
     537             :          * restore the itemIndex.
     538             :          *
     539             :          * NB: In this case we can't count on anything in so->markPos to be
     540             :          * accurate.
     541             :          */
     542       35940 :         so->currPos.itemIndex = so->markItemIndex;
     543             :     }
     544             :     else
     545             :     {
     546             :         /*
     547             :          * The scan moved to a new page after last mark or restore, and we are
     548             :          * now restoring to the marked page.  We aren't holding any read
     549             :          * locks, but if we're still holding the pin for the current position,
     550             :          * we must drop it.
     551             :          */
     552          72 :         if (BTScanPosIsValid(so->currPos))
     553             :         {
     554             :             /* Before leaving current page, deal with any killed items */
     555          72 :             if (so->numKilled > 0)
     556           0 :                 _bt_killitems(scan);
     557          72 :             BTScanPosUnpinIfPinned(so->currPos);
     558             :         }
     559             : 
     560          72 :         if (BTScanPosIsValid(so->markPos))
     561             :         {
     562             :             /* bump pin on mark buffer for assignment to current buffer */
     563          72 :             if (BTScanPosIsPinned(so->markPos))
     564           0 :                 IncrBufferRefCount(so->markPos.buf);
     565          72 :             memcpy(&so->currPos, &so->markPos,
     566             :                    offsetof(BTScanPosData, items[1]) +
     567          72 :                    so->markPos.lastItem * sizeof(BTScanPosItem));
     568          72 :             if (so->currTuples)
     569           0 :                 memcpy(so->currTuples, so->markTuples,
     570           0 :                        so->markPos.nextTupleOffset);
     571             :         }
     572             :         else
     573           0 :             BTScanPosInvalidate(so->currPos);
     574             :     }
     575       36012 : }
     576             : 
     577             : /*
     578             :  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
     579             :  */
     580             : Size
     581          32 : btestimateparallelscan(void)
     582             : {
     583          32 :     return sizeof(BTParallelScanDescData);
     584             : }
     585             : 
     586             : /*
     587             :  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
     588             :  */
     589             : void
     590          32 : btinitparallelscan(void *target)
     591             : {
     592          32 :     BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
     593             : 
     594          32 :     SpinLockInit(&bt_target->btps_mutex);
     595          32 :     bt_target->btps_scanPage = InvalidBlockNumber;
     596          32 :     bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     597          32 :     bt_target->btps_arrayKeyCount = 0;
     598          32 :     ConditionVariableInit(&bt_target->btps_cv);
     599          32 : }
     600             : 
     601             : /*
     602             :  *  btparallelrescan() -- reset parallel scan
     603             :  */
     604             : void
     605          16 : btparallelrescan(IndexScanDesc scan)
     606             : {
     607             :     BTParallelScanDesc btscan;
     608          16 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     609             : 
     610             :     Assert(parallel_scan);
     611             : 
     612          16 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     613             :                                                   parallel_scan->ps_offset);
     614             : 
     615             :     /*
     616             :      * In theory, we don't need to acquire the spinlock here, because there
     617             :      * shouldn't be any other workers running at this point, but we do so for
     618             :      * consistency.
     619             :      */
     620          16 :     SpinLockAcquire(&btscan->btps_mutex);
     621          16 :     btscan->btps_scanPage = InvalidBlockNumber;
     622          16 :     btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     623          16 :     btscan->btps_arrayKeyCount = 0;
     624          16 :     SpinLockRelease(&btscan->btps_mutex);
     625          16 : }
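
/*
 * A side note (assumption: OffsetToPointer as defined in src/include/c.h):
 * the macro is plain pointer arithmetic, roughly
 * ((void *) ((char *) base + offset)), so the btree-specific state lives at
 * parallel_scan + ps_offset inside the shared area whose size was reported
 * by btestimateparallelscan() above.
 */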
     626             : 
     627             : /*
     628             :  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
     629             :  *      page.  Other scans must wait until we call _bt_parallel_release()
     630             :  *      or _bt_parallel_done().
     631             :  *
     632             :  * The return value is true if we successfully seized the scan and false
     633             :  * if we did not.  The latter case occurs if no pages remain for the current
     634             :  * set of scankeys.
     635             :  *
     636             :  * If the return value is true, *pageno returns the next or current page
     637             :  * of the scan (depending on the scan direction).  An invalid block number
     638             :  * means the scan hasn't yet started, and P_NONE means we've reached the end.
     639             :  * The first time a participating process reaches the last page, it will return
     640             :  * true and set *pageno to P_NONE; after that, further attempts to seize the
     641             :  * scan will return false.
     642             :  *
     643             :  * Callers should ignore the value of pageno if the return value is false.
     644             :  */
     645             : bool
     646        1096 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
     647             : {
     648        1096 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     649             :     BTPS_State  pageStatus;
     650        1096 :     bool        exit_loop = false;
     651        1096 :     bool        status = true;
     652        1096 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     653             :     BTParallelScanDesc btscan;
     654             : 
     655        1096 :     *pageno = P_NONE;
     656             : 
     657        1096 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     658             :                                                   parallel_scan->ps_offset);
     659             : 
     660             :     while (1)
     661             :     {
     662        1096 :         SpinLockAcquire(&btscan->btps_mutex);
     663        1096 :         pageStatus = btscan->btps_pageStatus;
     664             : 
     665        1096 :         if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
     666             :         {
     667             :             /* Parallel scan has already advanced to a new set of scankeys. */
     668           0 :             status = false;
     669             :         }
     670        1096 :         else if (pageStatus == BTPARALLEL_DONE)
     671             :         {
     672             :             /*
     673             :              * We're done with this set of scankeys.  This may be the end, or
     674             :              * there could be more sets to try.
     675             :              */
     676         192 :             status = false;
     677             :         }
     678         904 :         else if (pageStatus != BTPARALLEL_ADVANCING)
     679             :         {
     680             :             /*
     681             :              * We have successfully seized control of the scan for the purpose
     682             :              * of advancing it to a new page!
     683             :              */
     684         904 :             btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
     685         904 :             *pageno = btscan->btps_scanPage;
     686         904 :             exit_loop = true;
     687             :         }
     688        1096 :         SpinLockRelease(&btscan->btps_mutex);
     689        1096 :         if (exit_loop || !status)
     690             :             break;
     691           0 :         ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
     692             :     }
     693        1096 :     ConditionVariableCancelSleep();
     694             : 
     695        1096 :     return status;
     696             : }
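
/*
 * A minimal sketch of the caller-side protocol (the real callers are
 * _bt_first() and _bt_readnextpage() in nbtsearch.c; the helper name below
 * is hypothetical and page-reading details are elided):
 */
static bool
parallel_scan_step_sketch(IndexScanDesc scan)
{
    BlockNumber blkno;

    if (!_bt_parallel_seize(scan, &blkno))
        return false;           /* another worker finished these scankeys */

    if (blkno == P_NONE)
    {
        _bt_parallel_done(scan);    /* reached the end; wake any waiters */
        return false;
    }

    /*
     * ... read page 'blkno' (or descend from the root if blkno is
     * InvalidBlockNumber), determine the next page, and then either call
     * _bt_parallel_release(scan, next_blkno) to let another worker advance,
     * or _bt_parallel_done(scan) if no pages remain ...
     */
    return true;
}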
     697             : 
     698             : /*
     699             :  * _bt_parallel_release() -- Complete the process of advancing the scan to a
     700             :  *      new page.  We now have the new value btps_scanPage; some other backend
     701             :  *      can now begin advancing the scan.
     702             :  */
     703             : void
     704         856 : _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
     705             : {
     706         856 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     707             :     BTParallelScanDesc btscan;
     708             : 
     709         856 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     710             :                                                   parallel_scan->ps_offset);
     711             : 
     712         856 :     SpinLockAcquire(&btscan->btps_mutex);
     713         856 :     btscan->btps_scanPage = scan_page;
     714         856 :     btscan->btps_pageStatus = BTPARALLEL_IDLE;
     715         856 :     SpinLockRelease(&btscan->btps_mutex);
     716         856 :     ConditionVariableSignal(&btscan->btps_cv);
     717         856 : }
     718             : 
     719             : /*
     720             :  * _bt_parallel_done() -- Mark the parallel scan as complete.
     721             :  *
     722             :  * When there are no pages left to scan, this function should be called to
     723             :  * notify other workers.  Otherwise, they might wait forever for the scan to
     724             :  * advance to the next page.
     725             :  */
     726             : void
     727     4205572 : _bt_parallel_done(IndexScanDesc scan)
     728             : {
     729     4205572 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     730     4205572 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     731             :     BTParallelScanDesc btscan;
     732     4205572 :     bool        status_changed = false;
     733             : 
     734             :     /* Do nothing, for non-parallel scans */
     735     4205572 :     if (parallel_scan == NULL)
     736     4205524 :         return;
     737             : 
     738          48 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     739             :                                                   parallel_scan->ps_offset);
     740             : 
     741             :     /*
     742             :      * Mark the parallel scan as done for this combination of scan keys,
     743             :      * unless some other process already did so.  See also
     744             :      * _bt_advance_array_keys.
     745             :      */
     746          48 :     SpinLockAcquire(&btscan->btps_mutex);
     747          48 :     if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
     748          48 :         btscan->btps_pageStatus != BTPARALLEL_DONE)
     749             :     {
     750          48 :         btscan->btps_pageStatus = BTPARALLEL_DONE;
     751          48 :         status_changed = true;
     752             :     }
     753          48 :     SpinLockRelease(&btscan->btps_mutex);
     754             : 
     755             :     /* wake up all the workers associated with this parallel scan */
     756          48 :     if (status_changed)
     757          48 :         ConditionVariableBroadcast(&btscan->btps_cv);
     758             : }
     759             : 
     760             : /*
     761             :  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
     762             :  *          keys.
     763             :  *
     764             :  * Updates the count of array keys processed for both local and parallel
     765             :  * scans.
     766             :  */
     767             : void
     768           0 : _bt_parallel_advance_array_keys(IndexScanDesc scan)
     769             : {
     770           0 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     771           0 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
     772             :     BTParallelScanDesc btscan;
     773             : 
     774           0 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
     775             :                                                   parallel_scan->ps_offset);
     776             : 
     777           0 :     so->arrayKeyCount++;
     778           0 :     SpinLockAcquire(&btscan->btps_mutex);
     779           0 :     if (btscan->btps_pageStatus == BTPARALLEL_DONE)
     780             :     {
     781           0 :         btscan->btps_scanPage = InvalidBlockNumber;
     782           0 :         btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
     783           0 :         btscan->btps_arrayKeyCount++;
     784             :     }
     785           0 :     SpinLockRelease(&btscan->btps_mutex);
     786           0 : }
     787             : 
     788             : /*
     789             :  * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
     790             :  *
     791             :  * Called by btvacuumcleanup when btbulkdelete was never called because no
     792             :  * tuples need to be deleted.
     793             :  *
     794             :  * When we return false, VACUUM can even skip the cleanup-only call to
     795             :  * btvacuumscan (i.e. there will be no btvacuumscan call for this index at
     796             :  * all).  Otherwise, a cleanup-only btvacuumscan call is required.
     797             :  */
     798             : static bool
     799       59646 : _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
     800             : {
     801             :     Buffer      metabuf;
     802             :     Page        metapg;
     803             :     BTMetaPageData *metad;
     804       59646 :     bool        result = false;
     805             : 
     806       59646 :     metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
     807       59646 :     metapg = BufferGetPage(metabuf);
     808       59646 :     metad = BTPageGetMeta(metapg);
     809             : 
     810       59646 :     if (metad->btm_version < BTREE_NOVAC_VERSION)
     811             :     {
     812             :         /*
     813             :          * Do cleanup if metapage needs upgrade, because we don't have
     814             :          * cleanup-related meta-information yet.
     815             :          */
     816           0 :         result = true;
     817             :     }
     818       59656 :     else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
     819          10 :              TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
     820             :                                    RecentGlobalXmin))
     821             :     {
     822             :         /*
     823             :          * If any oldest btpo.xact from a previously deleted page in the index
     824             :          * is older than RecentGlobalXmin, then at least one deleted page can
     825             :          * be recycled -- don't skip cleanup.
     826             :          */
     827           6 :         result = true;
     828             :     }
     829             :     else
     830             :     {
     831             :         BTOptions  *relopts;
     832             :         float8      cleanup_scale_factor;
     833             :         float8      prev_num_heap_tuples;
     834             : 
     835             :         /*
      836             :          * If the table receives enough insertions and no cleanup is
      837             :          * performed, the index would appear to have stale statistics.  If a
      838             :          * scale factor is set, we avoid that by performing cleanup once the
      839             :          * number of inserted tuples exceeds the
      840             :          * vacuum_cleanup_index_scale_factor fraction of the original tuple count.
     841             :          */
     842       59640 :         relopts = (BTOptions *) info->index->rd_options;
     843       59640 :         cleanup_scale_factor = (relopts &&
     844           0 :                                 relopts->vacuum_cleanup_index_scale_factor >= 0)
     845             :             ? relopts->vacuum_cleanup_index_scale_factor
     846       59640 :             : vacuum_cleanup_index_scale_factor;
     847       59640 :         prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
     848             : 
     849       59640 :         if (cleanup_scale_factor <= 0 ||
     850        2754 :             prev_num_heap_tuples <= 0 ||
     851        2754 :             (info->num_heap_tuples - prev_num_heap_tuples) /
     852             :             prev_num_heap_tuples >= cleanup_scale_factor)
     853       57654 :             result = true;
     854             :     }
     855             : 
     856       59646 :     _bt_relbuf(info->index, metabuf);
     857       59646 :     return result;
     858             : }
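
/*
 * A worked example of the final test above (illustrative numbers only):
 * with cleanup_scale_factor = 0.1 and prev_num_heap_tuples = 1,000,000,
 * cleanup triggers once (num_heap_tuples - 1000000) / 1000000 >= 0.1,
 * i.e. as soon as the heap has grown past 1,100,000 tuples since the
 * count recorded in the metapage at the last cleanup.
 */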
     859             : 
     860             : /*
     861             :  * Bulk deletion of all index entries pointing to a set of heap tuples.
     862             :  * The set of target tuples is specified via a callback routine that tells
     863             :  * whether any given heap tuple (identified by ItemPointer) is being deleted.
     864             :  *
     865             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     866             :  */
     867             : IndexBulkDeleteResult *
     868        4112 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     869             :              IndexBulkDeleteCallback callback, void *callback_state)
     870             : {
     871        4112 :     Relation    rel = info->index;
     872             :     BTCycleId   cycleid;
     873             : 
     874             :     /* allocate stats if first time through, else re-use existing struct */
     875        4112 :     if (stats == NULL)
     876        4112 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
     877             : 
     878             :     /* Establish the vacuum cycle ID to use for this scan */
     879             :     /* The ENSURE stuff ensures we clean up shared memory on failure */
     880        4112 :     PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
     881             :     {
     882        4112 :         cycleid = _bt_start_vacuum(rel);
     883             : 
     884        4112 :         btvacuumscan(info, stats, callback, callback_state, cycleid);
     885             :     }
     886        8224 :     PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
     887        4112 :     _bt_end_vacuum(rel);
     888             : 
     889        4112 :     return stats;
     890             : }
     891             : 
     892             : /*
     893             :  * Post-VACUUM cleanup.
     894             :  *
     895             :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
     896             :  */
     897             : IndexBulkDeleteResult *
     898      114674 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
     899             : {
     900             :     /* No-op in ANALYZE ONLY mode */
     901      114674 :     if (info->analyze_only)
     902       51076 :         return stats;
     903             : 
     904             :     /*
     905             :      * If btbulkdelete was called, we need not do anything, just return the
     906             :      * stats from the latest btbulkdelete call.  If it wasn't called, we might
     907             :      * still need to do a pass over the index, to recycle any newly-recyclable
     908             :      * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
     909             :      * determines if either are needed.
     910             :      *
     911             :      * Since we aren't going to actually delete any leaf items, there's no
     912             :      * need to go through all the vacuum-cycle-ID pushups.
     913             :      */
     914       63598 :     if (stats == NULL)
     915             :     {
     916             :         /* Check if we need a cleanup */
     917       59646 :         if (!_bt_vacuum_needs_cleanup(info))
     918        1986 :             return NULL;
     919             : 
     920       57660 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
     921       57660 :         btvacuumscan(info, stats, NULL, NULL, 0);
     922             :     }
     923             : 
     924             :     /*
     925             :      * It's quite possible for us to be fooled by concurrent page splits into
     926             :      * double-counting some index tuples, so disbelieve any total that exceeds
     927             :      * the underlying heap's count ... if we know that accurately.  Otherwise
     928             :      * this might just make matters worse.
     929             :      */
     930       61612 :     if (!info->estimated_count)
     931             :     {
     932       61542 :         if (stats->num_index_tuples > info->num_heap_tuples)
     933          22 :             stats->num_index_tuples = info->num_heap_tuples;
     934             :     }
     935             : 
     936       61612 :     return stats;
     937             : }
     938             : 
     939             : /*
     940             :  * btvacuumscan --- scan the index for VACUUMing purposes
     941             :  *
     942             :  * This combines the functions of looking for leaf tuples that are deletable
     943             :  * according to the vacuum callback, looking for empty pages that can be
     944             :  * deleted, and looking for old deleted pages that can be recycled.  Both
     945             :  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
     946             :  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
     947             :  * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
     948             :  * is maintained.
     949             :  *
     950             :  * The caller is responsible for initially allocating/zeroing a stats struct
     951             :  * and for obtaining a vacuum cycle ID if necessary.
     952             :  */
     953             : static void
     954       61772 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     955             :              IndexBulkDeleteCallback callback, void *callback_state,
     956             :              BTCycleId cycleid)
     957             : {
     958       61772 :     Relation    rel = info->index;
     959             :     BTVacState  vstate;
     960             :     BlockNumber num_pages;
     961             :     BlockNumber scanblkno;
     962             :     bool        needLock;
     963             : 
     964             :     /*
     965             :      * Reset counts that will be incremented during the scan; needed in case
     966             :      * of multiple scans during a single VACUUM command
     967             :      */
     968       61772 :     stats->estimated_count = false;
     969       61772 :     stats->num_index_tuples = 0;
     970       61772 :     stats->pages_deleted = 0;
     971             : 
     972             :     /* Set up info to pass down to btvacuumpage */
     973       61772 :     vstate.info = info;
     974       61772 :     vstate.stats = stats;
     975       61772 :     vstate.callback = callback;
     976       61772 :     vstate.callback_state = callback_state;
     977       61772 :     vstate.cycleid = cycleid;
     978       61772 :     vstate.totFreePages = 0;
     979       61772 :     vstate.oldestBtpoXact = InvalidTransactionId;
     980             : 
     981             :     /* Create a temporary memory context to run _bt_pagedel in */
     982       61772 :     vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
     983             :                                                   "_bt_pagedel",
     984             :                                                   ALLOCSET_DEFAULT_SIZES);
     985             : 
     986             :     /*
     987             :      * The outer loop iterates over all index pages except the metapage, in
     988             :      * physical order (we hope the kernel will cooperate in providing
     989             :      * read-ahead for speed).  It is critical that we visit all leaf pages,
     990             :      * including ones added after we start the scan, else we might fail to
     991             :      * delete some deletable tuples.  Hence, we must repeatedly check the
     992             :      * relation length.  We must acquire the relation-extension lock while
     993             :      * doing so to avoid a race condition: if someone else is extending the
     994             :      * relation, there is a window where bufmgr/smgr have created a new
     995             :      * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
     996             :      * we manage to scan such a page here, we'll improperly assume it can be
     997             :      * recycled.  Taking the lock synchronizes things enough to prevent a
     998             :      * problem: either num_pages won't include the new page, or _bt_getbuf
     999             :      * already has write lock on the buffer and it will be fully initialized
    1000             :      * before we can examine it.  (See also vacuumlazy.c, which has the same
    1001             :      * issue.)  Also, we need not worry if a page is added immediately after
    1002             :      * we look; the page splitting code already has write-lock on the left
    1003             :      * page before it adds a right page, so we must already have processed any
    1004             :      * tuples due to be moved into such a page.
    1005             :      *
    1006             :      * We can skip locking for new or temp relations, however, since no one
    1007             :      * else could be accessing them.
    1008             :      */
    1009       61772 :     needLock = !RELATION_IS_LOCAL(rel);
    1010             : 
    1011       61772 :     scanblkno = BTREE_METAPAGE + 1;
    1012             :     for (;;)
    1013             :     {
    1014             :         /* Get the current relation length */
    1015       89826 :         if (needLock)
    1016       89818 :             LockRelationForExtension(rel, ExclusiveLock);
    1017       89826 :         num_pages = RelationGetNumberOfBlocks(rel);
    1018       89826 :         if (needLock)
    1019       89818 :             UnlockRelationForExtension(rel, ExclusiveLock);
    1020             : 
    1021       89826 :         if (info->report_progress)
    1022         236 :             pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
    1023             :                                          num_pages);
    1024             : 
    1025             :         /* Quit if we've scanned the whole relation */
    1026       89826 :         if (scanblkno >= num_pages)
    1027       61772 :             break;
    1028             :         /* Iterate over pages, then loop back to recheck length */
    1029      137020 :         for (; scanblkno < num_pages; scanblkno++)
    1030             :         {
    1031      108966 :             btvacuumpage(&vstate, scanblkno);
    1032      108966 :             if (info->report_progress)
    1033         136 :                 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
    1034             :                                              scanblkno);
    1035             :         }
    1036             :     }
    1037             : 
    1038       61772 :     MemoryContextDelete(vstate.pagedelcontext);
    1039             : 
    1040             :     /*
    1041             :      * If we found any recyclable pages (and recorded them in the FSM), then
    1042             :      * forcibly update the upper-level FSM pages to ensure that searchers can
    1043             :      * find them.  It's possible that the pages were also found during
    1044             :      * previous scans and so this is a waste of time, but it's cheap enough
    1045             :      * relative to scanning the index that it shouldn't matter much, and
    1046             :  * making sure that free pages are available sooner rather than later
    1047             :  * seems worthwhile.
    1048             :      *
    1049             :      * Note that if no recyclable pages exist, we don't bother vacuuming the
    1050             :      * FSM at all.
    1051             :      */
    1052       61772 :     if (vstate.totFreePages > 0)
    1053          66 :         IndexFreeSpaceMapVacuum(rel);
    1054             : 
    1055             :     /*
    1056             :  * Maintain the oldest btpo.xact and the current number of heap tuples
    1057             :  * in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
    1058             :      *
    1059             :      * The page with the oldest btpo.xact is typically a page deleted by this
    1060             :      * VACUUM operation, since pages deleted by a previous VACUUM operation
    1061             :      * tend to be placed in the FSM (by the current VACUUM operation) -- such
    1062             :      * pages are not candidates to be the oldest btpo.xact.  (Note that pages
    1063             :      * placed in the FSM are reported as deleted pages in the bulk delete
    1064             :      * statistics, despite not counting as deleted pages for the purposes of
    1065             :      * determining the oldest btpo.xact.)
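                     :  *
                     :  * (For reference, and assuming the PG 13 BTMetaPageData layout, the
                     :  * fields maintained here are btm_oldest_btpo_xact and
                     :  * btm_last_cleanup_num_heap_tuples.)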
    1066             :      */
    1067       61772 :     _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
    1068             :                                  info->num_heap_tuples);
    1069             : 
    1070             :     /* update statistics */
    1071       61772 :     stats->num_pages = num_pages;
    1072       61772 :     stats->pages_free = vstate.totFreePages;
    1073       61772 : }
    1074             : 
    1075             : /*
    1076             :  * btvacuumpage --- VACUUM one page
    1077             :  *
    1078             :  * This processes a single page for btvacuumscan().  In some cases we must
    1079             :  * backtrack to re-examine and VACUUM pages that were the scanblkno during
    1080             :  * a previous call here.  This is how we handle page splits (ones that
    1081             :  * happened after our cycleid was acquired) whose right half page reused
    1082             :  * a block that we might already have processed earlier in the scan (the
    1083             :  * block having been deleted and recycled before the split).
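                     :  *
                     :  * A hypothetical worked example: suppose scanblkno 10 splits after our
                     :  * cycleid was acquired and its new right sibling lands in recycled
                     :  * block 4.  Block 4 was already passed over, so while processing block
                     :  * 10 we notice btpo_next == 4 < scanblkno and backtrack to block 4.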
    1084             :  */
    1085             : static void
    1086      108966 : btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
    1087             : {
    1088      108966 :     IndexVacuumInfo *info = vstate->info;
    1089      108966 :     IndexBulkDeleteResult *stats = vstate->stats;
    1090      108966 :     IndexBulkDeleteCallback callback = vstate->callback;
    1091      108966 :     void       *callback_state = vstate->callback_state;
    1092      108966 :     Relation    rel = info->index;
    1093             :     bool        attempt_pagedel;
    1094             :     BlockNumber blkno,
    1095             :                 backtrack_to;
    1096             :     Buffer      buf;
    1097             :     Page        page;
    1098             :     BTPageOpaque opaque;
    1099             : 
    1100      108966 :     blkno = scanblkno;
    1101             : 
    1102      108966 : backtrack:
    1103             : 
    1104      108966 :     attempt_pagedel = false;
    1105      108966 :     backtrack_to = P_NONE;
    1106             : 
    1107             :     /* call vacuum_delay_point while not holding any buffer lock */
    1108      108966 :     vacuum_delay_point();
    1109             : 
    1110             :     /*
    1111             :      * We can't use _bt_getbuf() here because it always applies
    1112             :      * _bt_checkpage(), which will barf on an all-zero page. We want to
    1113             :      * recycle all-zero pages, not fail.  Also, we want to use a nondefault
    1114             :      * buffer access strategy.
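                     :  * (info->strategy is the caller's BufferAccessStrategy -- for a plain
                     :  * VACUUM, a BAS_VACUUM ring -- so this scan avoids evicting large
                     :  * swaths of shared_buffers.)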
    1115             :      */
    1116      108966 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
    1117             :                              info->strategy);
    1118      108966 :     LockBuffer(buf, BT_READ);
    1119      108966 :     page = BufferGetPage(buf);
    1120      108966 :     opaque = NULL;
    1121      108966 :     if (!PageIsNew(page))
    1122             :     {
    1123      108966 :         _bt_checkpage(rel, buf);
    1124      108966 :         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    1125             :     }
    1126             : 
    1127             :     Assert(blkno <= scanblkno);
    1128      108966 :     if (blkno != scanblkno)
    1129             :     {
    1130             :         /*
    1131             :          * We're backtracking.
    1132             :          *
    1133             :          * We followed a right link to a sibling leaf page (a page that
    1134             :          * happens to be from a block located before scanblkno).  The only
    1135             :          * case we want to do anything with is a live leaf page having the
    1136             :          * current vacuum cycle ID.
    1137             :          *
    1138             :          * The page had better be in a state that's consistent with what we
    1139             :          * expect.  Check for conditions that imply corruption in passing.  It
    1140             :          * can't be half-dead because only an interrupted VACUUM process can
    1141             :          * leave pages in that state, so we'd definitely have dealt with it
    1142             :          * back when the page was the scanblkno page (half-dead pages are
    1143             :          * always marked fully deleted by _bt_pagedel()).  This assumes that
    1144             :          * there can be only one vacuum process running at a time.
    1145             :          */
    1146           0 :         if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
    1147             :         {
    1148             :             Assert(false);
    1149           0 :             ereport(LOG,
    1150             :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    1151             :                      errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
    1152             :                                      blkno, scanblkno, RelationGetRelationName(rel))));
    1153           0 :             _bt_relbuf(rel, buf);
    1154           0 :             return;
    1155             :         }
    1156             : 
    1157             :         /*
    1158             :          * We may have already processed the page in an earlier call, when the
    1159             :          * page was scanblkno.  This happens when the leaf page split occurred
    1160             :          * after the scan began, but before the right sibling page became the
    1161             :          * scanblkno.
    1162             :          *
    1163             :  * The page may also have been deleted by the current btvacuumpage()
    1164             :  * call, since _bt_pagedel() sometimes deletes the right sibling page of
    1165             :          * scanblkno in passing (it does so after we decided where to
    1166             :          * backtrack to).  We don't need to process this page as a deleted
    1167             :          * page a second time now (in fact, it would be wrong to count it as a
    1168             :          * deleted page in the bulk delete statistics a second time).
    1169             :          */
    1170           0 :         if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
    1171             :         {
    1172             :             /* Done with current scanblkno (and all lower split pages) */
    1173           0 :             _bt_relbuf(rel, buf);
    1174           0 :             return;
    1175             :         }
    1176             :     }
    1177             : 
    1178             :     /* Page is valid, see what to do with it */
    1179      108966 :     if (_bt_page_recyclable(page))
    1180             :     {
    1181             :         /* Okay to recycle this page (which could be leaf or internal) */
    1182        1556 :         RecordFreeIndexPage(rel, blkno);
    1183        1556 :         vstate->totFreePages++;
    1184        1556 :         stats->pages_deleted++;
    1185             :     }
    1186      107410 :     else if (P_ISDELETED(opaque))
    1187             :     {
    1188             :         /*
    1189             :          * Already deleted page (which could be leaf or internal).  Can't
    1190             :          * recycle yet.
    1191             :          */
    1192          86 :         stats->pages_deleted++;
    1193             : 
    1194             :         /* Maintain the oldest btpo.xact */
    1195         172 :         if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
    1196          86 :             TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
    1197           0 :             vstate->oldestBtpoXact = opaque->btpo.xact;
    1198             :     }
    1199      107324 :     else if (P_ISHALFDEAD(opaque))
    1200             :     {
    1201             :         /*
    1202             :          * Half-dead leaf page.  Try to delete now.  Might update
    1203             :          * oldestBtpoXact and pages_deleted below.
    1204             :          */
    1205           0 :         attempt_pagedel = true;
    1206             :     }
    1207      107324 :     else if (P_ISLEAF(opaque))
    1208             :     {
    1209             :         OffsetNumber deletable[MaxIndexTuplesPerPage];
    1210             :         int         ndeletable;
    1211             :         BTVacuumPosting updatable[MaxIndexTuplesPerPage];
    1212             :         int         nupdatable;
    1213             :         OffsetNumber offnum,
    1214             :                     minoff,
    1215             :                     maxoff;
    1216             :         int         nhtidsdead,
    1217             :                     nhtidslive;
    1218             : 
    1219             :         /*
    1220             :          * Trade in the initial read lock for a super-exclusive write lock on
    1221             :          * this page.  We must get such a lock on every leaf page over the
    1222             :          * course of the vacuum scan, whether or not it actually contains any
    1223             :          * deletable tuples --- see nbtree/README.
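                     :  *
                     :  * ("Super-exclusive" here means an exclusive buffer lock acquired
                     :  * while ours is the only pin on the buffer; LockBufferForCleanup()
                     :  * waits for any other pins to be released first.)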
    1224             :          */
    1225       99464 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1226       99464 :         LockBufferForCleanup(buf);
    1227             : 
    1228             :         /*
    1229             :          * Check whether we need to backtrack to earlier pages.  What we are
    1230             :          * concerned about is a page split that happened since we started the
    1231             :          * vacuum scan.  If the split moved tuples on the right half of the
    1232             :          * split (i.e. the tuples that sort high) to a block that we already
    1233             :          * passed over, then we might have missed the tuples.  We need to
    1234             :          * backtrack now.  (Must do this before possibly clearing btpo_cycleid
    1235             :          * or deleting scanblkno page below!)
    1236             :          */
    1237       99464 :         if (vstate->cycleid != 0 &&
    1238       60420 :             opaque->btpo_cycleid == vstate->cycleid &&
    1239           4 :             !(opaque->btpo_flags & BTP_SPLIT_END) &&
    1240           2 :             !P_RIGHTMOST(opaque) &&
    1241           2 :             opaque->btpo_next < scanblkno)
    1242           0 :             backtrack_to = opaque->btpo_next;
    1243             : 
    1244             :         /*
    1245             :          * When each VACUUM begins, it determines an OldestXmin cutoff value.
    1246             :  * Tuples before the cutoff are removed by VACUUM.  Scan over all
    1247             :  * items to see which ones need to be deleted, applying the callback
    1248             :  * to each table TID to check it against the cutoff point.
    1249             :          */
    1250       99464 :         ndeletable = 0;
    1251       99464 :         nupdatable = 0;
    1252       99464 :         minoff = P_FIRSTDATAKEY(opaque);
    1253       99464 :         maxoff = PageGetMaxOffsetNumber(page);
    1254       99464 :         nhtidsdead = 0;
    1255       99464 :         nhtidslive = 0;
    1256       99464 :         if (callback)
    1257             :         {
    1258    12301708 :             for (offnum = minoff;
    1259             :                  offnum <= maxoff;
    1260    12241288 :                  offnum = OffsetNumberNext(offnum))
    1261             :             {
    1262             :                 IndexTuple  itup;
    1263             : 
    1264    12241288 :                 itup = (IndexTuple) PageGetItem(page,
    1265             :                                                 PageGetItemId(page, offnum));
    1266             : 
    1267             :                 /*
    1268             :                  * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
    1269             :                  * records do not produce their own conflicts.  This is safe
    1270             :                  * as long as the callback function only considers whether the
    1271             :                  * index tuple refers to pre-cutoff heap tuples that were
    1272             :                  * certainly already pruned away during VACUUM's initial heap
    1273             :                  * scan by the time we get here. (XLOG_HEAP2_CLEANUP_INFO
    1274             :                  * records produce conflicts using a latestRemovedXid value
    1275             :                  * for the entire VACUUM, so there is no need to produce our
    1276             :                  * own conflict now.)
    1277             :                  *
    1278             :                  * Backends with snapshots acquired after a VACUUM starts but
    1279             :                  * before it finishes could have a RecentGlobalXmin with a
    1280             :                  * later xid than the VACUUM's OldestXmin cutoff.  These
    1281             :                  * backends might happen to opportunistically mark some index
    1282             :                  * tuples LP_DEAD before we reach them, even though they may
    1283             :                  * be after our cutoff.  We don't try to kill these "extra"
    1284             :  * index tuples in _bt_delitems_vacuum().  This keeps things
    1285             :                  * simple, and allows us to always avoid generating our own
    1286             :                  * conflicts.
    1287             :                  */
    1288             :                 Assert(!BTreeTupleIsPivot(itup));
    1289    12241288 :                 if (!BTreeTupleIsPosting(itup))
    1290             :                 {
    1291             :                     /* Regular tuple, standard table TID representation */
    1292    12227648 :                     if (callback(&itup->t_tid, callback_state))
    1293             :                     {
    1294     1331028 :                         deletable[ndeletable++] = offnum;
    1295     1331028 :                         nhtidsdead++;
    1296             :                     }
    1297             :                     else
    1298    10896620 :                         nhtidslive++;
    1299             :                 }
    1300             :                 else
    1301             :                 {
    1302             :                     BTVacuumPosting vacposting;
    1303             :                     int         nremaining;
    1304             : 
    1305             :                     /* Posting list tuple */
    1306       13640 :                     vacposting = btreevacuumposting(vstate, itup, offnum,
    1307             :                                                     &nremaining);
    1308       13640 :                     if (vacposting == NULL)
    1309             :                     {
    1310             :                         /*
    1311             :                          * All table TIDs from the posting tuple remain, so no
    1312             :                          * delete or update required
    1313             :                          */
    1314             :                         Assert(nremaining == BTreeTupleGetNPosting(itup));
    1315             :                     }
    1316       12248 :                     else if (nremaining > 0)
    1317             :                     {
    1319             :                         /*
    1320             :                          * Store metadata about posting list tuple in
    1321             :                          * updatable array for entire page.  Existing tuple
    1322             :                          * will be updated during the later call to
    1323             :                          * _bt_delitems_vacuum().
    1324             :                          */
    1325             :                         Assert(nremaining < BTreeTupleGetNPosting(itup));
    1326        8856 :                         updatable[nupdatable++] = vacposting;
    1327        8856 :                         nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
    1328             :                     }
    1329             :                     else
    1330             :                     {
    1331             :                         /*
    1332             :                          * All table TIDs from the posting list must be
    1333             :                          * deleted.  We'll delete the index tuple completely
    1334             :                          * (no update required).
    1335             :                          */
    1336             :                         Assert(nremaining == 0);
    1337        3392 :                         deletable[ndeletable++] = offnum;
    1338        3392 :                         nhtidsdead += BTreeTupleGetNPosting(itup);
    1339        3392 :                         pfree(vacposting);
    1340             :                     }
    1341             : 
    1342       13640 :                     nhtidslive += nremaining;
    1343             :                 }
    1344             :             }
    1345             :         }
    1346             : 
    1347             :         /*
    1348             :          * Apply any needed deletes or updates.  We issue just one
    1349             :          * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
    1350             :          */
    1351       99464 :         if (ndeletable > 0 || nupdatable > 0)
    1352             :         {
    1353             :             Assert(nhtidsdead >= ndeletable + nupdatable);
    1354       18184 :             _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
    1355             :                                 nupdatable);
    1356             : 
    1357       18184 :             stats->tuples_removed += nhtidsdead;
    1358             :             /* must recompute maxoff */
    1359       18184 :             maxoff = PageGetMaxOffsetNumber(page);
    1360             : 
    1361             :             /* can't leak memory here */
    1362       27040 :             for (int i = 0; i < nupdatable; i++)
    1363        8856 :                 pfree(updatable[i]);
    1364             :         }
    1365             :         else
    1366             :         {
    1367             :             /*
    1368             :              * If the leaf page has been split during this vacuum cycle, it
    1369             :              * seems worth expending a write to clear btpo_cycleid even if we
    1370             :              * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
    1371             :              * takes care of this.)  This ensures we won't process the page
    1372             :              * again.
    1373             :              *
    1374             :              * We treat this like a hint-bit update because there's no need to
    1375             :              * WAL-log it.
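                     :  * (As with other hint-bit-style changes, MarkBufferDirtyHint() can
                     :  * still emit a full-page image when checksums or wal_log_hints are
                     :  * enabled.)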
    1376             :              */
    1377             :             Assert(nhtidsdead == 0);
    1378       81280 :             if (vstate->cycleid != 0 &&
    1379       42236 :                 opaque->btpo_cycleid == vstate->cycleid)
    1380             :             {
    1381           2 :                 opaque->btpo_cycleid = 0;
    1382           2 :                 MarkBufferDirtyHint(buf, true);
    1383             :             }
    1384             :         }
    1385             : 
    1386             :         /*
    1387             :          * If the leaf page is now empty, try to delete it; else count the
    1388             :          * live tuples (live table TIDs in posting lists are counted as
    1389             :          * separate live tuples).  We don't delete when backtracking, though,
    1390             :          * since that would require teaching _bt_pagedel() about backtracking
    1391             :          * (doesn't seem worth adding more complexity to deal with that).
    1392             :          */
    1393       99464 :         if (minoff > maxoff)
    1394        4164 :             attempt_pagedel = (blkno == scanblkno);
    1395             :         else
    1396       95300 :             stats->num_index_tuples += nhtidslive;
    1397             : 
    1398             :         Assert(!attempt_pagedel || nhtidslive == 0);
    1399             :     }
    1400             : 
    1401      108966 :     if (attempt_pagedel)
    1402             :     {
    1403             :         MemoryContext oldcontext;
    1404             : 
    1405             :         /* Run pagedel in a temp context to avoid memory leakage */
    1406        4164 :         MemoryContextReset(vstate->pagedelcontext);
    1407        4164 :         oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
    1408             : 
    1409             :         /*
    1410             :          * We trust the _bt_pagedel return value because it does not include
    1411             :          * any page that a future call here from btvacuumscan is expected to
    1412             :          * count.  There will be no double-counting.
    1413             :          */
    1414             :         Assert(blkno == scanblkno);
    1415        4164 :         stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
    1416             : 
    1417        4164 :         MemoryContextSwitchTo(oldcontext);
    1418             :         /* pagedel released buffer, so we shouldn't */
    1419             :     }
    1420             :     else
    1421      104802 :         _bt_relbuf(rel, buf);
    1422             : 
    1423      108966 :     if (backtrack_to != P_NONE)
    1424             :     {
    1425           0 :         blkno = backtrack_to;
    1426           0 :         goto backtrack;
    1427             :     }
    1428             : }
    1429             : 
    1430             : /*
    1431             :  * btreevacuumposting --- determine TIDs still needed in posting list
    1432             :  *
    1433             :  * Returns metadata describing how to build a replacement tuple without
    1434             :  * the TIDs that VACUUM needs to delete.  The returned value is NULL in
    1435             :  * the common case where no changes are needed to the caller's posting
    1436             :  * list tuple (we avoid allocating memory here as an optimization).
    1437             :  *
    1438             :  * The number of TIDs that should remain in the posting list tuple is
    1439             :  * set for the caller in *nremaining.
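                     :  *
                     :  * A sketch of the returned struct's shape (assuming the
                     :  * BTVacuumPostingData declaration in nbtree.h, restated here for
                     :  * reference only):
                     :  *
                     :  *      IndexTuple    itup;           original posting list tuple
                     :  *      OffsetNumber  updatedoffset;  tuple's offset on the page
                     :  *      uint16        ndeletedtids;   # of entries in deletetids[]
                     :  *      uint16        deletetids[FLEXIBLE_ARRAY_MEMBER];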
    1440             :  */
    1441             : static BTVacuumPosting
    1442       13640 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
    1443             :                    OffsetNumber updatedoffset, int *nremaining)
    1444             : {
    1445       13640 :     int         live = 0;
    1446       13640 :     int         nitem = BTreeTupleGetNPosting(posting);
    1447       13640 :     ItemPointer items = BTreeTupleGetPosting(posting);
    1448       13640 :     BTVacuumPosting vacposting = NULL;
    1449             : 
    1450       82704 :     for (int i = 0; i < nitem; i++)
    1451             :     {
    1452       69064 :         if (!vstate->callback(items + i, vstate->callback_state))
    1453             :         {
    1454             :             /* Live table TID */
    1455       20708 :             live++;
    1456             :         }
    1457       48356 :         else if (vacposting == NULL)
    1458             :         {
    1459             :             /*
    1460             :              * First dead table TID encountered.
    1461             :              *
    1462             :              * It's now clear that we need to delete one or more dead table
    1463             :              * TIDs, so start maintaining metadata describing how to update
    1464             :              * existing posting list tuple.
    1465             :              */
    1466       12248 :             vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
    1467             :                                 nitem * sizeof(uint16));
    1468             : 
    1469       12248 :             vacposting->itup = posting;
    1470       12248 :             vacposting->updatedoffset = updatedoffset;
    1471       12248 :             vacposting->ndeletedtids = 0;
    1472       12248 :             vacposting->deletetids[vacposting->ndeletedtids++] = i;
    1473             :         }
    1474             :         else
    1475             :         {
    1476             :             /* Second or subsequent dead table TID */
    1477       36108 :             vacposting->deletetids[vacposting->ndeletedtids++] = i;
    1478             :         }
    1479             :     }
    1480             : 
    1481       13640 :     *nremaining = live;
    1482       13640 :     return vacposting;
    1483             : }
    1484             : 
    1485             : /*
    1486             :  *  btcanreturn() -- Check whether btree indexes support index-only scans.
    1487             :  *
    1488             :  * btrees always do, so this is trivial.
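                     :  * (This is the amcanreturn AM callback; attno is ignored because every
                     :  * btree index column can be returned in an index-only scan.)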
    1489             :  */
    1490             : bool
    1491      634114 : btcanreturn(Relation index, int attno)
    1492             : {
    1493      634114 :     return true;
    1494             : }

Generated by: LCOV version 1.13