LCOV - code coverage report
Current view: top level - src/backend/access/brin - brin.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 775 844 91.8 %
Date: 2025-11-22 15:17:49 Functions: 41 41 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * brin.c
       3             :  *      Implementation of BRIN indexes for Postgres
       4             :  *
       5             :  * See src/backend/access/brin/README for details.
       6             :  *
       7             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/brin/brin.c
      12             :  *
      13             :  * TODO
      14             :  *      * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
      15             :  */
      16             : #include "postgres.h"
      17             : 
      18             : #include "access/brin.h"
      19             : #include "access/brin_page.h"
      20             : #include "access/brin_pageops.h"
      21             : #include "access/brin_xlog.h"
      22             : #include "access/relation.h"
      23             : #include "access/reloptions.h"
      24             : #include "access/relscan.h"
      25             : #include "access/table.h"
      26             : #include "access/tableam.h"
      27             : #include "access/xloginsert.h"
      28             : #include "catalog/index.h"
      29             : #include "catalog/pg_am.h"
      30             : #include "commands/vacuum.h"
      31             : #include "miscadmin.h"
      32             : #include "pgstat.h"
      33             : #include "postmaster/autovacuum.h"
      34             : #include "storage/bufmgr.h"
      35             : #include "storage/freespace.h"
      36             : #include "tcop/tcopprot.h"
      37             : #include "utils/acl.h"
      38             : #include "utils/datum.h"
      39             : #include "utils/fmgrprotos.h"
      40             : #include "utils/guc.h"
      41             : #include "utils/index_selfuncs.h"
      42             : #include "utils/memutils.h"
      43             : #include "utils/rel.h"
      44             : #include "utils/tuplesort.h"
      45             : 
      46             : /* Magic numbers for parallel state sharing */
      47             : #define PARALLEL_KEY_BRIN_SHARED        UINT64CONST(0xB000000000000001)
      48             : #define PARALLEL_KEY_TUPLESORT          UINT64CONST(0xB000000000000002)
      49             : #define PARALLEL_KEY_QUERY_TEXT         UINT64CONST(0xB000000000000003)
      50             : #define PARALLEL_KEY_WAL_USAGE          UINT64CONST(0xB000000000000004)
      51             : #define PARALLEL_KEY_BUFFER_USAGE       UINT64CONST(0xB000000000000005)
      52             : 
      53             : /*
      54             :  * Status for index builds performed in parallel.  This is allocated in a
      55             :  * dynamic shared memory segment.
      56             :  */
      57             : typedef struct BrinShared
      58             : {
      59             :     /*
      60             :      * These fields are not modified during the build.  They primarily exist
      61             :      * for the benefit of worker processes that need to create state
      62             :      * corresponding to that used by the leader.
      63             :      */
      64             :     Oid         heaprelid;
      65             :     Oid         indexrelid;
      66             :     bool        isconcurrent;
      67             :     BlockNumber pagesPerRange;
      68             :     int         scantuplesortstates;
      69             : 
      70             :     /* Query ID, for report in worker processes */
      71             :     int64       queryid;
      72             : 
      73             :     /*
      74             :      * workersdonecv is used to monitor the progress of workers.  All parallel
      75             :      * participants must indicate that they are done before leader can use
      76             :      * results built by the workers (and before leader can write the data into
      77             :      * the index).
      78             :      */
      79             :     ConditionVariable workersdonecv;
      80             : 
      81             :     /*
      82             :      * mutex protects the mutable fields that follow it (this struct has no
      83             :      * "heapdesc" member; NOTE(review): confirm against users of the mutex).
      84             :      * Those fields contain status information of interest to BRIN index
      85             :      * builds that must work just the same when an index is built in parallel.
      86             :      */
      87             :     slock_t     mutex;
      88             : 
      89             :     /*
      90             :      * Mutable state that is maintained by workers, and reported back to
      91             :      * leader at end of the scans.
      92             :      *
      93             :      * nparticipantsdone is the number of worker processes that have finished.
      94             :      *
      95             :      * reltuples is the total number of input heap tuples.
      96             :      *
      97             :      * indtuples is the total number of tuples that made it into the index.
      98             :      */
      99             :     int         nparticipantsdone;
     100             :     double      reltuples;
     101             :     double      indtuples;
     102             : 
     103             :     /*
     104             :      * ParallelTableScanDescData data follows. Can't directly embed here, as
     105             :      * implementations of the parallel table scan desc interface might need
     106             :      * stronger alignment.  Use ParallelTableScanFromBrinShared() to reach it.
     107             :      */
     108             : } BrinShared;
     109             : 
     110             : /*
     111             :  * Return pointer to a BrinShared's parallel table scan, which is located
     112             :  * BUFFERALIGN(sizeof(BrinShared)) bytes past the start of the struct.
     113             :  * See shm_toc_allocate as to why BUFFERALIGN is used, rather than just
     114             :  * MAXALIGN.
     115             :  */
     116             : #define ParallelTableScanFromBrinShared(shared) \
     117             :     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))
     118             : 
     119             : /*
     120             :  * Status for leader in parallel index build.
     121             :  */
     122             : typedef struct BrinLeader
     123             : {
     124             :     /* parallel context itself */
     125             :     ParallelContext *pcxt;
     126             : 
     127             :     /*
     128             :      * nparticipanttuplesorts is the exact number of worker processes
     129             :      * successfully launched, plus one leader process if it participates as a
     130             :      * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     131             :      * participating as a worker).
     132             :      */
     133             :     int         nparticipanttuplesorts;
     134             : 
     135             :     /*
     136             :      * Leader process convenience pointers to shared state (leader avoids TOC
     137             :      * lookups).
     138             :      *
     139             :      * brinshared is the shared state for the entire build.  sharedsort is the
     140             :      * shared, tuplesort-managed state passed to each process's tuplesort.
     141             :      * snapshot is the snapshot used by the scan iff an MVCC snapshot is
     142             :      * required.
     143             :      */
     144             :     BrinShared *brinshared;
     145             :     Sharedsort *sharedsort;
     146             :     Snapshot    snapshot;
     147             :     WalUsage   *walusage;       /* presumably the PARALLEL_KEY_WAL_USAGE area -- TODO confirm */
     148             :     BufferUsage *bufferusage;   /* presumably the PARALLEL_KEY_BUFFER_USAGE area -- TODO confirm */
     149             : } BrinLeader;
     150             : 
     151             : /*
     152             :  * We use a BrinBuildState during initial construction of a BRIN index.
     153             :  * The running state is kept in a BrinMemTuple (see bs_dtuple).
     154             :  */
     155             : typedef struct BrinBuildState
     156             : {
     157             :     Relation    bs_irel;
     158             :     double      bs_numtuples;
     159             :     double      bs_reltuples;
     160             :     Buffer      bs_currentInsertBuf;
     161             :     BlockNumber bs_pagesPerRange;
     162             :     BlockNumber bs_currRangeStart;
     163             :     BlockNumber bs_maxRangeStart;
     164             :     BrinRevmap *bs_rmAccess;
     165             :     BrinDesc   *bs_bdesc;
     166             :     BrinMemTuple *bs_dtuple;
     167             : 
     168             :     BrinTuple  *bs_emptyTuple;
     169             :     Size        bs_emptyTupleLen;
     170             :     MemoryContext bs_context;
     171             : 
     172             :     /*
     173             :      * bs_leader is only present when a parallel index build is performed, and
     174             :      * only in the leader process.  NOTE(review): bs_sortstate below is said
     175             :      * to be used by workers too, so workers presumably have a build state.
     176             :      */
     177             :     BrinLeader *bs_leader;
     178             :     int         bs_worker_id;
     179             : 
     180             :     /*
     181             :      * The sortstate is used by workers (including the leader). It has to be
     182             :      * part of the build state, because that's the only thing passed to the
     183             :      * build callback etc.
     184             :      */
     185             :     Tuplesortstate *bs_sortstate;
     186             : } BrinBuildState;
     187             : 
     188             : /*
     189             :  * We use a BrinInsertState to capture running state spanning multiple
     190             :  * brininsert invocations, within the same command.
     191             :  */
     192             : typedef struct BrinInsertState
     193             : {
     194             :     BrinRevmap *bis_rmAccess;   /* result of brinRevmapInitialize() */
     195             :     BrinDesc   *bis_desc;       /* result of brin_build_desc() */
     196             :     BlockNumber bis_pages_per_range;    /* filled in by brinRevmapInitialize() */
     197             : } BrinInsertState;
     198             : 
     199             : /*
     200             :  * Struct used as "opaque" during index scans; set up by brinbeginscan.
     201             :  */
     202             : typedef struct BrinOpaque
     203             : {
     204             :     BlockNumber bo_pagesPerRange;   /* filled in by brinRevmapInitialize() */
     205             :     BrinRevmap *bo_rmAccess;    /* result of brinRevmapInitialize() */
     206             :     BrinDesc   *bo_bdesc;       /* result of brin_build_desc() */
     207             : } BrinOpaque;
     208             : 
     209             : #define BRIN_ALL_BLOCKRANGES    InvalidBlockNumber  /* presumably "no specific range" for a pageRange argument -- TODO confirm */
     210             : 
     211             : static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
     212             :                                                   BrinRevmap *revmap,
     213             :                                                   BlockNumber pagesPerRange,
     214             :                                                   BlockNumber tablePages);
     215             : static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
     216             : static void terminate_brin_buildstate(BrinBuildState *state);
     217             : static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
     218             :                           bool include_partial, double *numSummarized, double *numExisting);
     219             : static void form_and_insert_tuple(BrinBuildState *state);
     220             : static void form_and_spill_tuple(BrinBuildState *state);
     221             : static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
     222             :                          BrinTuple *b);
     223             : static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
     224             : static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
     225             :                                 BrinMemTuple *dtup, const Datum *values, const bool *nulls);
     226             : static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
     227             : static void brin_fill_empty_ranges(BrinBuildState *state,
     228             :                                    BlockNumber prevRange, BlockNumber nextRange);
     229             : 
     230             : /* support routines for parallel index builds */
     231             : static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
     232             :                                  bool isconcurrent, int request);
     233             : static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
     234             : static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
     235             : static double _brin_parallel_heapscan(BrinBuildState *state);
     236             : static double _brin_parallel_merge(BrinBuildState *state);
     237             : static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,
     238             :                                                Relation heap, Relation index);
     239             : static void _brin_parallel_scan_and_build(BrinBuildState *state,
     240             :                                           BrinShared *brinshared,
     241             :                                           Sharedsort *sharedsort,
     242             :                                           Relation heap, Relation index,
     243             :                                           int sortmem, bool progress);
     244             : 
     245             : /*
     246             :  * BRIN handler function: return a freshly made IndexAmRoutine node filled
     247             :  * with BRIN's access method parameters and callbacks.
     248             :  */
     249             : Datum
     250        4404 : brinhandler(PG_FUNCTION_ARGS)
     251             : {
     252        4404 :     IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
     253             : 
     254        4404 :     amroutine->amstrategies = 0;
     255        4404 :     amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
     256        4404 :     amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;
     257        4404 :     amroutine->amcanorder = false;
     258        4404 :     amroutine->amcanorderbyop = false;
     259        4404 :     amroutine->amcanhash = false;
     260        4404 :     amroutine->amconsistentequality = false;
     261        4404 :     amroutine->amconsistentordering = false;
     262        4404 :     amroutine->amcanbackward = false;
     263        4404 :     amroutine->amcanunique = false;
     264        4404 :     amroutine->amcanmulticol = true;
     265        4404 :     amroutine->amoptionalkey = true;
     266        4404 :     amroutine->amsearcharray = false;
     267        4404 :     amroutine->amsearchnulls = true;
     268        4404 :     amroutine->amstorage = true;
     269        4404 :     amroutine->amclusterable = false;
     270        4404 :     amroutine->ampredlocks = false;
     271        4404 :     amroutine->amcanparallel = false;   /* no parallel-scan callbacks are set below */
     272        4404 :     amroutine->amcanbuildparallel = true;
     273        4404 :     amroutine->amcaninclude = false;
     274        4404 :     amroutine->amusemaintenanceworkmem = false;
     275        4404 :     amroutine->amsummarizing = true;
     276        4404 :     amroutine->amparallelvacuumoptions =
     277             :         VACUUM_OPTION_PARALLEL_CLEANUP;
     278        4404 :     amroutine->amkeytype = InvalidOid;
     279             : 
     280        4404 :     amroutine->ambuild = brinbuild;
     281        4404 :     amroutine->ambuildempty = brinbuildempty;
     282        4404 :     amroutine->aminsert = brininsert;
     283        4404 :     amroutine->aminsertcleanup = brininsertcleanup;
     284        4404 :     amroutine->ambulkdelete = brinbulkdelete;
     285        4404 :     amroutine->amvacuumcleanup = brinvacuumcleanup;
     286        4404 :     amroutine->amcanreturn = NULL;
     287        4404 :     amroutine->amcostestimate = brincostestimate;
     288        4404 :     amroutine->amgettreeheight = NULL;
     289        4404 :     amroutine->amoptions = brinoptions;
     290        4404 :     amroutine->amproperty = NULL;
     291        4404 :     amroutine->ambuildphasename = NULL;
     292        4404 :     amroutine->amvalidate = brinvalidate;
     293        4404 :     amroutine->amadjustmembers = NULL;
     294        4404 :     amroutine->ambeginscan = brinbeginscan;
     295        4404 :     amroutine->amrescan = brinrescan;
     296        4404 :     amroutine->amgettuple = NULL;   /* scans are served by amgetbitmap only */
     297        4404 :     amroutine->amgetbitmap = bringetbitmap;
     298        4404 :     amroutine->amendscan = brinendscan;
     299        4404 :     amroutine->ammarkpos = NULL;
     300        4404 :     amroutine->amrestrpos = NULL;
     301        4404 :     amroutine->amestimateparallelscan = NULL;
     302        4404 :     amroutine->aminitparallelscan = NULL;
     303        4404 :     amroutine->amparallelrescan = NULL;
     304        4404 :     amroutine->amtranslatestrategy = NULL;
     305        4404 :     amroutine->amtranslatecmptype = NULL;
     306             : 
     307        4404 :     PG_RETURN_POINTER(amroutine);
     308             : }
     309             : 
     310             : /*
     311             :  * Initialize a BrinInsertState to maintain state to be used across multiple
     312             :  * tuple inserts, within the same command; it is cached in ii_AmCache.
     313             :  */
     314             : static BrinInsertState *
     315        1122 : initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
     316             : {
     317             :     BrinInsertState *bistate;
     318             :     MemoryContext oldcxt;
     319             : 
     320        1122 :     oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);  /* allocate in the IndexInfo's context */
     321        1122 :     bistate = palloc0(sizeof(BrinInsertState));
     322        1122 :     bistate->bis_desc = brin_build_desc(idxRel);
     323        1122 :     bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
     324             :                                                  &bistate->bis_pages_per_range);
     325        1122 :     indexInfo->ii_AmCache = bistate;    /* later brininsert calls pick it up from here */
     326        1122 :     MemoryContextSwitchTo(oldcxt);
     327             : 
     328        1122 :     return bistate;
     329             : }
     330             : 
     331             : /*
     332             :  * A tuple in the heap is being inserted.  To keep a BRIN index up to date,
     333             :  * we need to obtain the relevant index tuple and compare its stored values
     334             :  * with those of the new tuple.  If the tuple values are not consistent with
     335             :  * the summary tuple, we need to update the index tuple.
     336             :  *
     337             :  * If autosummarization is enabled, check if we need to summarize the previous
     338             :  * page range.
     339             :  *
     340             :  * If the range is not currently summarized (i.e. the revmap returns NULL for
     341             :  * it), there's nothing to do for this tuple.
     342             :  */
     343             : bool
     344      126136 : brininsert(Relation idxRel, Datum *values, bool *nulls,
     345             :            ItemPointer heaptid, Relation heapRel,
     346             :            IndexUniqueCheck checkUnique,
     347             :            bool indexUnchanged,
     348             :            IndexInfo *indexInfo)
     349             : {
     350             :     BlockNumber pagesPerRange;
     351             :     BlockNumber origHeapBlk;
     352             :     BlockNumber heapBlk;
     353      126136 :     BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
     354             :     BrinRevmap *revmap;
     355             :     BrinDesc   *bdesc;
     356      126136 :     Buffer      buf = InvalidBuffer;
     357      126136 :     MemoryContext tupcxt = NULL;
     358      126136 :     MemoryContext oldcxt = CurrentMemoryContext;
     359      126136 :     bool        autosummarize = BrinGetAutoSummarize(idxRel);
     360             : 
     361             :     /*
     362             :      * If first time through in this statement, initialize the insert state
     363             :      * that we keep for all the inserts in the command.
     364             :      */
     365      126136 :     if (!bistate)
     366        1122 :         bistate = initialize_brin_insertstate(idxRel, indexInfo);
     367             : 
     368      126136 :     revmap = bistate->bis_rmAccess;
     369      126136 :     bdesc = bistate->bis_desc;
     370      126136 :     pagesPerRange = bistate->bis_pages_per_range;
     371             : 
     372             :     /*
     373             :      * origHeapBlk is the block number where the insertion occurred.  heapBlk
     374             :      * is the first block in the corresponding page range.
     375             :      */
     376      126136 :     origHeapBlk = ItemPointerGetBlockNumber(heaptid);
     377      126136 :     heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
     378             : 
     379             :     for (;;)                    /* repeated only when brin_doupdate fails */
     380           0 :     {
     381      126136 :         bool        need_insert = false;
     382             :         OffsetNumber off;
     383             :         BrinTuple  *brtup;
     384             :         BrinMemTuple *dtup;
     385             : 
     386      126136 :         CHECK_FOR_INTERRUPTS();
     387             : 
     388             :         /*
     389             :          * If auto-summarization is enabled and we just inserted the first
     390             :          * tuple into the first block of a new non-first page range, request a
     391             :          * summarization run of the previous range.
     392             :          */
     393      126136 :         if (autosummarize &&
     394         290 :             heapBlk > 0 &&
     395         290 :             heapBlk == origHeapBlk &&
     396         290 :             ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
     397             :         {
     398          16 :             BlockNumber lastPageRange = heapBlk - 1;    /* last block of the preceding range */
     399             :             BrinTuple  *lastPageTuple;
     400             : 
     401             :             lastPageTuple =
     402          16 :                 brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
     403             :                                          NULL, BUFFER_LOCK_SHARE);
     404          16 :             if (!lastPageTuple)
     405             :             {
     406             :                 bool        recorded;
     407             : 
     408          12 :                 recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
     409             :                                                  RelationGetRelid(idxRel),
     410             :                                                  lastPageRange);
     411          12 :                 if (!recorded)
     412           0 :                     ereport(LOG,
     413             :                             (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     414             :                              errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
     415             :                                     RelationGetRelationName(idxRel),
     416             :                                     lastPageRange)));
     417             :             }
     418             :             else
     419           4 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);    /* preceding range already has a summary tuple */
     420             :         }
     421             : 
     422      126136 :         brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
     423             :                                          NULL, BUFFER_LOCK_SHARE);
     424             : 
     425             :         /* if range is unsummarized, there's nothing to do */
     426      126136 :         if (!brtup)
     427       78254 :             break;
     428             : 
     429             :         /* First pass that found a summary tuple?  Set up a scratch context. */
     430       47882 :         if (tupcxt == NULL)
     431             :         {
     432       47882 :             tupcxt = AllocSetContextCreate(CurrentMemoryContext,
     433             :                                            "brininsert cxt",
     434             :                                            ALLOCSET_DEFAULT_SIZES);
     435       47882 :             MemoryContextSwitchTo(tupcxt);
     436             :         }
     437             : 
     438       47882 :         dtup = brin_deform_tuple(bdesc, brtup, NULL);
     439             : 
     440       47882 :         need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);
     441             : 
     442       47882 :         if (!need_insert)
     443             :         {
     444             :             /*
     445             :              * The tuple is consistent with the new values, so there's nothing
     446             :              * to do.
     447             :              */
     448       23954 :             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     449             :         }
     450             :         else
     451             :         {
     452       23928 :             Page        page = BufferGetPage(buf);
     453       23928 :             ItemId      lp = PageGetItemId(page, off);
     454             :             Size        origsz;
     455             :             BrinTuple  *origtup;
     456             :             Size        newsz;
     457             :             BrinTuple  *newtup;
     458             :             bool        samepage;
     459             : 
     460             :             /*
     461             :              * Make a copy of the old tuple, so that we can compare it after
     462             :              * re-acquiring the lock.
     463             :              */
     464       23928 :             origsz = ItemIdGetLength(lp);
     465       23928 :             origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
     466             : 
     467             :             /*
     468             :              * Before releasing the lock, check if we can attempt a same-page
     469             :              * update.  Another process could insert a tuple concurrently in
     470             :              * the same page though, so downstream we must be prepared to cope
     471             :              * if this turns out to not be possible after all.
     472             :              */
     473       23928 :             newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
     474       23928 :             samepage = brin_can_do_samepage_update(buf, origsz, newsz);
     475       23928 :             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     476             : 
     477             :             /*
     478             :              * Try to update the tuple.  If this doesn't work for whatever
     479             :              * reason, we need to restart from the top; the revmap might be
     480             :              * pointing at a different tuple for this block now, so we need to
     481             :              * recompute to ensure both our new heap tuple and the other
     482             :              * inserter's are covered by the combined tuple.  It might be that
     483             :              * we don't need to update at all.
     484             :              */
     485       23928 :             if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
     486             :                                buf, off, origtup, origsz, newtup, newsz,
     487             :                                samepage))
     488             :             {
     489             :                 /* no luck; drop this attempt's allocations and start over */
     490           0 :                 MemoryContextReset(tupcxt);
     491           0 :                 continue;
     492             :             }
     493             :         }
     494             : 
     495             :         /* success! */
     496       47882 :         break;
     497             :     }
     498             : 
     499      126136 :     if (BufferIsValid(buf))
     500       47886 :         ReleaseBuffer(buf);
     501      126136 :     MemoryContextSwitchTo(oldcxt);  /* leave tupcxt before deleting it */
     502      126136 :     if (tupcxt != NULL)
     503       47882 :         MemoryContextDelete(tupcxt);
     504             : 
     505      126136 :     return false;
     506             : }
     507             : 
     508             : /*
     509             :  * Callback to clean up the cached BrinInsertState once all inserts are done.
     510             :  */
     511             : void
     512        1156 : brininsertcleanup(Relation index, IndexInfo *indexInfo)
     513             : {
     514        1156 :     BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
     515             : 
     516             :     /* bail out if no insert state was ever cached in ii_AmCache */
     517        1156 :     if (bistate == NULL)
     518          34 :         return;
     519             : 
     520             :     /* do this first to avoid dangling pointer if we fail partway through */
     521        1122 :     indexInfo->ii_AmCache = NULL;
     522             : 
     523             :     /*
     524             :      * Clean up the revmap. Note that the brinDesc has already been cleaned up
     525             :      * as part of its own memory context.
     526             :      */
     527        1122 :     brinRevmapTerminate(bistate->bis_rmAccess);
     528        1122 :     pfree(bistate);
     529             : }
     530             : 
     531             : /*
     532             :  * Initialize state for a BRIN index scan.
     533             :  *
     534             :  * We read the metapage here to determine the pages-per-range number that this
     535             :  * index was built with.  Note that since this cannot be changed while we're
     536             :  * holding a lock on the index, there is no need to recompute it in brinrescan.
     537             :  */
     538             : IndexScanDesc
     539        2946 : brinbeginscan(Relation r, int nkeys, int norderbys)
     540             : {
     541             :     IndexScanDesc scan;
     542             :     BrinOpaque *opaque;
     543             : 
     544        2946 :     scan = RelationGetIndexScan(r, nkeys, norderbys);
     545             : 
     546        2946 :     opaque = palloc_object(BrinOpaque);
     547        2946 :     opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);   /* also sets bo_pagesPerRange */
     548        2946 :     opaque->bo_bdesc = brin_build_desc(r);
     549        2946 :     scan->opaque = opaque;
     550             : 
     551        2946 :     return scan;
     552             : }
     553             : 
     554             : /*
     555             :  * Execute the index scan.
     556             :  *
     557             :  * This works by reading index TIDs from the revmap, and obtaining the index
     558             :  * tuples pointed to by them; the summary values in the index tuples are
     559             :  * compared to the scan keys.  We return into the TID bitmap all the pages in
     560             :  * ranges corresponding to index tuples that match the scan keys.
     561             :  *
     562             :  * If a TID from the revmap is read as InvalidTID, we know that range is
     563             :  * unsummarized.  Pages in those ranges need to be returned regardless of scan
     564             :  * keys.
                     :  *
                     :  * The return value is ten times the number of pages added to the bitmap; it
                     :  * is only a rough stand-in for the number of matching heap tuples (see the XXX
                     :  * note just before the return statement).
     565             :  */
     566             : int64
     567        2946 : bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     568             : {
     569        2946 :     Relation    idxRel = scan->indexRelation;
     570        2946 :     Buffer      buf = InvalidBuffer;
     571             :     BrinDesc   *bdesc;
     572             :     Oid         heapOid;
     573             :     Relation    heapRel;
     574             :     BrinOpaque *opaque;
     575             :     BlockNumber nblocks;
     576        2946 :     int64       totalpages = 0;
     577             :     FmgrInfo   *consistentFn;
     578             :     MemoryContext oldcxt;
     579             :     MemoryContext perRangeCxt;
     580             :     BrinMemTuple *dtup;
     581        2946 :     BrinTuple  *btup = NULL;
     582        2946 :     Size        btupsz = 0;
     583             :     ScanKey   **keys,
     584             :               **nullkeys;
     585             :     int        *nkeys,
     586             :                *nnullkeys;
     587             :     char       *ptr;
     588             :     Size        len;
     589             :     char       *tmp PG_USED_FOR_ASSERTS_ONLY;
     590             : 
     591        2946 :     opaque = (BrinOpaque *) scan->opaque;
     592        2946 :     bdesc = opaque->bo_bdesc;
     593        2946 :     pgstat_count_index_scan(idxRel);
     594        2946 :     if (scan->instrument)
     595        2946 :         scan->instrument->nsearches++;
     596             : 
     597             :     /*
     598             :      * We need to know the size of the table so that we know how long to
     599             :      * iterate on the revmap.
     600             :      */
     601        2946 :     heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
     602        2946 :     heapRel = table_open(heapOid, AccessShareLock);
     603        2946 :     nblocks = RelationGetNumberOfBlocks(heapRel);
     604        2946 :     table_close(heapRel, AccessShareLock);
     605             : 
     606             :     /*
     607             :      * Make room for the consistent support procedures of indexed columns.  We
     608             :      * don't look them up here; we do that lazily the first time we see a scan
     609             :      * key reference each of them.  We rely on zeroing fn_oid to InvalidOid.
     610             :      */
     611        2946 :     consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);
     612             : 
     613             :     /*
     614             :      * Make room for per-attribute lists of scan keys that we'll pass to the
     615             :      * consistent support procedure. We don't know which attributes have scan
     616             :      * keys, so we allocate space for all attributes. That may use more memory
     617             :      * but it's probably cheaper than determining which attributes are used.
     618             :      *
     619             :      * We keep null and regular keys separate, so that we can pass just the
     620             :      * regular keys to the consistent function easily.
     621             :      *
     622             :      * To reduce the allocation overhead, we allocate one big chunk and then
     623             :      * carve it into smaller arrays ourselves. All the pieces have exactly the
     624             :      * same lifetime, so that's OK.
     625             :      *
     626             :      * XXX The widest index can have 32 attributes, so the amount of wasted
     627             :      * memory is negligible. We could invent a more compact approach (with
     628             :      * just space for used attributes) but that would make the matching more
     629             :      * complex so it's not a good trade-off.
     630             :      */
     631        2946 :     len =
     632        2946 :         MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) +  /* regular keys */
     633        2946 :         MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
     634        2946 :         MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
     635        2946 :         MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) +  /* NULL keys */
     636        2946 :         MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
     637        2946 :         MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
     638             : 
     639        2946 :     ptr = palloc(len);
     640        2946 :     tmp = ptr;
     641             : 
     642        2946 :     keys = (ScanKey **) ptr;
     643        2946 :     ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
     644             : 
     645        2946 :     nullkeys = (ScanKey **) ptr;
     646        2946 :     ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
     647             : 
     648        2946 :     nkeys = (int *) ptr;
     649        2946 :     ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
     650             : 
     651        2946 :     nnullkeys = (int *) ptr;
     652        2946 :     ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
     653             : 
     654       69978 :     for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
     655             :     {
     656       67032 :         keys[i] = (ScanKey *) ptr;
     657       67032 :         ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
     658             : 
     659       67032 :         nullkeys[i] = (ScanKey *) ptr;
     660       67032 :         ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
     661             :     }
     662             : 
     663             :     Assert(tmp + len == ptr);
     664             : 
     665             :     /* zero the number of keys */
     666        2946 :     memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
     667        2946 :     memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
     668             : 
     669             :     /* Preprocess the scan keys - split them into per-attribute arrays. */
     670        5892 :     for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
     671             :     {
     672        2946 :         ScanKey     key = &scan->keyData[keyno];
     673        2946 :         AttrNumber  keyattno = key->sk_attno;
     674             : 
     675             :         /*
     676             :          * The collation of the scan key must match the collation used in the
     677             :          * index column (but only if the search is not IS NULL/ IS NOT NULL).
     678             :          * Otherwise we shouldn't be using this index ...
     679             :          */
     680             :         Assert((key->sk_flags & SK_ISNULL) ||
     681             :                (key->sk_collation ==
     682             :                 TupleDescAttr(bdesc->bd_tupdesc,
     683             :                               keyattno - 1)->attcollation));
     684             : 
     685             :         /*
     686             :          * First time we see this index attribute, so init as needed.
     687             :          *
     688             :          * This is a bit of an overkill - we don't know how many scan keys are
     689             :          * there for this attribute, so we simply allocate the largest number
     690             :          * possible (as if all keys were for this attribute). This may waste a
     691             :          * bit of memory, but we only expect small number of scan keys in
     692             :          * general, so this should be negligible, and repeated repalloc calls
     693             :          * are not free either.
     694             :          */
     695        2946 :         if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
     696             :         {
     697             :             FmgrInfo   *tmp; /* NOTE(review): shadows the assert-only 'tmp' declared at function scope */
     698             : 
     699             :             /* First time we see this attribute, so no key/null keys. */
     700             :             Assert(nkeys[keyattno - 1] == 0);
     701             :             Assert(nnullkeys[keyattno - 1] == 0);
     702             : 
     703        2946 :             tmp = index_getprocinfo(idxRel, keyattno,
     704             :                                     BRIN_PROCNUM_CONSISTENT);
     705        2946 :             fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
     706             :                            CurrentMemoryContext);
     707             :         }
     708             : 
     709             :         /* Add key to the proper per-attribute array. */
     710        2946 :         if (key->sk_flags & SK_ISNULL)
     711             :         {
     712          36 :             nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
     713          36 :             nnullkeys[keyattno - 1]++;
     714             :         }
     715             :         else
     716             :         {
     717        2910 :             keys[keyattno - 1][nkeys[keyattno - 1]] = key;
     718        2910 :             nkeys[keyattno - 1]++;
     719             :         }
     720             :     }
     721             : 
     722             :     /*
                     :      * Allocate an initial in-memory tuple, outside of the per-range memcxt
                     :      * (this happens before we switch into that context just below).
                     :      */
     723        2946 :     dtup = brin_new_memtuple(bdesc);
     724             : 
     725             :     /*
     726             :      * Setup and use a per-range memory context, which is reset every time we
     727             :      * loop below.  This avoids having to free the tuples within the loop.
     728             :      */
     729        2946 :     perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
     730             :                                         "bringetbitmap cxt",
     731             :                                         ALLOCSET_DEFAULT_SIZES);
     732        2946 :     oldcxt = MemoryContextSwitchTo(perRangeCxt);
     733             : 
     734             :     /*
     735             :      * Now scan the revmap.  We start by querying for heap page 0,
     736             :      * incrementing by the number of pages per range; this gives us a full
     737             :      * view of the table.  We make use of uint64 for heapBlk as a BlockNumber
     738             :      * could wrap for tables with close to 2^32 pages.
     739             :      */
     740      194598 :     for (uint64 heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
     741             :     {
     742             :         bool        addrange;
     743      191652 :         bool        gottuple = false;
     744             :         BrinTuple  *tup;
     745             :         OffsetNumber off;
     746             :         Size        size;
     747             : 
     748      191652 :         CHECK_FOR_INTERRUPTS();
     749             : 
     750      191652 :         MemoryContextReset(perRangeCxt);
     751             : 
     752      191652 :         tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, (BlockNumber) heapBlk, &buf,
     753             :                                        &off, &size, BUFFER_LOCK_SHARE);
     754      191652 :         if (tup)
     755             :         {
     756      189936 :             gottuple = true;
     757      189936 :             btup = brin_copy_tuple(tup, size, btup, &btupsz);
     758      189936 :             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     759             :         }
     760             : 
     761             :         /*
     762             :          * For page ranges with no indexed tuple, we must return the whole
     763             :          * range; otherwise, compare it to the scan keys.
     764             :          */
     765      191652 :         if (!gottuple)
     766             :         {
     767        1716 :             addrange = true;
     768             :         }
     769             :         else
     770             :         {
     771      189936 :             dtup = brin_deform_tuple(bdesc, btup, dtup);
     772      189936 :             if (dtup->bt_placeholder)
     773             :             {
     774             :                 /*
     775             :                  * Placeholder tuples are always returned, regardless of the
     776             :                  * values stored in them.
     777             :                  */
     778           0 :                 addrange = true;
     779             :             }
     780             :             else
     781             :             {
     782             :                 int         attno;
     783             : 
     784             :                 /*
     785             :                  * Compare scan keys with summary values stored for the range.
     786             :                  * If scan keys are matched, the page range must be added to
     787             :                  * the bitmap.  We initially assume the range needs to be
     788             :                  * added; in particular this serves the case where there are
     789             :                  * no keys.
     790             :                  */
     791      189936 :                 addrange = true;
     792     4704068 :                 for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
     793             :                 {
     794             :                     BrinValues *bval;
     795             :                     Datum       add;
     796             :                     Oid         collation;
     797             : 
     798             :                     /*
     799             :                      * skip attributes without any scan keys (both regular and
     800             :                      * IS [NOT] NULL)
     801             :                      */
     802     4567734 :                     if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
     803     4377798 :                         continue;
     804             : 
     805      189936 :                     bval = &dtup->bt_columns[attno - 1];
     806             : 
     807             :                     /*
     808             :                      * If the BRIN tuple indicates that this range is empty,
     809             :                      * we can skip it: there's nothing to match.  We don't
     810             :                      * need to examine the next columns.
     811             :                      */
     812      189936 :                     if (dtup->bt_empty_range)
     813             :                     {
     814           0 :                         addrange = false;
     815           0 :                         break;
     816             :                     }
     817             : 
     818             :                     /*
     819             :                      * First check if there are any IS [NOT] NULL scan keys,
     820             :                      * and if we're violating them. In that case we can
     821             :                      * terminate early, without invoking the support function.
     822             :                      *
     823             :                      * As there may be more keys, we can only determine
     824             :                      * mismatch within this loop.
     825             :                      */
     826      189936 :                     if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
     827      189936 :                         !check_null_keys(bval, nullkeys[attno - 1],
     828      189936 :                                          nnullkeys[attno - 1]))
     829             :                     {
     830             :                         /*
     831             :                          * If any of the IS [NOT] NULL keys failed, the page
     832             :                          * range as a whole can't pass. So terminate the loop.
     833             :                          */
     834         996 :                         addrange = false;
     835         996 :                         break;
     836             :                     }
     837             : 
     838             :                     /*
     839             :                      * So either there are no IS [NOT] NULL keys, or all
     840             :                      * passed. If there are no regular scan keys, we're done -
     841             :                      * the page range matches. If there are regular keys, but
     842             :                      * the page range is marked as 'all nulls' it can't
     843             :                      * possibly pass (we're assuming the operators are
     844             :                      * strict).
     845             :                      */
     846             : 
     847             :                     /* No regular scan keys - page range as a whole passes. */
     848      188940 :                     if (!nkeys[attno - 1])
     849        1236 :                         continue;
     850             : 
     851             :                     Assert((nkeys[attno - 1] > 0) &&
     852             :                            (nkeys[attno - 1] <= scan->numberOfKeys));
     853             : 
     854             :                     /* If it is all nulls, it cannot possibly be consistent. */
     855      187704 :                     if (bval->bv_allnulls)
     856             :                     {
     857         378 :                         addrange = false;
     858         378 :                         break;
     859             :                     }
     860             : 
     861             :                     /*
     862             :                      * Collation from the first key (has to be the same for
     863             :                      * all keys for the same attribute).
     864             :                      */
     865      187326 :                     collation = keys[attno - 1][0]->sk_collation;
     866             : 
     867             :                     /*
     868             :                      * Check whether the scan key is consistent with the page
     869             :                      * range values; if so, have the pages in the range added
     870             :                      * to the output bitmap.
     871             :                      *
     872             :                      * The opclass may or may not support processing of
     873             :                      * multiple scan keys. We can determine that based on the
     874             :                      * number of arguments - functions with extra parameter
     875             :                      * (number of scan keys) do support this, otherwise we
     876             :                      * have to simply pass the scan keys one by one.
     877             :                      */
     878      187326 :                     if (consistentFn[attno - 1].fn_nargs >= 4)
     879             :                     {
     880             :                         /* Check all keys at once */
     881       39594 :                         add = FunctionCall4Coll(&consistentFn[attno - 1],
     882             :                                                 collation,
     883             :                                                 PointerGetDatum(bdesc),
     884             :                                                 PointerGetDatum(bval),
     885       39594 :                                                 PointerGetDatum(keys[attno - 1]),
     886       39594 :                                                 Int32GetDatum(nkeys[attno - 1]));
     887       39594 :                         addrange = DatumGetBool(add);
     888             :                     }
     889             :                     else
     890             :                     {
     891             :                         /*
     892             :                          * Check keys one by one
     893             :                          *
     894             :                          * When there are multiple scan keys, failure to meet
     895             :                          * the criteria for a single one of them is enough to
     896             :                          * discard the range as a whole, so break out of the
     897             :                          * loop as soon as a false return value is obtained.
     898             :                          */
     899             :                         int         keyno;
     900             : 
     901      258078 :                         for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
     902             :                         {
     903      147732 :                             add = FunctionCall3Coll(&consistentFn[attno - 1],
     904      147732 :                                                     keys[attno - 1][keyno]->sk_collation,
     905             :                                                     PointerGetDatum(bdesc),
     906             :                                                     PointerGetDatum(bval),
     907      147732 :                                                     PointerGetDatum(keys[attno - 1][keyno]));
     908      147732 :                             addrange = DatumGetBool(add);
     909      147732 :                             if (!addrange)
     910       37386 :                                 break;
     911             :                         }
     912             :                     }
     913             : 
     914             :                     /*
     915             :                      * If we found a scan key eliminating the range, no need
     916             :                      * to check additional ones.
     917             :                      */
     918      187326 :                     if (!addrange)
     919       52228 :                         break;
     920             :                 }
     921             :             }
     922             :         }
     923             : 
     924             :         /* add the pages in the range to the output bitmap, if needed */
     925      191652 :         if (addrange)
     926             :         {
     927             :             uint64      pageno;
     928             : 
     929      138050 :             for (pageno = heapBlk;
     930      286020 :                  pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
     931      147970 :                  pageno++)
     932             :             {
     933      147970 :                 MemoryContextSwitchTo(oldcxt);
     934      147970 :                 tbm_add_page(tbm, pageno);
     935      147970 :                 totalpages++;
     936      147970 :                 MemoryContextSwitchTo(perRangeCxt);
     937             :             }
     938             :         }
     939             :     }
     940             : 
     941        2946 :     MemoryContextSwitchTo(oldcxt);
     942        2946 :     MemoryContextDelete(perRangeCxt);
     943             : 
     944        2946 :     if (buf != InvalidBuffer)
     945        2946 :         ReleaseBuffer(buf);
     946             : 
     947             :     /*
     948             :      * XXX We have an approximation of the number of *pages* that our scan
     949             :      * returns, but we don't have a precise idea of the number of heap tuples
     950             :      * involved.
     951             :      */
     952        2946 :     return totalpages * 10;
     953             : }
     954             : 
     955             : /*
     956             :  * Re-initialize state for a BRIN index scan
                     :  *
                     :  * When scankey is non-NULL and the scan has at least one key, the first
                     :  * scan->numberOfKeys entries of scankey are copied into scan->keyData.  The
                     :  * nscankeys, orderbys and norderbys arguments are not used here.
     957             :  */
     958             : void
     959        2946 : brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
     960             :            ScanKey orderbys, int norderbys)
     961             : {
     962             :     /*
     963             :      * Other index AMs preprocess the scan keys at this point, or sometime
     964             :      * early during the scan; this lets them optimize by removing redundant
     965             :      * keys, or doing early returns when they are impossible to satisfy; see
     966             :      * _bt_preprocess_keys for an example.  Something like that could be added
     967             :      * here someday, too.
     968             :      */
     969             : 
     970        2946 :     if (scankey && scan->numberOfKeys > 0)
     971        2946 :         memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
     972        2946 : }
     973             : 
     974             : /*
     975             :  * Close down a BRIN index scan
                     :  *
                     :  * Releases, in order, the revmap access object, the BRIN descriptor, and the
                     :  * BrinOpaque struct hanging off scan->opaque.  scan->opaque itself is not
                     :  * reset, so it must not be dereferenced once this returns.
     976             :  */
     977             : void
     978        2946 : brinendscan(IndexScanDesc scan)
     979             : {
     980        2946 :     BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
     981             : 
     982        2946 :     brinRevmapTerminate(opaque->bo_rmAccess);
     983        2946 :     brin_free_desc(opaque->bo_bdesc);
     984        2946 :     pfree(opaque);
     985        2946 : }
     986             : 
     987             : /*
     988             :  * Per-heap-tuple callback for table_index_build_scan.
     989             :  *
     990             :  * Note we don't worry about the page range at the end of the table here; it is
     991             :  * present in the build state struct after we're called the last time, but not
     992             :  * inserted into the index.  Caller must ensure to do so, if appropriate.
                     :  *
                     :  * brstate is the BrinBuildState for the build in progress; only the block
                     :  * number of tid is used.  tupleIsAlive is not consulted.
     993             :  */
     994             : static void
     995      728452 : brinbuildCallback(Relation index,
     996             :                   ItemPointer tid,
     997             :                   Datum *values,
     998             :                   bool *isnull,
     999             :                   bool tupleIsAlive,
    1000             :                   void *brstate)
    1001             : {
    1002      728452 :     BrinBuildState *state = (BrinBuildState *) brstate;
    1003             :     BlockNumber thisblock;
    1004             : 
    1005      728452 :     thisblock = ItemPointerGetBlockNumber(tid);
    1006             : 
    1007             :     /*
    1008             :      * If we're in a block that belongs to a future range, summarize what
    1009             :      * we've got and start afresh.  Note the scan might have skipped many
    1010             :      * pages, if they were devoid of live tuples; make sure to insert index
    1011             :      * tuples for those too.
                     :      *
                     :      * NOTE(review): the debug message below prints bs_currRangeStart +
                     :      * bs_pagesPerRange as the upper bound, which is one past the last block
                     :      * the loop condition treats as belonging to the current range.
    1012             :      */
    1013      730748 :     while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
    1014             :     {
    1015             : 
    1016             :         BRIN_elog((DEBUG2,
    1017             :                    "brinbuildCallback: completed a range: %u--%u",
    1018             :                    state->bs_currRangeStart,
    1019             :                    state->bs_currRangeStart + state->bs_pagesPerRange));
    1020             : 
    1021             :         /* create the index tuple and insert it */
    1022        2296 :         form_and_insert_tuple(state);
    1023             : 
    1024             :         /* set state to correspond to the next range */
    1025        2296 :         state->bs_currRangeStart += state->bs_pagesPerRange;
    1026             : 
    1027             :         /* re-initialize state for it */
    1028        2296 :         brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
    1029             :     }
    1030             : 
    1031             :     /* Accumulate the current tuple into the running state */
    1032      728452 :     (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
    1033             :                                values, isnull);
    1034      728452 : }
    1035             : 
    1036             : /*
    1037             :  * Per-heap-tuple callback for table_index_build_scan with parallelism.
    1038             :  *
    1039             :  * A version of the callback used by parallel index builds. The main difference
    1040             :  * is that instead of writing the BRIN tuples into the index, we write them
    1041             :  * into a shared tuplesort, and leave the insertion up to the leader (which may
    1042             :  * reorder them a bit etc.). The callback also does not generate empty ranges,
    1043             :  * those will be added by the leader when merging results from workers.
                     :  *
                     :  * brstate is the BrinBuildState for the build in progress; only the block
                     :  * number of tid is used.  tupleIsAlive is not consulted.
    1044             :  */
    1045             : static void
    1046        7962 : brinbuildCallbackParallel(Relation index,
    1047             :                           ItemPointer tid,
    1048             :                           Datum *values,
    1049             :                           bool *isnull,
    1050             :                           bool tupleIsAlive,
    1051             :                           void *brstate)
    1052             : {
    1053        7962 :     BrinBuildState *state = (BrinBuildState *) brstate;
    1054             :     BlockNumber thisblock;
    1055             : 
    1056        7962 :     thisblock = ItemPointerGetBlockNumber(tid);
    1057             : 
    1058             :     /*
    1059             :      * If we're in a block that belongs to a different range, summarize what
    1060             :      * we've got and start afresh.  Note the scan might have skipped many
    1061             :      * pages, if they were devoid of live tuples; we do not create empty BRIN
    1062             :      * ranges here - the leader is responsible for filling them in.
    1063             :      *
    1064             :      * Unlike serial builds, parallel index builds allow synchronized seqscans
    1065             :      * (because that's what parallel scans do). This means the block may wrap
    1066             :      * around to the beginning of the relation, so the condition needs to
    1067             :      * check for both future and past ranges.
    1068             :      */
    1069        7962 :     if ((thisblock < state->bs_currRangeStart) ||
    1070        7962 :         (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))
    1071             :     {
    1072             : 
    1073             :         BRIN_elog((DEBUG2,
    1074             :                    "brinbuildCallbackParallel: completed a range: %u--%u",
    1075             :                    state->bs_currRangeStart,
    1076             :                    state->bs_currRangeStart + state->bs_pagesPerRange));
    1077             : 
    1078             :         /* create the index tuple and write it into the tuplesort */
    1079          40 :         form_and_spill_tuple(state);
    1080             : 
    1081             :         /*
    1082             :          * Set state to correspond to the next range (for this block).
    1083             :          *
    1084             :          * This skips ranges that are either empty (and so we don't get any
    1085             :          * tuples to summarize), or processed by other workers. We can't
    1086             :          * differentiate those cases here easily, so we leave it up to the
    1087             :          * leader to fill empty ranges where needed.
                     :          *
                     :          * The new start is thisblock rounded down to a multiple of
                     :          * bs_pagesPerRange.
    1088             :          */
    1089             :         state->bs_currRangeStart
    1090          40 :             = state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);
    1091             : 
    1092             :         /* re-initialize state for it */
    1093          40 :         brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
    1094             :     }
    1095             : 
    1096             :     /* Accumulate the current tuple into the running state */
    1097        7962 :     (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
    1098             :                                values, isnull);
    1099        7962 : }
    1100             : 
    1101             : /*
    1102             :  * brinbuild() -- build a new BRIN index and return heap/index tuple counts.
    1103             :  */
    1104             : IndexBuildResult *
    1105         368 : brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
    1106             : {
    1107             :     IndexBuildResult *result;
    1108             :     double      reltuples;
    1109             :     double      idxtuples;
    1110             :     BrinRevmap *revmap;
    1111             :     BrinBuildState *state;
    1112             :     Buffer      meta;
    1113             :     BlockNumber pagesPerRange;
    1114             : 
    1115             :     /*
    1116             :      * We expect to be called exactly once for any index relation.
    1117             :      */
    1118         368 :     if (RelationGetNumberOfBlocks(index) != 0)
    1119           0 :         elog(ERROR, "index \"%s\" already contains data",
    1120             :              RelationGetRelationName(index));
    1121             : 
    1122             :     /*
    1123             :      * Critical section not required, because on error the creation of the
    1124             :      * whole relation will be rolled back.
    1125             :      */
    1126             : 
    1127         368 :     meta = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL,
    1128             :                              EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
    1129             :     Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
    1130             : 
    1131         368 :     brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
    1132             :                        BRIN_CURRENT_VERSION);
    1133         368 :     MarkBufferDirty(meta);
    1134             : 
    1135         368 :     if (RelationNeedsWAL(index))
    1136             :     {
    1137             :         xl_brin_createidx xlrec;
    1138             :         XLogRecPtr  recptr;
    1139             :         Page        page;
    1140             : 
    1141         180 :         xlrec.version = BRIN_CURRENT_VERSION;
    1142         180 :         xlrec.pagesPerRange = BrinGetPagesPerRange(index);
    1143             : 
    1144         180 :         XLogBeginInsert();
    1145         180 :         XLogRegisterData(&xlrec, SizeOfBrinCreateIdx);
    1146         180 :         XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);
    1147             : 
    1148         180 :         recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
    1149             : 
    1150         180 :         page = BufferGetPage(meta);
    1151         180 :         PageSetLSN(page, recptr);
    1152             :     }
    1153             : 
    1154         368 :     UnlockReleaseBuffer(meta);
    1155             : 
    1156             :     /*
    1157             :      * Initialize our state, including the deformed tuple state.
    1158             :      */
    1159         368 :     revmap = brinRevmapInitialize(index, &pagesPerRange);
    1160         368 :     state = initialize_brin_buildstate(index, revmap, pagesPerRange,
    1161             :                                        RelationGetNumberOfBlocks(heap));
    1162             : 
    1163             :     /*
    1164             :      * Attempt to launch parallel worker scan when required
    1165             :      *
    1166             :      * XXX plan_create_index_workers makes the number of workers dependent on
    1167             :      * maintenance_work_mem, requiring 32MB for each worker. That makes sense
    1168             :      * for btree, but not for BRIN, which can do with much less memory. So
    1169             :      * maybe make that somehow less strict, optionally?
    1170             :      */
    1171         368 :     if (indexInfo->ii_ParallelWorkers > 0)
    1172          10 :         _brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
    1173             :                              indexInfo->ii_ParallelWorkers);
    1174             : 
    1175             :     /*
    1176             :      * If parallel build requested and at least one worker process was
    1177             :      * successfully launched, set up coordination state, wait for workers to
    1178             :      * complete. Then read all tuples from the shared tuplesort and insert
    1179             :      * them into the index.
    1180             :      *
    1181             :      * In serial mode, simply scan the table and build the index one index
    1182             :      * tuple at a time.
    1183             :      */
    1184         368 :     if (state->bs_leader)
    1185             :     {
    1186             :         SortCoordinate coordinate;
    1187             : 
    1188           8 :         coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
    1189           8 :         coordinate->isWorker = false;
    1190           8 :         coordinate->nParticipants =
    1191           8 :             state->bs_leader->nparticipanttuplesorts;
    1192           8 :         coordinate->sharedsort = state->bs_leader->sharedsort;
    1193             : 
    1194             :         /*
    1195             :          * Begin leader tuplesort.
    1196             :          *
    1197             :          * In cases where parallelism is involved, the leader receives the
    1198             :          * same share of maintenance_work_mem as a serial sort (it is
    1199             :          * generally treated in the same way as a serial sort once we return).
    1200             :          * Parallel worker Tuplesortstates will have received only a fraction
    1201             :          * of maintenance_work_mem, though.
    1202             :          *
    1203             :          * We rely on the lifetime of the Leader Tuplesortstate almost not
    1204             :          * overlapping with any worker Tuplesortstate's lifetime.  There may
    1205             :          * be some small overlap, but that's okay because we rely on leader
    1206             :          * Tuplesortstate only allocating a small, fixed amount of memory
    1207             :          * here. When its tuplesort_performsort() is called (by our caller),
    1208             :          * and significant amounts of memory are likely to be used, all
    1209             :          * workers must have already freed almost all memory held by their
    1210             :          * Tuplesortstates (they are about to go away completely, too).  The
    1211             :          * overall effect is that maintenance_work_mem always represents an
    1212             :          * absolute high watermark on the amount of memory used by a CREATE
    1213             :          * INDEX operation, regardless of the use of parallelism or any other
    1214             :          * factor.
    1215             :          */
    1216           8 :         state->bs_sortstate =
    1217           8 :             tuplesort_begin_index_brin(maintenance_work_mem, coordinate,
    1218             :                                        TUPLESORT_NONE);
    1219             : 
    1220             :         /* scan the relation and merge per-worker results */
    1221           8 :         reltuples = _brin_parallel_merge(state);
    1222             : 
    1223           8 :         _brin_end_parallel(state->bs_leader, state);
    1224             :     }
    1225             :     else                        /* no parallel index build */
    1226             :     {
    1227             :         /*
    1228             :          * Now scan the relation.  No syncscan allowed here because we want
    1229             :          * the heap blocks in physical order (we want to produce the ranges
    1230             :          * starting from block 0, and the callback also relies on this to not
    1231             :          * generate summary for the same range twice).
    1232             :          */
    1233         360 :         reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
    1234             :                                            brinbuildCallback, state, NULL);
    1235             : 
    1236             :         /*
    1237             :          * process the final batch
    1238             :          *
    1239             :          * XXX Note this does not update state->bs_currRangeStart, i.e. it
    1240             :          * stays set to the last range added to the index. This is OK, because
    1241             :          * that's what brin_fill_empty_ranges expects.
    1242             :          */
    1243         360 :         form_and_insert_tuple(state);
    1244             : 
    1245             :         /*
    1246             :          * Backfill the final ranges with empty data.
    1247             :          *
    1248             :          * This saves us from doing what amounts to full table scans when the
    1249             :          * index is used with a predicate like WHERE (nonnull_column IS NULL),
    1250             :          * or other very selective predicates.
    1251             :          */
    1252         360 :         brin_fill_empty_ranges(state,
    1253             :                                state->bs_currRangeStart,
    1254             :                                state->bs_maxRangeStart);
    1255             :     }
    1256             : 
    1257             :     /* release resources, saving the index tuple count before the state is terminated */
    1258         368 :     idxtuples = state->bs_numtuples;
    1259         368 :     brinRevmapTerminate(state->bs_rmAccess);
    1260         368 :     terminate_brin_buildstate(state);
    1261             : 
    1262             :     /*
    1263             :      * Return statistics
    1264             :      */
    1265         368 :     result = palloc_object(IndexBuildResult);
    1266             : 
    1267         368 :     result->heap_tuples = reltuples;
    1268         368 :     result->index_tuples = idxtuples;
    1269             : 
    1270         368 :     return result;
    1271             : }
    1272             : 
    1273             : void
    1274           6 : brinbuildempty(Relation index)
    1275             : {
    1276             :     Buffer      metabuf;
    1277             : 
    1278             :     /* An empty BRIN index has a metapage only; extend the init fork by one page. */
    1279           6 :     metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
    1280             :                                 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
    1281             : 
    1282             :     /* Initialize and xlog metabuffer, all within one critical section. */
    1283           6 :     START_CRIT_SECTION();
    1284           6 :     brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
    1285             :                        BRIN_CURRENT_VERSION);
    1286           6 :     MarkBufferDirty(metabuf);
    1287           6 :     log_newpage_buffer(metabuf, true);
    1288           6 :     END_CRIT_SECTION();
    1289             : 
    1290           6 :     UnlockReleaseBuffer(metabuf);
    1291           6 : }
    1292             : 
    1293             : /*
    1294             :  * brinbulkdelete
    1295             :  *      Since there are no per-heap-tuple index tuples in BRIN indexes,
    1296             :  *      there's not a lot we can do here; the callback is never invoked.
    1297             :  *
    1298             :  * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
    1299             :  * tuple is deleted), signalling the need to re-run summarization on the
    1300             :  * affected range.  Would need to add an extra flag in brintuples for that.
    1301             :  */
    1302             : IndexBulkDeleteResult *
    1303          26 : brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
    1304             :                IndexBulkDeleteCallback callback, void *callback_state)
    1305             : {
    1306             :     /* allocate zeroed stats if first time through, else return the existing struct as-is */
    1307          26 :     if (stats == NULL)
    1308          26 :         stats = palloc0_object(IndexBulkDeleteResult);
    1309             : 
    1310          26 :     return stats;
    1311             : }
    1312             : 
    1313             : /*
    1314             :  * This routine is in charge of "vacuuming" a BRIN index: we just summarize
    1315             :  * ranges that are currently unsummarized (a no-op for ANALYZE-only calls).
    1316             :  */
    1317             : IndexBulkDeleteResult *
    1318         100 : brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
    1319             : {
    1320             :     Relation    heapRel;
    1321             : 
    1322             :     /* No-op in ANALYZE ONLY mode; stats is returned unchanged (possibly NULL) */
    1323         100 :     if (info->analyze_only)
    1324           6 :         return stats;
    1325             : 
    1326          94 :     if (!stats)
    1327          74 :         stats = palloc0_object(IndexBulkDeleteResult);
    1328          94 :     stats->num_pages = RelationGetNumberOfBlocks(info->index);
    1329             :     /* when freshly allocated, the rest of stats is initialized by zeroing */
    1330             : 
    1331          94 :     heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
    1332             :                          AccessShareLock);
    1333             : 
    1334          94 :     brin_vacuum_scan(info->index, info->strategy);
    1335             : 
    1336          94 :     brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
    1337             :                   &stats->num_index_tuples, &stats->num_index_tuples);  /* NOTE(review): same address passed for both output args */
    1338             : 
    1339          94 :     table_close(heapRel, AccessShareLock);
    1340             : 
    1341          94 :     return stats;
    1342             : }
    1343             : 
    1344             : /*
    1345             :  * reloptions processor for BRIN indexes (pages_per_range, autosummarize)
    1346             :  */
    1347             : bytea *
    1348        1160 : brinoptions(Datum reloptions, bool validate)
    1349             : {
    1350             :     static const relopt_parse_elt tab[] = {
    1351             :         {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
    1352             :         {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
    1353             :     };
    1354             : 
    1355        1160 :     return (bytea *) build_reloptions(reloptions, validate,
    1356             :                                       RELOPT_KIND_BRIN,
    1357             :                                       sizeof(BrinOptions),
    1358             :                                       tab, lengthof(tab));
    1359             : }
    1360             : 
    1361             : /*
    1362             :  * SQL-callable function to summarize all ranges of an index that are not
    1363             :  * currently summarized; calls brin_summarize_range with BRIN_ALL_BLOCKRANGES.
    1364             :  */
    1365             : Datum
    1366          76 : brin_summarize_new_values(PG_FUNCTION_ARGS)
    1367             : {
    1368          76 :     Datum       relation = PG_GETARG_DATUM(0);
    1369             : 
    1370          76 :     return DirectFunctionCall2(brin_summarize_range,
    1371             :                                relation,
    1372             :                                Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
    1373             : }
    1374             : 
    1375             : /*
    1376             :  * SQL-callable function to summarize the indicated page range, if not already
    1377             :  * summarized.  If the second argument is BRIN_ALL_BLOCKRANGES, all
    1378             :  * unsummarized ranges are summarized.  Returns numSummarized as an int4.
    1379             :  */
    1380             : Datum
    1381         210 : brin_summarize_range(PG_FUNCTION_ARGS)
    1382             : {
    1383         210 :     Oid         indexoid = PG_GETARG_OID(0);
    1384         210 :     int64       heapBlk64 = PG_GETARG_INT64(1);
    1385             :     BlockNumber heapBlk;
    1386             :     Oid         heapoid;
    1387             :     Relation    indexRel;
    1388             :     Relation    heapRel;
    1389             :     Oid         save_userid;
    1390             :     int         save_sec_context;
    1391             :     int         save_nestlevel;
    1392         210 :     double      numSummarized = 0;
    1393             : 
    1394         210 :     if (RecoveryInProgress())
    1395           0 :         ereport(ERROR,
    1396             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1397             :                  errmsg("recovery is in progress"),
    1398             :                  errhint("BRIN control functions cannot be executed during recovery.")));
    1399             : 
    1400         210 :     if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
    1401          36 :         ereport(ERROR,
    1402             :                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    1403             :                  errmsg("block number out of range: %" PRId64, heapBlk64)));
    1404         174 :     heapBlk = (BlockNumber) heapBlk64;
    1405             : 
    1406             :     /*
    1407             :      * We must lock table before index to avoid deadlocks.  However, if the
    1408             :      * passed indexoid isn't an index then IndexGetRelation() will fail.
    1409             :      * Rather than emitting a not-very-helpful error message, postpone
    1410             :      * complaining, expecting that the is-it-an-index test below will fail.
    1411             :      */
    1412         174 :     heapoid = IndexGetRelation(indexoid, true);
    1413         174 :     if (OidIsValid(heapoid))
    1414             :     {
    1415         156 :         heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
    1416             : 
    1417             :         /*
    1418             :          * Autovacuum calls us.  For its benefit, switch to the table owner's
    1419             :          * userid, so that any index functions are run as that user.  Also
    1420             :          * lock down security-restricted operations and arrange to make GUC
    1421             :          * variable changes local to this command.  This is harmless, albeit
    1422             :          * unnecessary, when called from SQL, because we fail shortly if the
    1423             :          * user does not own the index.
    1424             :          */
    1425         156 :         GetUserIdAndSecContext(&save_userid, &save_sec_context);
    1426         156 :         SetUserIdAndSecContext(heapRel->rd_rel->relowner,
    1427             :                                save_sec_context | SECURITY_RESTRICTED_OPERATION);
    1428         156 :         save_nestlevel = NewGUCNestLevel();
    1429         156 :         RestrictSearchPath();
    1430             :     }
    1431             :     else
    1432             :     {
    1433          18 :         heapRel = NULL;
    1434             :         /* Set these just to suppress "uninitialized variable" warnings */
    1435          18 :         save_userid = InvalidOid;
    1436          18 :         save_sec_context = -1;
    1437          18 :         save_nestlevel = -1;
    1438             :     }
    1439             : 
    1440         174 :     indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
    1441             : 
    1442             :     /* Must be a BRIN index */
    1443         156 :     if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
    1444         156 :         indexRel->rd_rel->relam != BRIN_AM_OID)
    1445          18 :         ereport(ERROR,
    1446             :                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
    1447             :                  errmsg("\"%s\" is not a BRIN index",
    1448             :                         RelationGetRelationName(indexRel))));
    1449             : 
    1450             :     /* The saved (calling) user must own the index (comparable to privileges needed for VACUUM) */
    1451         138 :     if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
    1452           0 :         aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
    1453           0 :                        RelationGetRelationName(indexRel));
    1454             : 
    1455             :     /*
    1456             :      * Since we did the IndexGetRelation call above without any lock, it's
    1457             :      * barely possible that a race against an index drop/recreation could have
    1458             :      * netted us the wrong table.  Recheck.
    1459             :      */
    1460         138 :     if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
    1461           0 :         ereport(ERROR,
    1462             :                 (errcode(ERRCODE_UNDEFINED_TABLE),
    1463             :                  errmsg("could not open parent table of index \"%s\"",
    1464             :                         RelationGetRelationName(indexRel))));
    1465             : 
    1466             :     /* only summarize valid indexes, else just log at DEBUG1; see gin_clean_pending_list() */
    1467         138 :     if (indexRel->rd_index->indisvalid)
    1468         138 :         brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
    1469             :     else
    1470           0 :         ereport(DEBUG1,
    1471             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1472             :                  errmsg("index \"%s\" is not valid",
    1473             :                         RelationGetRelationName(indexRel))));
    1474             : 
    1475             :     /* Roll back any GUC changes executed by index functions */
    1476         138 :     AtEOXact_GUC(false, save_nestlevel);
    1477             : 
    1478             :     /* Restore userid and security context */
    1479         138 :     SetUserIdAndSecContext(save_userid, save_sec_context);
    1480             : 
    1481         138 :     relation_close(indexRel, ShareUpdateExclusiveLock);
    1482         138 :     relation_close(heapRel, ShareUpdateExclusiveLock);
    1483             : 
    1484         138 :     PG_RETURN_INT32((int32) numSummarized);
    1485             : }
    1486             : 
    1487             : /*
    1488             :  * SQL-callable interface to mark a range as no longer summarized; returns void
    1489             :  */
    1490             : Datum
    1491         104 : brin_desummarize_range(PG_FUNCTION_ARGS)
    1492             : {
    1493         104 :     Oid         indexoid = PG_GETARG_OID(0);
    1494         104 :     int64       heapBlk64 = PG_GETARG_INT64(1);
    1495             :     BlockNumber heapBlk;
    1496             :     Oid         heapoid;
    1497             :     Relation    heapRel;
    1498             :     Relation    indexRel;
    1499             :     bool        done;
    1500             : 
    1501         104 :     if (RecoveryInProgress())
    1502           0 :         ereport(ERROR,
    1503             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1504             :                  errmsg("recovery is in progress"),
    1505             :                  errhint("BRIN control functions cannot be executed during recovery.")));
    1506             : 
    1507         104 :     if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
    1508          18 :         ereport(ERROR,
    1509             :                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    1510             :                  errmsg("block number out of range: %" PRId64,
    1511             :                         heapBlk64)));
    1512          86 :     heapBlk = (BlockNumber) heapBlk64;
    1513             : 
    1514             :     /*
    1515             :      * We must lock table before index to avoid deadlocks.  However, if the
    1516             :      * passed indexoid isn't an index then IndexGetRelation() will fail.
    1517             :      * Rather than emitting a not-very-helpful error message, postpone
    1518             :      * complaining, expecting that the is-it-an-index test below will fail.
    1519             :      *
    1520             :      * Unlike brin_summarize_range(), autovacuum never calls this.  Hence, we
    1521             :      * don't switch userid.
    1522             :      */
    1523          86 :     heapoid = IndexGetRelation(indexoid, true);
    1524          86 :     if (OidIsValid(heapoid))
    1525          86 :         heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
    1526             :     else
    1527           0 :         heapRel = NULL;
    1528             : 
    1529          86 :     indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
    1530             : 
    1531             :     /* Must be a BRIN index */
    1532          86 :     if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
    1533          86 :         indexRel->rd_rel->relam != BRIN_AM_OID)
    1534           0 :         ereport(ERROR,
    1535             :                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
    1536             :                  errmsg("\"%s\" is not a BRIN index",
    1537             :                         RelationGetRelationName(indexRel))));
    1538             : 
    1539             :     /* Current user must own the index (comparable to privileges needed for VACUUM) */
    1540          86 :     if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))
    1541           0 :         aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
    1542           0 :                        RelationGetRelationName(indexRel));
    1543             : 
    1544             :     /*
    1545             :      * Since we did the IndexGetRelation call above without any lock, it's
    1546             :      * barely possible that a race against an index drop/recreation could have
    1547             :      * netted us the wrong table.  Recheck.
    1548             :      */
    1549          86 :     if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
    1550           0 :         ereport(ERROR,
    1551             :                 (errcode(ERRCODE_UNDEFINED_TABLE),
    1552             :                  errmsg("could not open parent table of index \"%s\"",
    1553             :                         RelationGetRelationName(indexRel))));
    1554             : 
    1555             :     /* only touch valid indexes, else just log at DEBUG1; see gin_clean_pending_list() */
    1556          86 :     if (indexRel->rd_index->indisvalid)
    1557             :     {
    1558             :         /* the revmap does the hard work; retry until it reports done */
    1559             :         do
    1560             :         {
    1561          86 :             done = brinRevmapDesummarizeRange(indexRel, heapBlk);
    1562             :         }
    1563          86 :         while (!done);
    1564             :     }
    1565             :     else
    1566           0 :         ereport(DEBUG1,
    1567             :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    1568             :                  errmsg("index \"%s\" is not valid",
    1569             :                         RelationGetRelationName(indexRel))));
    1570             : 
    1571          86 :     relation_close(indexRel, ShareUpdateExclusiveLock);
    1572          86 :     relation_close(heapRel, ShareUpdateExclusiveLock);
    1573             : 
    1574          86 :     PG_RETURN_VOID();
    1575             : }
    1576             : 
    1577             : /*
    1578             :  * Build a BrinDesc used to create or scan a BRIN index, in its own memory context
    1579             :  */
    1580             : BrinDesc *
    1581        4582 : brin_build_desc(Relation rel)
    1582             : {
    1583             :     BrinOpcInfo **opcinfo;
    1584             :     BrinDesc   *bdesc;
    1585             :     TupleDesc   tupdesc;
    1586        4582 :     int         totalstored = 0;
    1587             :     int         keyno;
    1588             :     long        totalsize;
    1589             :     MemoryContext cxt;
    1590             :     MemoryContext oldcxt;
    1591             : 
    1592        4582 :     cxt = AllocSetContextCreate(CurrentMemoryContext,
    1593             :                                 "brin desc cxt",
    1594             :                                 ALLOCSET_SMALL_SIZES);
    1595        4582 :     oldcxt = MemoryContextSwitchTo(cxt);
    1596        4582 :     tupdesc = RelationGetDescr(rel);
    1597             : 
    1598             :     /*
    1599             :      * Obtain BrinOpcInfo for each indexed column.  While at it, accumulate
    1600             :      * the number of columns stored, since the number is opclass-defined.
    1601             :      */
    1602        4582 :     opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
    1603       76082 :     for (keyno = 0; keyno < tupdesc->natts; keyno++)
    1604             :     {
    1605             :         FmgrInfo   *opcInfoFn;
    1606       71500 :         Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
    1607             : 
    1608       71500 :         opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
    1609             : 
    1610      143000 :         opcinfo[keyno] = (BrinOpcInfo *)
    1611       71500 :             DatumGetPointer(FunctionCall1(opcInfoFn, ObjectIdGetDatum(attr->atttypid)));
    1612       71500 :         totalstored += opcinfo[keyno]->oi_nstored;
    1613             :     }
    1614             : 
    1615             :     /* Allocate our result struct (header plus one bd_info slot per column) and fill it in */
    1616        4582 :     totalsize = offsetof(BrinDesc, bd_info) +
    1617        4582 :         sizeof(BrinOpcInfo *) * tupdesc->natts;
    1618             : 
    1619        4582 :     bdesc = palloc(totalsize);
    1620        4582 :     bdesc->bd_context = cxt;
    1621        4582 :     bdesc->bd_index = rel;
    1622        4582 :     bdesc->bd_tupdesc = tupdesc;
    1623        4582 :     bdesc->bd_disktdesc = NULL; /* generated lazily */
    1624        4582 :     bdesc->bd_totalstored = totalstored;
    1625             : 
    1626       76082 :     for (keyno = 0; keyno < tupdesc->natts; keyno++)
    1627       71500 :         bdesc->bd_info[keyno] = opcinfo[keyno];
    1628        4582 :     pfree(opcinfo);
    1629             : 
    1630        4582 :     MemoryContextSwitchTo(oldcxt);
    1631             : 
    1632        4582 :     return bdesc;
    1633             : }
    1634             : 
    1635             : void
    1636        3446 : brin_free_desc(BrinDesc *bdesc)
    1637             : {
                           /*
                            * Free a BRIN descriptor created by brin_build_desc.  That routine
                            * allocates the descriptor while its private context (saved in
                            * bd_context) is current, so deleting the context is all that is
                            * needed.  bdesc itself lives in that context and must not be used
                            * after this call.
                            */
    1638             :     /* make sure the tupdesc is still valid */
    1639             :     Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
    1640             :     /* no need for retail pfree: the context deletion releases it all */
    1641        3446 :     MemoryContextDelete(bdesc->bd_context);
    1642        3446 : }
    1643             : 
    1644             : /*
    1645             :  * Fetch index's statistical data into *stats
                        *
                        * Reads the metapage (block BRIN_METAPAGE_BLKNO) of "index" under a share
                        * lock and copies out the two fields callers need:
                        *   stats->pagesPerRange   taken verbatim from the metapage
                        *   stats->revmapNumPages  computed as lastRevmapPage - 1
                        * The buffer is unlocked and released before returning.
    1646             :  */
    1647             : void
    1648       10730 : brinGetStats(Relation index, BrinStatsData *stats)
    1649             : {
    1650             :     Buffer      metabuffer;
    1651             :     Page        metapage;
    1652             :     BrinMetaPageData *metadata;
    1653             : 
    1654       10730 :     metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
    1655       10730 :     LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
    1656       10730 :     metapage = BufferGetPage(metabuffer);
    1657       10730 :     metadata = (BrinMetaPageData *) PageGetContents(metapage);
    1658             : 
    1659       10730 :     stats->pagesPerRange = metadata->pagesPerRange;
                           /*
                            * NOTE(review): presumably lastRevmapPage is a block number and the
                            * revmap starts right after the metapage, hence the -1 -- confirm
                            * against the revmap layout.
                            */
    1660       10730 :     stats->revmapNumPages = metadata->lastRevmapPage - 1;
    1661             : 
    1662       10730 :     UnlockReleaseBuffer(metabuffer);
    1663       10730 : }
    1664             : 
    1665             : /*
    1666             :  * Initialize a BrinBuildState appropriate to create tuples on the given index.
                        *
                        * idxRel is stored as the state's index relation and used to build a
                        * fresh BrinDesc; revmap is stored as the state's revmap accessor;
                        * pagesPerRange is the range size in heap blocks; tablePages is the heap
                        * size in blocks and is used only to compute bs_maxRangeStart.
                        *
                        * The state is palloc'd in the current memory context, which is also
                        * remembered in bs_context.  terminate_brin_buildstate is the matching
                        * cleanup routine.
    1667             :  */
    1668             : static BrinBuildState *
    1669         468 : initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
    1670             :                            BlockNumber pagesPerRange, BlockNumber tablePages)
    1671             : {
    1672             :     BrinBuildState *state;
    1673         468 :     BlockNumber lastRange = 0;
    1674             : 
    1675         468 :     state = palloc_object(BrinBuildState);
    1676             : 
    1677         468 :     state->bs_irel = idxRel;
    1678         468 :     state->bs_numtuples = 0;
    1679         468 :     state->bs_reltuples = 0;
    1680         468 :     state->bs_currentInsertBuf = InvalidBuffer;
    1681         468 :     state->bs_pagesPerRange = pagesPerRange;
    1682         468 :     state->bs_currRangeStart = 0;
    1683         468 :     state->bs_rmAccess = revmap;
    1684         468 :     state->bs_bdesc = brin_build_desc(idxRel);
    1685         468 :     state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
    1686         468 :     state->bs_leader = NULL;
    1687         468 :     state->bs_worker_id = 0;
    1688         468 :     state->bs_sortstate = NULL;
    1692             : 
    1693             :     /* Remember the memory context to use for an empty tuple, if needed. */
    1694         468 :     state->bs_context = CurrentMemoryContext;
    1695         468 :     state->bs_emptyTuple = NULL;
    1696         468 :     state->bs_emptyTupleLen = 0;
    1697             : 
    1698             :     /*
    1699             :      * Calculate the start of the last page range. Page numbers are 0-based,
    1700             :      * so to calculate the index we need to subtract one. The integer division
    1701             :      * gives us the index of the page range.
                            *
                            * For an empty table (tablePages == 0) lastRange stays 0, so
                            * bs_maxRangeStart below ends up equal to pagesPerRange.
    1702             :      */
    1703         468 :     if (tablePages > 0)
    1704         344 :         lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
    1705             : 
    1706             :     /* Now calculate the start of the next range. */
    1707         468 :     state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
    1708             : 
    1709         468 :     return state;
    1710             : }
    1711             : 
    1712             : /*
    1713             :  * Release resources associated with a BrinBuildState.
                        *
                        * Drops the pin on the current insert buffer (if any) after publishing
                        * that page's free space in the FSM, then frees the BrinDesc, the deformed
                        * tuple and the state struct itself.  "state" is invalid on return.
                        *
                        * NOTE(review): bs_emptyTuple is not freed here; presumably it is
                        * reclaimed together with bs_context -- confirm.
    1714             :  */
    1715             : static void
    1716         456 : terminate_brin_buildstate(BrinBuildState *state)
    1717             : {
    1718             :     /*
    1719             :      * Release the last index buffer used.  We might as well ensure that
    1720             :      * whatever free space remains in that page is available in FSM, too.
    1721             :      */
    1722         456 :     if (!BufferIsInvalid(state->bs_currentInsertBuf))
    1723             :     {
    1724             :         Page        page;
    1725             :         Size        freespace;
    1726             :         BlockNumber blk;
    1727             : 
                               /*
                                * Read the free space and block number while the buffer is still
                                * pinned; only the saved values are used after ReleaseBuffer.
                                */
    1728         368 :         page = BufferGetPage(state->bs_currentInsertBuf);
    1729         368 :         freespace = PageGetFreeSpace(page);
    1730         368 :         blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
    1731         368 :         ReleaseBuffer(state->bs_currentInsertBuf);
    1732         368 :         RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
    1733         368 :         FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
    1734             :     }
    1735             : 
    1736         456 :     brin_free_desc(state->bs_bdesc);
    1737         456 :     pfree(state->bs_dtuple);
    1738         456 :     pfree(state);
    1739         456 : }
    1740             : 
    1741             : /*
    1742             :  * On the given BRIN index, summarize the heap page range that corresponds
    1743             :  * to the heap block number given.
    1744             :  *
    1745             :  * This routine can run in parallel with insertions into the heap.  To avoid
    1746             :  * missing those values from the summary tuple, we first insert a placeholder
    1747             :  * index tuple into the index, then execute the heap scan; transactions
    1748             :  * concurrent with the scan update the placeholder tuple.  After the scan, we
    1749             :  * union the placeholder tuple with the one computed by this routine.  The
    1750             :  * update of the index value happens in a loop, so that if somebody updates
    1751             :  * the placeholder tuple after we read it, we detect the case and try again.
    1752             :  * This ensures that the concurrently inserted tuples are not lost.
    1753             :  *
    1754             :  * A further corner case is this routine being asked to summarize the partial
    1755             :  * range at the end of the table.  heapNumBlks is the (possibly outdated)
    1756             :  * table size; if we notice that the requested range lies beyond that size,
    1757             :  * we re-compute the table size after inserting the placeholder tuple, to
    1758             :  * avoid missing pages that were appended recently.
                        *
                        * heapBlk must be the first block of a page range (asserted below).  The
                        * heap scan is driven with brinbuildCallback and "state"; afterwards
                        * state->bs_dtuple is formed into the on-disk tuple that replaces the
                        * placeholder.  The pin on the placeholder's buffer is held until the
                        * update has succeeded and is released before returning.
    1759             :  */
    1760             : static void
    1761        2944 : summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
    1762             :                 BlockNumber heapBlk, BlockNumber heapNumBlks)
    1763             : {
    1764             :     Buffer      phbuf;
    1765             :     BrinTuple  *phtup;
    1766             :     Size        phsz;
    1767             :     OffsetNumber offset;
    1768             :     BlockNumber scanNumBlks;
    1769             : 
    1770             :     /*
    1771             :      * Insert the placeholder tuple
    1772             :      */
    1773        2944 :     phbuf = InvalidBuffer;
    1774        2944 :     phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
    1775        2944 :     offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
    1776             :                            state->bs_rmAccess, &phbuf,
    1777             :                            heapBlk, phtup, phsz);
    1778             : 
    1779             :     /*
    1780             :      * Compute range end.  We hold ShareUpdateExclusive lock on table, so it
    1781             :      * cannot shrink concurrently (but it can grow).
    1782             :      */
    1783             :     Assert(heapBlk % state->bs_pagesPerRange == 0);
    1784        2944 :     if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
    1785             :     {
    1786             :         /*
    1787             :          * If we're asked to scan what we believe to be the final range on the
    1788             :          * table (i.e. a range that might be partial) we need to recompute our
    1789             :          * idea of what the latest page is after inserting the placeholder
    1790             :          * tuple.  Anyone that grows the table later will update the
    1791             :          * placeholder tuple, so it doesn't matter that we won't scan these
    1792             :          * pages ourselves.  Careful: the table might have been extended
    1793             :          * beyond the current range, so clamp our result.
    1794             :          *
    1795             :          * Fortunately, this should occur infrequently.
    1796             :          */
    1797          24 :         scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
    1798             :                           state->bs_pagesPerRange);
    1799             :     }
    1800             :     else
    1801             :     {
    1802             :         /* Easy case: range is known to be complete */
    1803        2920 :         scanNumBlks = state->bs_pagesPerRange;
    1804             :     }
    1805             : 
    1806             :     /*
    1807             :      * Execute the partial heap scan covering the heap blocks in the specified
    1808             :      * page range, summarizing the heap tuples in it.  This scan stops just
    1809             :      * short of brinbuildCallback creating the new index entry.
    1810             :      *
    1811             :      * Note that it is critical we use the "any visible" mode of
    1812             :      * table_index_build_range_scan here: otherwise, we would miss tuples
    1813             :      * inserted by transactions that are still in progress, among other corner
    1814             :      * cases.
                            *
                            * NOTE(review): the three boolean arguments are presumably
                            * allow_sync = false, anyvisible = true, progress = false -- confirm
                            * against the declaration in tableam.h.
    1815             :      */
    1816        2944 :     state->bs_currRangeStart = heapBlk;
    1817        2944 :     table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
    1818             :                                  heapBlk, scanNumBlks,
    1819             :                                  brinbuildCallback, state, NULL);
    1820             : 
    1821             :     /*
    1822             :      * Now we update the values obtained by the scan with the placeholder
    1823             :      * tuple.  We do this in a loop which only terminates if we're able to
    1824             :      * update the placeholder tuple successfully; if we are not, this means
    1825             :      * somebody else modified the placeholder tuple after we read it.
    1826             :      */
    1827             :     for (;;)
    1828           0 :     {
    1829             :         BrinTuple  *newtup;
    1830             :         Size        newsize;
    1831             :         bool        didupdate;
    1832             :         bool        samepage;
    1833             : 
    1834        2944 :         CHECK_FOR_INTERRUPTS();
    1835             : 
    1836             :         /*
    1837             :          * Update the summary tuple and try to update.
    1838             :          */
    1839        2944 :         newtup = brin_form_tuple(state->bs_bdesc,
    1840             :                                  heapBlk, state->bs_dtuple, &newsize);
    1841        2944 :         samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
    1842             :         didupdate =
    1843        2944 :             brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
    1844             :                           state->bs_rmAccess, heapBlk, phbuf, offset,
    1845             :                           phtup, phsz, newtup, newsize, samepage);
                               /* both tuples are private copies; free them on every attempt */
    1846        2944 :         brin_free_tuple(phtup);
    1847        2944 :         brin_free_tuple(newtup);
    1848             : 
    1849             :         /* If the update succeeded, we're done. */
    1850        2944 :         if (didupdate)
    1851        2944 :             break;
    1852             : 
    1853             :         /*
    1854             :          * If the update didn't work, it might be because somebody updated the
    1855             :          * placeholder tuple concurrently.  Extract the new version, union it
    1856             :          * with the values we have from the scan, and start over.  (There are
    1857             :          * other reasons for the update to fail, but it's simple to treat them
    1858             :          * the same.)
                                *
                                * The tuple is copied while the share lock is held; the lock is
                                * then dropped, and the pin on phbuf is kept for the next attempt.
    1859             :          */
    1860           0 :         phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
    1861             :                                          &offset, &phsz, BUFFER_LOCK_SHARE);
    1862             :         /* the placeholder tuple must exist */
    1863           0 :         if (phtup == NULL)
    1864           0 :             elog(ERROR, "missing placeholder tuple");
    1865           0 :         phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
    1866           0 :         LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
    1867             : 
    1868             :         /* merge it into the tuple from the heap scan */
    1869           0 :         union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
    1870             :     }
    1871             : 
    1872        2944 :     ReleaseBuffer(phbuf);
    1873        2944 : }
    1874             : 
    1875             : /*
    1876             :  * Summarize page ranges that are not already summarized.  If pageRange is
    1877             :  * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
    1878             :  * page range containing the given heap page number is scanned.
    1879             :  * If include_partial is true, then the partial range at the end of the table
    1880             :  * is summarized, otherwise not.
    1881             :  *
    1882             :  * For each new index tuple inserted, *numSummarized (if not NULL) is
    1883             :  * incremented; for each existing tuple, *numExisting (if not NULL) is
    1884             :  * incremented.
                        *
                        * The counters are only added to, never reset, so callers must initialize
                        * them.  The build state and IndexInfo are created lazily, when the first
                        * unsummarized range is found, and are released before returning.
    1885             :  */
    1886             : static void
    1887         232 : brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
    1888             :               bool include_partial, double *numSummarized, double *numExisting)
    1889             : {
    1890             :     BrinRevmap *revmap;
    1891         232 :     BrinBuildState *state = NULL;
    1892         232 :     IndexInfo  *indexInfo = NULL;
    1893             :     BlockNumber heapNumBlocks;
    1894             :     BlockNumber pagesPerRange;
    1895             :     Buffer      buf;
    1896             :     BlockNumber startBlk;
    1897             : 
    1898         232 :     revmap = brinRevmapInitialize(index, &pagesPerRange);
    1899             : 
    1900             :     /* determine range of pages to process */
    1901         232 :     heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
    1902         232 :     if (pageRange == BRIN_ALL_BLOCKRANGES)
    1903         152 :         startBlk = 0;
    1904             :     else
    1905             :     {
                               /*
                                * Round down to the first block of the containing range, and clamp
                                * the end so that the loop below visits at most that one range.
                                */
    1906          80 :         startBlk = (pageRange / pagesPerRange) * pagesPerRange;
    1907          80 :         heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
    1908             :     }
    1909         232 :     if (startBlk > heapNumBlocks)
    1910             :     {
    1911             :         /* Nothing to do if start point is beyond end of table */
    1912           0 :         brinRevmapTerminate(revmap);
    1913           0 :         return;
    1914             :     }
    1915             : 
    1916             :     /*
    1917             :      * Scan the revmap to find unsummarized items.
                            *
                            * "buf" is carried across iterations and released once after the loop.
    1918             :      */
    1919         232 :     buf = InvalidBuffer;
    1920       19238 :     for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
    1921             :     {
    1922             :         BrinTuple  *tup;
    1923             :         OffsetNumber off;
    1924             : 
    1925             :         /*
    1926             :          * Unless requested to summarize even a partial range, go away now if
    1927             :          * we think the next range is partial.  Caller would pass true when it
    1928             :          * is typically run once bulk data loading is done
    1929             :          * (brin_summarize_new_values), and false when it is typically the
    1930             :          * result of arbitrarily-scheduled maintenance command (vacuuming).
    1931             :          */
    1932       19076 :         if (!include_partial &&
    1933        2328 :             (startBlk + pagesPerRange > heapNumBlocks))
    1934          70 :             break;
    1935             : 
    1936       19006 :         CHECK_FOR_INTERRUPTS();
    1937             : 
    1938       19006 :         tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
    1939             :                                        BUFFER_LOCK_SHARE);
    1940       19006 :         if (tup == NULL)
    1941             :         {
    1942             :             /* no revmap entry for this heap range. Summarize it. */
    1943        2944 :             if (state == NULL)
    1944             :             {
    1945             :                 /* first time through */
    1946             :                 Assert(!indexInfo);
                                       /*
                                        * NOTE(review): InvalidBlockNumber is passed as tablePages,
                                        * so bs_maxRangeStart is not meaningful on this path;
                                        * presumably it is unused here -- confirm.
                                        */
    1947          88 :                 state = initialize_brin_buildstate(index, revmap,
    1948             :                                                    pagesPerRange,
    1949             :                                                    InvalidBlockNumber);
    1950          88 :                 indexInfo = BuildIndexInfo(index);
    1951             :             }
    1952        2944 :             summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
    1953             : 
    1954             :             /* and re-initialize state for the next range */
    1955        2944 :             brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
    1956             : 
    1957        2944 :             if (numSummarized)
    1958        2944 :                 *numSummarized += 1.0;
    1959             :         }
    1960             :         else
    1961             :         {
                                   /* already summarized: count it and drop the buffer lock */
    1962       16062 :             if (numExisting)
    1963        2162 :                 *numExisting += 1.0;
    1964       16062 :             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    1965             :         }
    1966             :     }
    1967             : 
    1968         232 :     if (BufferIsValid(buf))
    1969         160 :         ReleaseBuffer(buf);
    1970             : 
    1971             :     /* free resources */
    1972         232 :     brinRevmapTerminate(revmap);
    1973         232 :     if (state)
    1974             :     {
    1975          88 :         terminate_brin_buildstate(state);
    1976          88 :         pfree(indexInfo);
    1977             :     }
    1978             : }
    1979             : 
    1980             : /*
    1981             :  * Given a deformed tuple in the build state, convert it into the on-disk
    1982             :  * format and insert it into the index, making the revmap point to it.
                        *
                        * The tuple is formed from state->bs_dtuple for the range starting at
                        * state->bs_currRangeStart and inserted through the state's current
                        * insert buffer.  bs_numtuples is incremented, and the temporary on-disk
                        * tuple is freed before returning.
    1983             :  */
    1984             : static void
    1985        2656 : form_and_insert_tuple(BrinBuildState *state)
    1986             : {
    1987             :     BrinTuple  *tup;
    1988             :     Size        size;
    1989             : 
    1990        2656 :     tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
    1991             :                           state->bs_dtuple, &size);
    1992        2656 :     brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
    1993             :                   &state->bs_currentInsertBuf, state->bs_currRangeStart,
    1994             :                   tup, size);
    1995        2656 :     state->bs_numtuples++;
    1996             : 
    1997        2656 :     pfree(tup);
    1998        2656 : }
    1999             : 
    2000             : /*
    2001             :  * Given a deformed tuple in the build state, convert it into the on-disk
    2002             :  * format and write it to a (shared) tuplesort (the leader will insert it
    2003             :  * into the index later).
                        *
                        * Ranges whose deformed tuple is flagged bt_empty_range are skipped
                        * entirely: nothing is written and bs_numtuples is not incremented.
    2004             :  */
    2005             : static void
    2006          60 : form_and_spill_tuple(BrinBuildState *state)
    2007             : {
    2008             :     BrinTuple  *tup;
    2009             :     Size        size;
    2010             : 
    2011             :     /* don't insert empty tuples in parallel build */
    2012          60 :     if (state->bs_dtuple->bt_empty_range)
    2013          18 :         return;
    2014             : 
    2015          42 :     tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
    2016             :                           state->bs_dtuple, &size);
    2017             : 
    2018             :     /* write the BRIN tuple to the tuplesort */
    2019          42 :     tuplesort_putbrintuple(state->bs_sortstate, tup, size);
    2020             : 
    2021          42 :     state->bs_numtuples++;
    2022             : 
                           /* the local copy is freed right after it is handed to the tuplesort */
    2023          42 :     pfree(tup);
    2024             : }
    2025             : 
    2026             : /*
    2027             :  * Given two deformed tuples, adjust the first one so that it's consistent
    2028             :  * with the summary values in both.
                        *
                        * "a" is modified in place and carries the result.  "b" is an on-disk
                        * tuple; it is deformed into a private memory context that is deleted
                        * before returning, so "b" itself is left untouched.  Values copied from
                        * "b" into "a" are made with datumCopy after switching back to the
                        * caller's context, so they outlive that private context.
    2029             :  */
    2030             : static void
    2031           2 : union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
    2032             : {
    2033             :     int         keyno;
    2034             :     BrinMemTuple *db;
    2035             :     MemoryContext cxt;
    2036             :     MemoryContext oldcxt;
    2037             : 
    2038             :     /* Use our own memory context to avoid retail pfree */
    2039           2 :     cxt = AllocSetContextCreate(CurrentMemoryContext,
    2040             :                                 "brin union",
    2041             :                                 ALLOCSET_DEFAULT_SIZES);
    2042           2 :     oldcxt = MemoryContextSwitchTo(cxt);
    2043           2 :     db = brin_deform_tuple(bdesc, b, NULL);
    2044           2 :     MemoryContextSwitchTo(oldcxt);
    2045             : 
    2046             :     /*
    2047             :      * Check if the ranges are empty.
    2048             :      *
    2049             :      * If at least one of them is empty, we don't need to call per-key union
    2050             :      * functions at all. If "b" is empty, we just use "a" as the result (it
    2051             :      * might be empty too, but that's fine). If "a" is empty but "b" is not,
    2052             :      * we use "b" as the result (but we have to copy the data into "a" first).
    2053             :      *
    2054             :      * Only when both ranges are non-empty, we actually do the per-key merge.
    2055             :      */
    2056             : 
    2057             :     /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
    2058           2 :     if (db->bt_empty_range)
    2059             :     {
    2060             :         /* skip the per-key merge */
    2061           0 :         MemoryContextDelete(cxt);
    2062           0 :         return;
    2063             :     }
    2064             : 
    2065             :     /*
    2066             :      * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
    2067             :      * But we need to copy the data from "b" to "a" first, because that's how
    2068             :      * we pass result out.
    2069             :      *
    2070             :      * We have to copy all the global/per-key flags etc. too.
    2071             :      */
    2072           2 :     if (a->bt_empty_range)
    2073             :     {
    2074           0 :         for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
    2075             :         {
    2076             :             int         i;
    2077           0 :             BrinValues *col_a = &a->bt_columns[keyno];
    2078           0 :             BrinValues *col_b = &db->bt_columns[keyno];
    2079           0 :             BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
    2080             : 
    2081           0 :             col_a->bv_allnulls = col_b->bv_allnulls;
    2082           0 :             col_a->bv_hasnulls = col_b->bv_hasnulls;
    2083             : 
    2084             :             /* If "b" has no data, we're done. */
    2085           0 :             if (col_b->bv_allnulls)
    2086           0 :                 continue;
    2087             : 
                                   /* copy each stored value using its own type's byval/len */
    2088           0 :             for (i = 0; i < opcinfo->oi_nstored; i++)
    2089           0 :                 col_a->bv_values[i] =
    2090           0 :                     datumCopy(col_b->bv_values[i],
    2091           0 :                               opcinfo->oi_typcache[i]->typbyval,
    2092           0 :                               opcinfo->oi_typcache[i]->typlen);
    2093             :         }
    2094             : 
    2095             :         /* "a" started empty, but "b" was not empty, so remember that */
    2096           0 :         a->bt_empty_range = false;
    2097             : 
    2098             :         /* skip the per-key merge */
    2099           0 :         MemoryContextDelete(cxt);
    2100           0 :         return;
    2101             :     }
    2102             : 
    2103             :     /* Now we know neither range is empty. */
    2104          10 :     for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
    2105             :     {
    2106             :         FmgrInfo   *unionFn;
    2107           8 :         BrinValues *col_a = &a->bt_columns[keyno];
    2108           8 :         BrinValues *col_b = &db->bt_columns[keyno];
    2109           8 :         BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
    2110             : 
                               /*
                                * NULL bookkeeping is done here only when the opclass sets
                                * oi_regular_nulls; otherwise the column goes straight to the
                                * opclass union function below.
                                */
    2111           8 :         if (opcinfo->oi_regular_nulls)
    2112             :         {
    2113             :             /* Does the "b" summary represent any NULL values? */
    2114           8 :             bool        b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);
    2115             : 
    2116             :             /* Adjust "hasnulls". */
    2117           8 :             if (!col_a->bv_allnulls && b_has_nulls)
    2118           0 :                 col_a->bv_hasnulls = true;
    2119             : 
    2120             :             /* If there are no values in B, there's nothing left to do. */
    2121           8 :             if (col_b->bv_allnulls)
    2122           2 :                 continue;
    2123             : 
    2124             :             /*
    2125             :              * Adjust "allnulls".  If A doesn't have values, just copy the
    2126             :              * values from B into A, and we're done.  We cannot run the
    2127             :              * operators in this case, because values in A might contain
    2128             :              * garbage.  Note we already established that B contains values.
    2129             :              *
    2130             :              * Also adjust "hasnulls" in order not to forget the summary
    2131             :              * represents NULL values. This is not redundant with the earlier
    2132             :              * update, because that only happens when allnulls=false.
    2133             :              */
    2134           6 :             if (col_a->bv_allnulls)
    2135           0 :             {
    2136             :                 int         i;
    2137             : 
    2138           0 :                 col_a->bv_allnulls = false;
    2139           0 :                 col_a->bv_hasnulls = true;
    2140             : 
    2141           0 :                 for (i = 0; i < opcinfo->oi_nstored; i++)
    2142           0 :                     col_a->bv_values[i] =
    2143           0 :                         datumCopy(col_b->bv_values[i],
    2144           0 :                                   opcinfo->oi_typcache[i]->typbyval,
    2145           0 :                                   opcinfo->oi_typcache[i]->typlen);
    2146             : 
    2147           0 :                 continue;
    2148             :             }
    2149             :         }
    2150             : 
                               /* both sides have values: let the opclass merge col_b into col_a */
    2151           6 :         unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
    2152             :                                     BRIN_PROCNUM_UNION);
    2153           6 :         FunctionCall3Coll(unionFn,
    2154           6 :                           bdesc->bd_index->rd_indcollation[keyno],
    2155             :                           PointerGetDatum(bdesc),
    2156             :                           PointerGetDatum(col_a),
    2157             :                           PointerGetDatum(col_b));
    2158             :     }
    2159             : 
    2160           2 :     MemoryContextDelete(cxt);
    2161             : }
    2162             : 
    2163             : /*
    2164             :  * brin_vacuum_scan
    2165             :  *      Do a complete scan of the index during VACUUM.
    2166             :  *
    2167             :  * This routine scans the complete index looking for uncataloged index pages,
    2168             :  * i.e. those that might have been lost due to a crash after index extension
    2169             :  * and such.
    2170             :  */
    2171             : static void
    2172          94 : brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
    2173             : {
    2174             :     BlockRangeReadStreamPrivate p;
    2175             :     ReadStream *stream;
    2176             :     Buffer      buf;
    2177             : 
    2178          94 :     p.current_blocknum = 0;
    2179          94 :     p.last_exclusive = RelationGetNumberOfBlocks(idxrel);
    2180             : 
    2181             :     /*
    2182             :      * It is safe to use batchmode as block_range_read_stream_cb takes no
    2183             :      * locks.
    2184             :      */
    2185          94 :     stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
    2186             :                                         READ_STREAM_FULL |
    2187             :                                         READ_STREAM_USE_BATCHING,
    2188             :                                         strategy,
    2189             :                                         idxrel,
    2190             :                                         MAIN_FORKNUM,
    2191             :                                         block_range_read_stream_cb,
    2192             :                                         &p,
    2193             :                                         0);
    2194             : 
    2195             :     /*
    2196             :      * Scan the index in physical order, and clean up any possible mess in
    2197             :      * each page.
    2198             :      */
    2199         512 :     while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
    2200             :     {
    2201         418 :         CHECK_FOR_INTERRUPTS();
    2202             : 
    2203         418 :         brin_page_cleanup(idxrel, buf);
    2204             : 
    2205         418 :         ReleaseBuffer(buf);
    2206             :     }
    2207             : 
    2208          94 :     read_stream_end(stream);
    2209             : 
    2210             :     /*
    2211             :      * Update all upper pages in the index's FSM, as well.  This ensures not
    2212             :      * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
    2213             :      * but also that any pre-existing damage or out-of-dateness is repaired.
    2214             :      */
    2215          94 :     FreeSpaceMapVacuum(idxrel);
    2216          94 : }
    2217             : 
    2218             : static bool
    2219      784296 : add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
    2220             :                     const Datum *values, const bool *nulls)
    2221             : {
    2222             :     int         keyno;
    2223             : 
    2224             :     /* If the range starts empty, we're certainly going to modify it. */
    2225      784296 :     bool        modified = dtup->bt_empty_range;
    2226             : 
    2227             :     /*
    2228             :      * Compare the key values of the new tuple to the stored index values; our
    2229             :      * deformed tuple will get updated if the new tuple doesn't fit the
    2230             :      * original range (note this means we can't break out of the loop early).
    2231             :      * Make a note of whether this happens, so that we know to insert the
    2232             :      * modified tuple later.
    2233             :      */
    2234     1848422 :     for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
    2235             :     {
    2236             :         Datum       result;
    2237             :         BrinValues *bval;
    2238             :         FmgrInfo   *addValue;
    2239             :         bool        has_nulls;
    2240             : 
    2241     1064126 :         bval = &dtup->bt_columns[keyno];
    2242             : 
    2243             :         /*
    2244             :          * Does the range have actual NULL values? Either of the flags can be
    2245             :          * set, but we ignore the state before adding the first row.
    2246             :          *
    2247             :          * We have to remember this, because we'll modify the flags and we
    2248             :          * need to know if the range started as empty.
    2249             :          */
    2250     2091578 :         has_nulls = ((!dtup->bt_empty_range) &&
    2251     1027452 :                      (bval->bv_hasnulls || bval->bv_allnulls));
    2252             : 
    2253             :         /*
    2254             :          * If the value we're adding is NULL, handle it locally. Otherwise
    2255             :          * call the BRIN_PROCNUM_ADDVALUE procedure.
    2256             :          */
    2257     1064126 :         if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
    2258             :         {
    2259             :             /*
    2260             :              * If the new value is null, we record that we saw it if it's the
    2261             :              * first one; otherwise, there's nothing to do.
    2262             :              */
    2263       18694 :             if (!bval->bv_hasnulls)
    2264             :             {
    2265        3610 :                 bval->bv_hasnulls = true;
    2266        3610 :                 modified = true;
    2267             :             }
    2268             : 
    2269       18694 :             continue;
    2270             :         }
    2271             : 
    2272     1045432 :         addValue = index_getprocinfo(idxRel, keyno + 1,
    2273             :                                      BRIN_PROCNUM_ADDVALUE);
    2274     1045432 :         result = FunctionCall4Coll(addValue,
    2275     1045432 :                                    idxRel->rd_indcollation[keyno],
    2276             :                                    PointerGetDatum(bdesc),
    2277             :                                    PointerGetDatum(bval),
    2278     1045432 :                                    values[keyno],
    2279     1045432 :                                    BoolGetDatum(nulls[keyno]));
    2280             :         /* if that returned true, we need to insert the updated tuple */
    2281     1045432 :         modified |= DatumGetBool(result);
    2282             : 
    2283             :         /*
    2284             :          * If the range had actual NULL values (i.e. did not start empty),
    2285             :          * make sure we don't forget about the NULL values. Either the
    2286             :          * allnulls flag is still set to true, or (if the opclass cleared it)
    2287             :          * we need to set hasnulls=true.
    2288             :          *
    2289             :          * XXX This can only happen when the opclass modified the tuple, so
    2290             :          * the modified flag should be set.
    2291             :          */
    2292     1045432 :         if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
    2293             :         {
    2294             :             Assert(modified);
    2295           4 :             bval->bv_hasnulls = true;
    2296             :         }
    2297             :     }
    2298             : 
    2299             :     /*
    2300             :      * After updating summaries for all the keys, mark it as not empty.
    2301             :      *
    2302             :      * If we're actually changing the flag value (i.e. tuple started as
    2303             :      * empty), we should have modified the tuple. So we should not see empty
    2304             :      * range that was not modified.
    2305             :      */
    2306             :     Assert(!dtup->bt_empty_range || modified);
    2307      784296 :     dtup->bt_empty_range = false;
    2308             : 
    2309      784296 :     return modified;
    2310             : }
    2311             : 
    2312             : static bool
    2313      189936 : check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
    2314             : {
    2315             :     int         keyno;
    2316             : 
    2317             :     /*
    2318             :      * First check if there are any IS [NOT] NULL scan keys, and if we're
    2319             :      * violating them.
    2320             :      */
    2321      191172 :     for (keyno = 0; keyno < nnullkeys; keyno++)
    2322             :     {
    2323        2232 :         ScanKey     key = nullkeys[keyno];
    2324             : 
    2325             :         Assert(key->sk_attno == bval->bv_attno);
    2326             : 
    2327             :         /* Handle only IS NULL/IS NOT NULL tests */
    2328        2232 :         if (!(key->sk_flags & SK_ISNULL))
    2329           0 :             continue;
    2330             : 
    2331        2232 :         if (key->sk_flags & SK_SEARCHNULL)
    2332             :         {
    2333             :             /* IS NULL scan key, but range has no NULLs */
    2334        1116 :             if (!bval->bv_allnulls && !bval->bv_hasnulls)
    2335         978 :                 return false;
    2336             :         }
    2337        1116 :         else if (key->sk_flags & SK_SEARCHNOTNULL)
    2338             :         {
    2339             :             /*
    2340             :              * For IS NOT NULL, we can only skip ranges that are known to have
    2341             :              * only nulls.
    2342             :              */
    2343        1116 :             if (bval->bv_allnulls)
    2344          18 :                 return false;
    2345             :         }
    2346             :         else
    2347             :         {
    2348             :             /*
    2349             :              * Neither IS NULL nor IS NOT NULL was used; assume all indexable
    2350             :              * operators are strict and thus return false with NULL value in
    2351             :              * the scan key.
    2352             :              */
    2353           0 :             return false;
    2354             :         }
    2355             :     }
    2356             : 
    2357      188940 :     return true;
    2358             : }
    2359             : 
    2360             : /*
    2361             :  * Create parallel context, and launch workers for leader.
    2362             :  *
    2363             :  * buildstate argument should be initialized (with the exception of the
    2364             :  * tuplesort states, which may later be created based on shared
    2365             :  * state initially set up here).
    2366             :  *
    2367             :  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
    2368             :  *
    2369             :  * request is the target number of parallel worker processes to launch.
    2370             :  *
    2371             :  * Sets buildstate's BrinLeader, which caller must use to shut down parallel
    2372             :  * mode by passing it to _brin_end_parallel() at the very end of its index
    2373             :  * build.  If not even a single worker process can be launched, this is
    2374             :  * never set, and caller should proceed with a serial index build.
    2375             :  */
    2376             : static void
    2377          10 : _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
    2378             :                      bool isconcurrent, int request)
    2379             : {
    2380             :     ParallelContext *pcxt;
    2381             :     int         scantuplesortstates;
    2382             :     Snapshot    snapshot;
    2383             :     Size        estbrinshared;
    2384             :     Size        estsort;
    2385             :     BrinShared *brinshared;
    2386             :     Sharedsort *sharedsort;
    2387          10 :     BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader));
    2388             :     WalUsage   *walusage;
    2389             :     BufferUsage *bufferusage;
    2390          10 :     bool        leaderparticipates = true;
    2391             :     int         querylen;
    2392             : 
    2393             : #ifdef DISABLE_LEADER_PARTICIPATION
    2394             :     leaderparticipates = false;
    2395             : #endif
    2396             : 
    2397             :     /*
    2398             :      * Enter parallel mode, and create context for parallel build of brin
    2399             :      * index
    2400             :      */
    2401          10 :     EnterParallelMode();
    2402             :     Assert(request > 0);
    2403          10 :     pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
    2404             :                                  request);
    2405             : 
    2406          10 :     scantuplesortstates = leaderparticipates ? request + 1 : request;
    2407             : 
    2408             :     /*
    2409             :      * Prepare for scan of the base relation.  In a normal index build, we use
    2410             :      * SnapshotAny because we must retrieve all tuples and do our own time
    2411             :      * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
    2412             :      * concurrent build, we take a regular MVCC snapshot and index whatever's
    2413             :      * live according to that.
    2414             :      */
    2415          10 :     if (!isconcurrent)
    2416          10 :         snapshot = SnapshotAny;
    2417             :     else
    2418           0 :         snapshot = RegisterSnapshot(GetTransactionSnapshot());
    2419             : 
    2420             :     /*
    2421             :      * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
    2422             :      */
    2423          10 :     estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
    2424          10 :     shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
    2425          10 :     estsort = tuplesort_estimate_shared(scantuplesortstates);
    2426          10 :     shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    2427             : 
    2428          10 :     shm_toc_estimate_keys(&pcxt->estimator, 2);
    2429             : 
    2430             :     /*
    2431             :      * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
    2432             :      * and PARALLEL_KEY_BUFFER_USAGE.
    2433             :      *
    2434             :      * If there are no extensions loaded that care, we could skip this.  We
    2435             :      * have no way of knowing whether anyone's looking at pgWalUsage or
    2436             :      * pgBufferUsage, so do it unconditionally.
    2437             :      */
    2438          10 :     shm_toc_estimate_chunk(&pcxt->estimator,
    2439             :                            mul_size(sizeof(WalUsage), pcxt->nworkers));
    2440          10 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    2441          10 :     shm_toc_estimate_chunk(&pcxt->estimator,
    2442             :                            mul_size(sizeof(BufferUsage), pcxt->nworkers));
    2443          10 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    2444             : 
    2445             :     /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    2446          10 :     if (debug_query_string)
    2447             :     {
    2448          10 :         querylen = strlen(debug_query_string);
    2449          10 :         shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
    2450          10 :         shm_toc_estimate_keys(&pcxt->estimator, 1);
    2451             :     }
    2452             :     else
    2453           0 :         querylen = 0;           /* keep compiler quiet */
    2454             : 
    2455             :     /* Everyone's had a chance to ask for space, so now create the DSM */
    2456          10 :     InitializeParallelDSM(pcxt);
    2457             : 
    2458             :     /* If no DSM segment was available, back out (do serial build) */
    2459          10 :     if (pcxt->seg == NULL)
    2460             :     {
    2461           0 :         if (IsMVCCSnapshot(snapshot))
    2462           0 :             UnregisterSnapshot(snapshot);
    2463           0 :         DestroyParallelContext(pcxt);
    2464           0 :         ExitParallelMode();
    2465           0 :         return;
    2466             :     }
    2467             : 
    2468             :     /* Store shared build state, for which we reserved space */
    2469          10 :     brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
    2470             :     /* Initialize immutable state */
    2471          10 :     brinshared->heaprelid = RelationGetRelid(heap);
    2472          10 :     brinshared->indexrelid = RelationGetRelid(index);
    2473          10 :     brinshared->isconcurrent = isconcurrent;
    2474          10 :     brinshared->scantuplesortstates = scantuplesortstates;
    2475          10 :     brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
    2476          10 :     brinshared->queryid = pgstat_get_my_query_id();
    2477          10 :     ConditionVariableInit(&brinshared->workersdonecv);
    2478          10 :     SpinLockInit(&brinshared->mutex);
    2479             : 
    2480             :     /* Initialize mutable state */
    2481          10 :     brinshared->nparticipantsdone = 0;
    2482          10 :     brinshared->reltuples = 0.0;
    2483          10 :     brinshared->indtuples = 0.0;
    2484             : 
    2485          10 :     table_parallelscan_initialize(heap,
    2486             :                                   ParallelTableScanFromBrinShared(brinshared),
    2487             :                                   snapshot);
    2488             : 
    2489             :     /*
    2490             :      * Store shared tuplesort-private state, for which we reserved space.
    2491             :      * Then, initialize opaque state using tuplesort routine.
    2492             :      */
    2493          10 :     sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    2494          10 :     tuplesort_initialize_shared(sharedsort, scantuplesortstates,
    2495             :                                 pcxt->seg);
    2496             : 
    2497             :     /*
    2498             :      * Insert the shared build state and the shared tuplesort state into
    2499             :      * the table of contents under their respective keys.
    2500             :      */
    2501          10 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
    2502          10 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
    2503             : 
    2504             :     /* Store query string for workers */
    2505          10 :     if (debug_query_string)
    2506             :     {
    2507             :         char       *sharedquery;
    2508             : 
    2509          10 :         sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
    2510          10 :         memcpy(sharedquery, debug_query_string, querylen + 1);
    2511          10 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    2512             :     }
    2513             : 
    2514             :     /*
    2515             :      * Allocate space for each worker's WalUsage and BufferUsage; no need to
    2516             :      * initialize.
    2517             :      */
    2518          10 :     walusage = shm_toc_allocate(pcxt->toc,
    2519          10 :                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    2520          10 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    2521          10 :     bufferusage = shm_toc_allocate(pcxt->toc,
    2522          10 :                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    2523          10 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
    2524             : 
    2525             :     /* Launch workers, saving status for leader/caller */
    2526          10 :     LaunchParallelWorkers(pcxt);
    2527          10 :     brinleader->pcxt = pcxt;
    2528          10 :     brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    2529          10 :     if (leaderparticipates)
    2530          10 :         brinleader->nparticipanttuplesorts++;
    2531          10 :     brinleader->brinshared = brinshared;
    2532          10 :     brinleader->sharedsort = sharedsort;
    2533          10 :     brinleader->snapshot = snapshot;
    2534          10 :     brinleader->walusage = walusage;
    2535          10 :     brinleader->bufferusage = bufferusage;
    2536             : 
    2537             :     /* If no workers were successfully launched, back out (do serial build) */
    2538          10 :     if (pcxt->nworkers_launched == 0)
    2539             :     {
    2540           2 :         _brin_end_parallel(brinleader, NULL);
    2541           2 :         return;
    2542             :     }
    2543             : 
    2544             :     /* Save leader state now that it's clear build will be parallel */
    2545           8 :     buildstate->bs_leader = brinleader;
    2546             : 
    2547             :     /* Join heap scan ourselves */
    2548           8 :     if (leaderparticipates)
    2549           8 :         _brin_leader_participate_as_worker(buildstate, heap, index);
    2550             : 
    2551             :     /*
    2552             :      * Caller needs to wait for all launched workers when we return.  Make
    2553             :      * sure that the failure-to-start case will not hang forever.
    2554             :      */
    2555           8 :     WaitForParallelWorkersToAttach(pcxt);
    2556             : }
    2557             : 
    2558             : /*
    2559             :  * Shut down workers, destroy parallel context, and end parallel mode.
    2560             :  */
    2561             : static void
    2562          10 : _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
    2563             : {
    2564             :     int         i;
    2565             : 
    2566             :     /* Shutdown worker processes */
    2567          10 :     WaitForParallelWorkersToFinish(brinleader->pcxt);
    2568             : 
    2569             :     /*
    2570             :      * Next, accumulate WAL and buffer usage.  (This must wait for the workers to finish,
    2571             :      * or we might get incomplete data.)
    2572             :      */
    2573          22 :     for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
    2574          12 :         InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
    2575             : 
    2576             :     /* Free last reference to MVCC snapshot, if one was used */
    2577          10 :     if (IsMVCCSnapshot(brinleader->snapshot))
    2578           0 :         UnregisterSnapshot(brinleader->snapshot);
    2579          10 :     DestroyParallelContext(brinleader->pcxt);
    2580          10 :     ExitParallelMode();
    2581          10 : }
    2582             : 
    2583             : /*
    2584             :  * Within leader, wait for end of heap scan.
    2585             :  *
    2586             :  * When called, parallel heap scan started by _brin_begin_parallel() will
    2587             :  * already be underway within worker processes (when leader participates
    2588             :  * as a worker, we should end up here just as workers are finishing).
    2589             :  *
    2590             :  * Returns the total number of heap tuples scanned.
    2591             :  */
    2592             : static double
    2593           8 : _brin_parallel_heapscan(BrinBuildState *state)
    2594             : {
    2595           8 :     BrinShared *brinshared = state->bs_leader->brinshared;
    2596             :     int         nparticipanttuplesorts;
    2597             : 
    2598           8 :     nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
    2599             :     for (;;)
    2600             :     {
    2601          26 :         SpinLockAcquire(&brinshared->mutex);
    2602          26 :         if (brinshared->nparticipantsdone == nparticipanttuplesorts)
    2603             :         {
    2604             :             /* copy the data into leader state */
    2605           8 :             state->bs_reltuples = brinshared->reltuples;
    2606           8 :             state->bs_numtuples = brinshared->indtuples;
    2607             : 
    2608           8 :             SpinLockRelease(&brinshared->mutex);
    2609           8 :             break;
    2610             :         }
    2611          18 :         SpinLockRelease(&brinshared->mutex);
    2612             : 
    2613          18 :         ConditionVariableSleep(&brinshared->workersdonecv,
    2614             :                                WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    2615             :     }
    2616             : 
    2617           8 :     ConditionVariableCancelSleep();
    2618             : 
    2619           8 :     return state->bs_reltuples;
    2620             : }
    2621             : 
    2622             : /*
    2623             :  * Within leader, wait for end of heap scan and merge per-worker results.
    2624             :  *
    2625             :  * After waiting for all workers to finish, merge the per-worker results into
    2626             :  * the complete index. The results from each worker are sorted by block number
    2627             :  * (start of the page range). While combining the per-worker results we merge
    2628             :  * summaries for the same page range, and also fill-in empty summaries for
    2629             :  * ranges without any tuples.
    2630             :  *
    2631             :  * Returns the total number of heap tuples scanned.
    2632             :  */
    2633             : static double
    2634           8 : _brin_parallel_merge(BrinBuildState *state)
    2635             : {
    2636             :     BrinTuple  *btup;
    2637           8 :     BrinMemTuple *memtuple = NULL;
    2638             :     Size        tuplen;
    2639           8 :     BlockNumber prevblkno = InvalidBlockNumber;
    2640             :     MemoryContext rangeCxt,
    2641             :                 oldCxt;
    2642             :     double      reltuples;
    2643             : 
    2644             :     /* wait for workers to scan table and produce partial results */
    2645           8 :     reltuples = _brin_parallel_heapscan(state);
    2646             : 
    2647             :     /* do the actual sort in the leader */
    2648           8 :     tuplesort_performsort(state->bs_sortstate);
    2649             : 
    2650             :     /*
    2651             :      * Initialize BrinMemTuple we'll use to union summaries from workers (in
    2652             :      * case they happened to produce parts of the same page range).
    2653             :      */
    2654           8 :     memtuple = brin_new_memtuple(state->bs_bdesc);
    2655             : 
    2656             :     /*
    2657             :      * Create a memory context we'll reset to combine results for a single
    2658             :      * page range (received from the workers). We don't expect huge number of
    2659             :      * overlaps under regular circumstances, because for large tables the
    2660             :      * chunk size is likely larger than the BRIN page range, but it can
    2661             :      * happen, and the union functions may do all kinds of stuff. So we better
    2662             :      * reset the context once in a while.
    2663             :      */
    2664           8 :     rangeCxt = AllocSetContextCreate(CurrentMemoryContext,
    2665             :                                      "brin union",
    2666             :                                      ALLOCSET_DEFAULT_SIZES);
    2667           8 :     oldCxt = MemoryContextSwitchTo(rangeCxt);
    2668             : 
    2669             :     /*
    2670             :      * Read the BRIN tuples from the shared tuplesort, sorted by block number.
    2671             :      * That probably gives us an index that is cheaper to scan, thanks to
    2672             :      * mostly getting data from the same index page as before.
    2673             :      */
    2674          50 :     while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
    2675             :     {
    2676             :         /* Ranges should be multiples of pages_per_range for the index. */
    2677             :         Assert(btup->bt_blkno % state->bs_leader->brinshared->pagesPerRange == 0);
    2678             : 
    2679             :         /*
    2680             :          * Do we need to union summaries for the same page range?
    2681             :          *
    2682             :          * If this is the first brin tuple we read, then just deform it into
    2683             :          * the memtuple, and continue with the next one from tuplesort. We
    2684             :          * however may need to insert empty summaries into the index.
    2685             :          *
    2686             :          * If it's the same block as the last we saw, we simply union the brin
    2687             :          * tuple into it, and we're done - we don't even need to insert empty
    2688             :          * ranges, because that was done earlier when we saw the first brin
    2689             :          * tuple (for this range).
    2690             :          *
    2691             :          * Finally, if it's not the first brin tuple, and it's not the same
    2692             :          * page range, we need to do the insert and then deform the tuple into
    2693             :          * the memtuple. Then we'll insert empty ranges before the new brin
    2694             :          * tuple, if needed.
    2695             :          */
    2696          42 :         if (prevblkno == InvalidBlockNumber)
    2697             :         {
    2698             :             /* First brin tuple, just deform it into memtuple. */
    2699           2 :             memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
    2700             : 
    2701             :             /* continue to insert empty pages before this block */
    2702             :         }
    2703          40 :         else if (memtuple->bt_blkno == btup->bt_blkno)
    2704             :         {
    2705             :             /*
    2706             :              * Not the first brin tuple, but same page range as the previous
    2707             :              * one, so we can merge it into the memtuple.
    2708             :              */
    2709           2 :             union_tuples(state->bs_bdesc, memtuple, btup);
    2710           2 :             continue;
    2711             :         }
    2712             :         else
    2713             :         {
    2714             :             BrinTuple  *tmp;
    2715             :             Size        len;
    2716             : 
    2717             :             /*
    2718             :              * We got brin tuple for a different page range, so form a brin
    2719             :              * tuple from the memtuple, insert it, and re-init the memtuple
    2720             :              * from the new brin tuple.
    2721             :              */
    2722          38 :             tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
    2723             :                                   memtuple, &len);
    2724             : 
    2725          38 :             brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
    2726             :                           &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
    2727             : 
    2728             :             /*
    2729             :              * Reset the per-output-range context. This frees all the memory
    2730             :              * possibly allocated by the union functions, and also the BRIN
    2731             :              * tuple we just formed and inserted.
    2732             :              */
    2733          38 :             MemoryContextReset(rangeCxt);
    2734             : 
    2735          38 :             memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
    2736             : 
    2737             :             /* continue to insert empty pages before thisblock */
    2738             :         }
    2739             : 
    2740             :         /* Fill empty ranges for all ranges missing in the tuplesort. */
    2741          40 :         brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);
    2742             : 
    2743          40 :         prevblkno = btup->bt_blkno;
    2744             :     }
    2745             : 
    2746           8 :     tuplesort_end(state->bs_sortstate);
    2747             : 
    2748             :     /* Fill the BRIN tuple for the last page range with data. */
    2749           8 :     if (prevblkno != InvalidBlockNumber)
    2750             :     {
    2751             :         BrinTuple  *tmp;
    2752             :         Size        len;
    2753             : 
    2754           2 :         tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
    2755             :                               memtuple, &len);
    2756             : 
    2757           2 :         brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
    2758             :                       &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
    2759             : 
    2760           2 :         pfree(tmp);
    2761             :     }
    2762             : 
    2763             :     /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
    2764           8 :     brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);
    2765             : 
    2766             :     /*
    2767             :      * Switch back to the original memory context, and destroy the one we
    2768             :      * created to isolate the union_tuple calls.
    2769             :      */
    2770           8 :     MemoryContextSwitchTo(oldCxt);
    2771           8 :     MemoryContextDelete(rangeCxt);
    2772             : 
    2773           8 :     return reltuples;
    2774             : }
    2775             : 
    2776             : /*
    2777             :  * Returns size of shared memory required to store state for a parallel
    2778             :  * brin index build based on the snapshot its parallel scan will use.
    2779             :  */
    2780             : static Size
    2781          10 : _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    2782             : {
    2783             :     /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    2784          10 :     return add_size(BUFFERALIGN(sizeof(BrinShared)),
    2785             :                     table_parallelscan_estimate(heap, snapshot));
    2786             : }
    2787             : 
    2788             : /*
    2789             :  * Within leader, participate as a parallel worker.
    2790             :  */
    2791             : static void
    2792           8 : _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
    2793             : {
    2794           8 :     BrinLeader *brinleader = buildstate->bs_leader;
    2795             :     int         sortmem;
    2796             : 
    2797             :     /*
    2798             :      * Might as well use reliable figure when doling out maintenance_work_mem
    2799             :      * (when requested number of workers were not launched, this will be
    2800             :      * somewhat higher than it is for other workers).
    2801             :      */
    2802           8 :     sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;
    2803             : 
    2804             :     /* Perform work common to all participants */
    2805           8 :     _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
    2806             :                                   brinleader->sharedsort, heap, index, sortmem, true);
    2807           8 : }
    2808             : 
    2809             : /*
    2810             :  * Perform a worker's portion of a parallel sort.
    2811             :  *
    2812             :  * This generates a tuplesort for the worker portion of the table.
    2813             :  *
    2814             :  * sortmem is the amount of working memory to use within each worker,
    2815             :  * expressed in KBs.
    2816             :  *
    2817             :  * When this returns, workers are done, and need only release resources.
    2818             :  */
    2819             : static void
    2820          20 : _brin_parallel_scan_and_build(BrinBuildState *state,
    2821             :                               BrinShared *brinshared, Sharedsort *sharedsort,
    2822             :                               Relation heap, Relation index,
    2823             :                               int sortmem, bool progress)
    2824             : {
    2825             :     SortCoordinate coordinate;
    2826             :     TableScanDesc scan;
    2827             :     double      reltuples;
    2828             :     IndexInfo  *indexInfo;
    2829             : 
    2830             :     /* Initialize local tuplesort coordination state */
    2831          20 :     coordinate = palloc0(sizeof(SortCoordinateData));
    2832          20 :     coordinate->isWorker = true;
    2833          20 :     coordinate->nParticipants = -1;
    2834          20 :     coordinate->sharedsort = sharedsort;
    2835             : 
    2836             :     /* Begin "partial" tuplesort */
    2837          20 :     state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,
    2838             :                                                      TUPLESORT_NONE);
    2839             : 
    2840             :     /* Join parallel scan */
    2841          20 :     indexInfo = BuildIndexInfo(index);
    2842          20 :     indexInfo->ii_Concurrent = brinshared->isconcurrent;
    2843             : 
    2844          20 :     scan = table_beginscan_parallel(heap,
    2845             :                                     ParallelTableScanFromBrinShared(brinshared));
    2846             : 
    2847          20 :     reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
    2848             :                                        brinbuildCallbackParallel, state, scan);
    2849             : 
    2850             :     /* insert the last item */
    2851          20 :     form_and_spill_tuple(state);
    2852             : 
    2853             :     /* sort the BRIN ranges built by this worker */
    2854          20 :     tuplesort_performsort(state->bs_sortstate);
    2855             : 
    2856          20 :     state->bs_reltuples += reltuples;
    2857             : 
    2858             :     /*
    2859             :      * Done.  Record ambuild statistics.
    2860             :      */
    2861          20 :     SpinLockAcquire(&brinshared->mutex);
    2862          20 :     brinshared->nparticipantsdone++;
    2863          20 :     brinshared->reltuples += state->bs_reltuples;
    2864          20 :     brinshared->indtuples += state->bs_numtuples;
    2865          20 :     SpinLockRelease(&brinshared->mutex);
    2866             : 
    2867             :     /* Notify leader */
    2868          20 :     ConditionVariableSignal(&brinshared->workersdonecv);
    2869             : 
    2870          20 :     tuplesort_end(state->bs_sortstate);
    2871          20 : }
    2872             : 
    2873             : /*
    2874             :  * Perform work within a launched parallel process.
    2875             :  */
    2876             : void
    2877          12 : _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
    2878             : {
    2879             :     char       *sharedquery;
    2880             :     BrinShared *brinshared;
    2881             :     Sharedsort *sharedsort;
    2882             :     BrinBuildState *buildstate;
    2883             :     Relation    heapRel;
    2884             :     Relation    indexRel;
    2885             :     LOCKMODE    heapLockmode;
    2886             :     LOCKMODE    indexLockmode;
    2887             :     WalUsage   *walusage;
    2888             :     BufferUsage *bufferusage;
    2889             :     int         sortmem;
    2890             : 
    2891             :     /*
    2892             :      * The only possible status flag that can be set to the parallel worker is
    2893             :      * PROC_IN_SAFE_IC.
    2894             :      */
    2895             :     Assert((MyProc->statusFlags == 0) ||
    2896             :            (MyProc->statusFlags == PROC_IN_SAFE_IC));
    2897             : 
    2898             :     /* Set debug_query_string for individual workers first */
    2899          12 :     sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    2900          12 :     debug_query_string = sharedquery;
    2901             : 
    2902             :     /* Report the query string from leader */
    2903          12 :     pgstat_report_activity(STATE_RUNNING, debug_query_string);
    2904             : 
    2905             :     /* Look up brin shared state */
    2906          12 :     brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
    2907             : 
    2908             :     /* Open relations using lock modes known to be obtained by index.c */
    2909          12 :     if (!brinshared->isconcurrent)
    2910             :     {
    2911          12 :         heapLockmode = ShareLock;
    2912          12 :         indexLockmode = AccessExclusiveLock;
    2913             :     }
    2914             :     else
    2915             :     {
    2916           0 :         heapLockmode = ShareUpdateExclusiveLock;
    2917           0 :         indexLockmode = RowExclusiveLock;
    2918             :     }
    2919             : 
    2920             :     /* Track query ID */
    2921          12 :     pgstat_report_query_id(brinshared->queryid, false);
    2922             : 
    2923             :     /* Open relations within worker */
    2924          12 :     heapRel = table_open(brinshared->heaprelid, heapLockmode);
    2925          12 :     indexRel = index_open(brinshared->indexrelid, indexLockmode);
    2926             : 
    2927          12 :     buildstate = initialize_brin_buildstate(indexRel, NULL,
    2928             :                                             brinshared->pagesPerRange,
    2929             :                                             InvalidBlockNumber);
    2930             : 
    2931             :     /* Look up shared state private to tuplesort.c */
    2932          12 :     sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    2933          12 :     tuplesort_attach_shared(sharedsort, seg);
    2934             : 
    2935             :     /* Prepare to track buffer usage during parallel execution */
    2936          12 :     InstrStartParallelQuery();
    2937             : 
    2938             :     /*
    2939             :      * Might as well use reliable figure when doling out maintenance_work_mem
    2940             :      * (when requested number of workers were not launched, this will be
    2941             :      * somewhat higher than it is for other workers).
    2942             :      */
    2943          12 :     sortmem = maintenance_work_mem / brinshared->scantuplesortstates;
    2944             : 
    2945          12 :     _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
    2946             :                                   heapRel, indexRel, sortmem, false);
    2947             : 
    2948             :     /* Report WAL/buffer usage during parallel execution */
    2949          12 :     bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    2950          12 :     walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    2951          12 :     InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
    2952          12 :                           &walusage[ParallelWorkerNumber]);
    2953             : 
    2954          12 :     index_close(indexRel, indexLockmode);
    2955          12 :     table_close(heapRel, heapLockmode);
    2956          12 : }
    2957             : 
    2958             : /*
    2959             :  * brin_build_empty_tuple
    2960             :  *      Maybe initialize a BRIN tuple representing empty range.
    2961             :  *
    2962             :  * Returns a BRIN tuple representing an empty page range starting at the
    2963             :  * specified block number. The empty tuple is initialized only once, when it's
    2964             :  * needed for the first time, stored in the memory context bs_context to ensure
    2965             :  * proper life span, and reused on following calls. All empty tuples are
    2966             :  * exactly the same except for the bt_blkno field, which is set to the value
    2967             :  * in blkno parameter.
    2968             :  */
    2969             : static void
    2970          20 : brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
    2971             : {
    2972             :     /* First time an empty tuple is requested? If yes, initialize it. */
    2973          20 :     if (state->bs_emptyTuple == NULL)
    2974             :     {
    2975             :         MemoryContext oldcxt;
    2976          10 :         BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);
    2977             : 
    2978             :         /* Allocate the tuple in context for the whole index build. */
    2979          10 :         oldcxt = MemoryContextSwitchTo(state->bs_context);
    2980             : 
    2981          10 :         state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
    2982             :                                                &state->bs_emptyTupleLen);
    2983             : 
    2984          10 :         MemoryContextSwitchTo(oldcxt);
    2985             :     }
    2986             :     else
    2987             :     {
    2988             :         /* If we already have an empty tuple, just update the block. */
    2989          10 :         state->bs_emptyTuple->bt_blkno = blkno;
    2990             :     }
    2991          20 : }
    2992             : 
    2993             : /*
    2994             :  * brin_fill_empty_ranges
    2995             :  *      Add BRIN index tuples representing empty page ranges.
    2996             :  *
    2997             :  * prevRange/nextRange determine for which page ranges to add empty summaries.
    2998             :  * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
    2999             :  * (prevRange < blkno < nextRange) will be added to the index.
    3000             :  *
    3001             :  * If prevRange is InvalidBlockNumber, this means there was no previous page
    3002             :  * range (i.e. the first empty range to add is for blkno=0).
    3003             :  *
    3004             :  * The empty tuple is built only once, and then reused for all future calls.
    3005             :  */
    3006             : static void
    3007         408 : brin_fill_empty_ranges(BrinBuildState *state,
    3008             :                        BlockNumber prevRange, BlockNumber nextRange)
    3009             : {
    3010             :     BlockNumber blkno;
    3011             : 
    3012             :     /*
    3013             :      * If we already summarized some ranges, we need to start with the next
    3014             :      * one. Otherwise start from the first range of the table.
    3015             :      */
    3016         408 :     blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
    3017             : 
    3018             :     /* Generate empty ranges until we hit the next non-empty range. */
    3019         428 :     while (blkno < nextRange)
    3020             :     {
    3021             :         /* Did we already build the empty tuple? If not, do it now. */
    3022          20 :         brin_build_empty_tuple(state, blkno);
    3023             : 
    3024          20 :         brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
    3025             :                       &state->bs_currentInsertBuf,
    3026          20 :                       blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
    3027             : 
    3028             :         /* try next page range */
    3029          20 :         blkno += state->bs_pagesPerRange;
    3030             :     }
    3031         408 : }

Generated by: LCOV version 1.16