LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsort.c (source / functions)
Test: PostgreSQL 13beta1
Date: 2020-06-01 09:07:10
Coverage: Lines: 533 of 549 hit (97.1 %) | Functions: 22 of 22 hit (100.0 %)
Legend: Lines: hit / not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsort.c
       4             :  *      Build a btree from sorted input by loading leaf pages sequentially.
       5             :  *
       6             :  * NOTES
       7             :  *
       8             :  * We use tuplesort.c to sort the given index tuples into order.
       9             :  * Then we scan the index tuples in order and build the btree pages
      10             :  * for each level.  We load source tuples into leaf-level pages.
      11             :  * Whenever we fill a page at one level, we add a link to it to its
      12             :  * parent level (starting a new parent level if necessary).  When
      13             :  * done, we write out each final page on each level, adding it to
      14             :  * its parent level.  When we have only one page on a level, it must be
      15             :  * the root -- it can be attached to the btree metapage and we are done.
      16             :  *
      17             :  * It is not wise to pack the pages entirely full, since then *any*
      18             :  * insertion would cause a split (and not only of the leaf page; the need
      19             :  * for a split would cascade right up the tree).  The steady-state load
      20             :  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
      21             :  * pages to the user-controllable fill factor (default 90%) while upper pages
      22             :  * are always packed to 70%.  This gives us reasonable density (there aren't
      23             :  * many upper pages if the keys are reasonable-size) without risking a lot of
      24             :  * cascading splits during early insertions.
      25             :  *
      26             :  * Formerly the index pages being built were kept in shared buffers, but
      27             :  * that is of no value (since other backends have no interest in them yet)
      28             :  * and it created locking problems for CHECKPOINT, because the upper-level
      29             :  * pages were held exclusive-locked for long periods.  Now we just build
      30             :  * the pages in local memory and smgrwrite or smgrextend them as we finish
      31             :  * them.  They will need to be re-read into shared buffers on first use after
      32             :  * the build finishes.
      33             :  *
      34             :  * This code isn't concerned about the FSM at all. The caller is responsible
      35             :  * for initializing that.
      36             :  *
      37             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
      38             :  * Portions Copyright (c) 1994, Regents of the University of California
      39             :  *
      40             :  * IDENTIFICATION
      41             :  *    src/backend/access/nbtree/nbtsort.c
      42             :  *
      43             :  *-------------------------------------------------------------------------
      44             :  */
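
/*
 * A minimal standalone sketch (not nbtree code) of the level-by-level
 * loading scheme described in the NOTES above.  All names here (ToyLevel,
 * toy_add, PAGE_CAPACITY) are invented for the sketch; real pages are
 * limited by a byte budget derived from fillfactor, not an item count, and
 * real "low keys" are pivot tuples rather than ints.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_CAPACITY 4         /* stand-in for the fillfactor-based limit */

typedef struct ToyLevel
{
    int         keys[PAGE_CAPACITY];
    int         nkeys;
    int         level;
    struct ToyLevel *parent;    /* plays the role of btps_next */
} ToyLevel;

static void toy_add(ToyLevel *lvl, int key);

static ToyLevel *
toy_newlevel(int level)
{
    ToyLevel   *lvl = calloc(1, sizeof(ToyLevel));

    lvl->level = level;
    return lvl;
}

/* "write out" the current page and link it into its parent level */
static void
toy_finishpage(ToyLevel *lvl)
{
    printf("emit level-%d page, low key %d (%d keys)\n",
           lvl->level, lvl->keys[0], lvl->nkeys);
    if (lvl->parent == NULL)
        lvl->parent = toy_newlevel(lvl->level + 1);
    toy_add(lvl->parent, lvl->keys[0]);
    lvl->nkeys = 0;
}

static void
toy_add(ToyLevel *lvl, int key)
{
    if (lvl->nkeys == PAGE_CAPACITY)
        toy_finishpage(lvl);
    lvl->keys[lvl->nkeys++] = key;
}

int
main(void)
{
    ToyLevel   *leaf = toy_newlevel(0);

    for (int key = 1; key <= 20; key++) /* sorted input, as from tuplesort */
        toy_add(leaf, key);

    /* shutdown sweep, bottom to top: flush each level's final page */
    for (ToyLevel *lvl = leaf; lvl != NULL; lvl = lvl->parent)
    {
        if (lvl->parent == NULL)
            printf("level %d holds the root\n", lvl->level);
        else
            toy_finishpage(lvl);
    }
    return 0;
}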
      45             : 
      46             : #include "postgres.h"
      47             : 
      48             : #include "access/nbtree.h"
      49             : #include "access/parallel.h"
      50             : #include "access/relscan.h"
      51             : #include "access/table.h"
      52             : #include "access/tableam.h"
      53             : #include "access/xact.h"
      54             : #include "access/xlog.h"
      55             : #include "access/xloginsert.h"
      56             : #include "catalog/index.h"
      57             : #include "commands/progress.h"
      58             : #include "executor/instrument.h"
      59             : #include "miscadmin.h"
      60             : #include "pgstat.h"
      61             : #include "storage/smgr.h"
      62             : #include "tcop/tcopprot.h"        /* pgrminclude ignore */
      63             : #include "utils/rel.h"
      64             : #include "utils/sortsupport.h"
      65             : #include "utils/tuplesort.h"
      66             : 
      67             : 
      68             : /* Magic numbers for parallel state sharing */
      69             : #define PARALLEL_KEY_BTREE_SHARED       UINT64CONST(0xA000000000000001)
      70             : #define PARALLEL_KEY_TUPLESORT          UINT64CONST(0xA000000000000002)
      71             : #define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
      72             : #define PARALLEL_KEY_QUERY_TEXT         UINT64CONST(0xA000000000000004)
      73             : #define PARALLEL_KEY_WAL_USAGE          UINT64CONST(0xA000000000000005)
      74             : #define PARALLEL_KEY_BUFFER_USAGE       UINT64CONST(0xA000000000000006)
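
/*
 * These keys tag this module's entries in the parallel DSM segment's table
 * of contents, so workers can find the state the leader published.  A
 * sketch of the pattern (the real registration happens in
 * _bt_begin_parallel, beyond this excerpt; "estbtshared" stands in for the
 * size computed by _bt_parallel_estimate_shared):
 *
 *     // leader: carve out and publish shared state under a known key
 *     btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
 *     // ... initialize btshared fields ...
 *     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
 *
 *     // worker: look the entry up by key (false => ERROR if missing)
 *     btshared = (BTShared *) shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED,
 *                                            false);
 */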
      75             : 
      76             : /*
      77             :  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
      78             :  * parallel index builds.  This may be useful as a debugging aid.
      79             : #undef DISABLE_LEADER_PARTICIPATION
      80             :  */
      81             : 
      82             : /*
      83             :  * Status record for spooling/sorting phase.  (Note we may have two of
      84             :  * these due to the special requirements for uniqueness-checking with
      85             :  * dead tuples.)
      86             :  */
      87             : typedef struct BTSpool
      88             : {
      89             :     Tuplesortstate *sortstate;  /* state data for tuplesort.c */
      90             :     Relation    heap;
      91             :     Relation    index;
      92             :     bool        isunique;
      93             : } BTSpool;
      94             : 
      95             : /*
      96             :  * Status for index builds performed in parallel.  This is allocated in a
      97             :  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
      98             :  * entry, private to tuplesort.c but allocated by this module on its behalf.
      99             :  */
     100             : typedef struct BTShared
     101             : {
     102             :     /*
     103             :      * These fields are not modified during the sort.  They primarily exist
     104             :      * for the benefit of worker processes that need to create BTSpool state
     105             :      * corresponding to that used by the leader.
     106             :      */
     107             :     Oid         heaprelid;
     108             :     Oid         indexrelid;
     109             :     bool        isunique;
     110             :     bool        isconcurrent;
     111             :     int         scantuplesortstates;
     112             : 
     113             :     /*
     114             :      * workersdonecv is used to monitor the progress of workers.  All parallel
     115             :      * participants must indicate that they are done before leader can use
     116             :      * mutable state that workers maintain during scan (and before leader can
     117             :      * proceed to tuplesort_performsort()).
     118             :      */
     119             :     ConditionVariable workersdonecv;
     120             : 
     121             :     /*
      122             :      * mutex protects all following fields.
     123             :      *
     124             :      * These fields contain status information of interest to B-Tree index
     125             :      * builds that must work just the same when an index is built in parallel.
     126             :      */
     127             :     slock_t     mutex;
     128             : 
     129             :     /*
     130             :      * Mutable state that is maintained by workers, and reported back to
     131             :      * leader at end of parallel scan.
     132             :      *
     133             :      * nparticipantsdone is number of worker processes finished.
     134             :      *
     135             :      * reltuples is the total number of input heap tuples.
     136             :      *
     137             :      * havedead indicates if RECENTLY_DEAD tuples were encountered during
     138             :      * build.
     139             :      *
     140             :      * indtuples is the total number of tuples that made it into the index.
     141             :      *
     142             :      * brokenhotchain indicates if any worker detected a broken HOT chain
     143             :      * during build.
     144             :      */
     145             :     int         nparticipantsdone;
     146             :     double      reltuples;
     147             :     bool        havedead;
     148             :     double      indtuples;
     149             :     bool        brokenhotchain;
     150             : 
     151             :     /*
     152             :      * ParallelTableScanDescData data follows. Can't directly embed here, as
     153             :      * implementations of the parallel table scan desc interface might need
     154             :      * stronger alignment.
     155             :      */
     156             : } BTShared;
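
/*
 * The mutex and condition variable combine in the usual publish-then-signal
 * pattern.  A sketch of the worker side (this mirrors what
 * _bt_parallel_scan_and_sort does later in the file, outside this excerpt):
 *
 *     SpinLockAcquire(&btshared->mutex);
 *     btshared->nparticipantsdone++;
 *     btshared->reltuples += reltuples;
 *     if (havedead)
 *         btshared->havedead = true;
 *     btshared->indtuples += indtuples;
 *     SpinLockRelease(&btshared->mutex);
 *
 *     ConditionVariableSignal(&btshared->workersdonecv);
 */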
     157             : 
     158             : /*
     159             :  * Return pointer to a BTShared's parallel table scan.
     160             :  *
      161             :  * cf. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
     162             :  * MAXALIGN.
     163             :  */
     164             : #define ParallelTableScanFromBTShared(shared) \
     165             :     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
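
/*
 * Resulting layout of the shm_toc entry, in pictures:
 *
 *     +------------------------------+  <- BTShared * (shm_toc entry)
 *     | BTShared                     |
 *     +------------------------------+
 *     | padding up to BUFFERALIGN    |
 *     +------------------------------+  <- ParallelTableScanFromBTShared()
 *     | ParallelTableScanDescData... |
 *     +------------------------------+
 */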
     166             : 
     167             : /*
     168             :  * Status for leader in parallel index build.
     169             :  */
     170             : typedef struct BTLeader
     171             : {
     172             :     /* parallel context itself */
     173             :     ParallelContext *pcxt;
     174             : 
     175             :     /*
     176             :      * nparticipanttuplesorts is the exact number of worker processes
     177             :      * successfully launched, plus one leader process if it participates as a
     178             :      * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     179             :      * participating as a worker).
     180             :      */
     181             :     int         nparticipanttuplesorts;
     182             : 
     183             :     /*
     184             :      * Leader process convenience pointers to shared state (leader avoids TOC
     185             :      * lookups).
     186             :      *
     187             :      * btshared is the shared state for entire build.  sharedsort is the
     188             :      * shared, tuplesort-managed state passed to each process tuplesort.
     189             :      * sharedsort2 is the corresponding btspool2 shared state, used only when
     190             :      * building unique indexes.  snapshot is the snapshot used by the scan iff
     191             :      * an MVCC snapshot is required.
     192             :      */
     193             :     BTShared   *btshared;
     194             :     Sharedsort *sharedsort;
     195             :     Sharedsort *sharedsort2;
     196             :     Snapshot    snapshot;
     197             :     WalUsage   *walusage;
     198             :     BufferUsage *bufferusage;
     199             : } BTLeader;
     200             : 
     201             : /*
     202             :  * Working state for btbuild and its callback.
     203             :  *
     204             :  * When parallel CREATE INDEX is used, there is a BTBuildState for each
     205             :  * participant.
     206             :  */
     207             : typedef struct BTBuildState
     208             : {
     209             :     bool        isunique;
     210             :     bool        havedead;
     211             :     Relation    heap;
     212             :     BTSpool    *spool;
     213             : 
     214             :     /*
     215             :      * spool2 is needed only when the index is a unique index. Dead tuples are
      216             :      * put into spool2 instead of spool in order to avoid the uniqueness check.
     217             :      */
     218             :     BTSpool    *spool2;
     219             :     double      indtuples;
     220             : 
     221             :     /*
     222             :      * btleader is only present when a parallel index build is performed, and
     223             :      * only in the leader process. (Actually, only the leader has a
     224             :      * BTBuildState.  Workers have their own spool and spool2, though.)
     225             :      */
     226             :     BTLeader   *btleader;
     227             : } BTBuildState;
     228             : 
     229             : /*
     230             :  * Status record for a btree page being built.  We have one of these
     231             :  * for each active tree level.
     232             :  */
     233             : typedef struct BTPageState
     234             : {
     235             :     Page        btps_page;      /* workspace for page building */
     236             :     BlockNumber btps_blkno;     /* block # to write this page at */
     237             :     IndexTuple  btps_lowkey;    /* page's strict lower bound pivot tuple */
     238             :     OffsetNumber btps_lastoff;  /* last item offset loaded */
     239             :     Size        btps_lastextra; /* last item's extra posting list space */
     240             :     uint32      btps_level;     /* tree level (0 = leaf) */
     241             :     Size        btps_full;      /* "full" if less than this much free space */
     242             :     struct BTPageState *btps_next;  /* link to parent level, if any */
     243             : } BTPageState;
     244             : 
     245             : /*
     246             :  * Overall status record for index writing phase.
     247             :  */
     248             : typedef struct BTWriteState
     249             : {
     250             :     Relation    heap;
     251             :     Relation    index;
     252             :     BTScanInsert inskey;        /* generic insertion scankey */
     253             :     bool        btws_use_wal;   /* dump pages to WAL? */
     254             :     BlockNumber btws_pages_alloced; /* # pages allocated */
     255             :     BlockNumber btws_pages_written; /* # pages written out */
     256             :     Page        btws_zeropage;  /* workspace for filling zeroes */
     257             : } BTWriteState;
     258             : 
     259             : 
     260             : static double _bt_spools_heapscan(Relation heap, Relation index,
     261             :                                   BTBuildState *buildstate, IndexInfo *indexInfo);
     262             : static void _bt_spooldestroy(BTSpool *btspool);
     263             : static void _bt_spool(BTSpool *btspool, ItemPointer self,
     264             :                       Datum *values, bool *isnull);
     265             : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
     266             : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
     267             :                                bool *isnull, bool tupleIsAlive, void *state);
     268             : static Page _bt_blnewpage(uint32 level);
     269             : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
     270             : static void _bt_slideleft(Page page);
     271             : static void _bt_sortaddtup(Page page, Size itemsize,
     272             :                            IndexTuple itup, OffsetNumber itup_off,
     273             :                            bool newfirstdataitem);
     274             : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
     275             :                          IndexTuple itup, Size truncextra);
     276             : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
     277             :                                           BTPageState *state,
     278             :                                           BTDedupState dstate);
     279             : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
     280             : static void _bt_load(BTWriteState *wstate,
     281             :                      BTSpool *btspool, BTSpool *btspool2);
     282             : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
     283             :                                int request);
     284             : static void _bt_end_parallel(BTLeader *btleader);
     285             : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
     286             : static double _bt_parallel_heapscan(BTBuildState *buildstate,
     287             :                                     bool *brokenhotchain);
     288             : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
     289             : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
     290             :                                        BTShared *btshared, Sharedsort *sharedsort,
     291             :                                        Sharedsort *sharedsort2, int sortmem,
     292             :                                        bool progress);
     293             : 
     294             : 
     295             : /*
     296             :  *  btbuild() -- build a new btree index.
     297             :  */
     298             : IndexBuildResult *
     299       76526 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     300             : {
     301             :     IndexBuildResult *result;
     302             :     BTBuildState buildstate;
     303             :     double      reltuples;
     304             : 
     305             : #ifdef BTREE_BUILD_STATS
     306             :     if (log_btree_build_stats)
     307             :         ResetUsage();
     308             : #endif                          /* BTREE_BUILD_STATS */
     309             : 
     310       76526 :     buildstate.isunique = indexInfo->ii_Unique;
     311       76526 :     buildstate.havedead = false;
     312       76526 :     buildstate.heap = heap;
     313       76526 :     buildstate.spool = NULL;
     314       76526 :     buildstate.spool2 = NULL;
     315       76526 :     buildstate.indtuples = 0;
     316       76526 :     buildstate.btleader = NULL;
     317             : 
     318             :     /*
     319             :      * We expect to be called exactly once for any index relation. If that's
     320             :      * not the case, big trouble's what we have.
     321             :      */
     322       76526 :     if (RelationGetNumberOfBlocks(index) != 0)
     323           0 :         elog(ERROR, "index \"%s\" already contains data",
     324             :              RelationGetRelationName(index));
     325             : 
     326       76526 :     reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
     327             : 
     328             :     /*
     329             :      * Finish the build by (1) completing the sort of the spool file, (2)
     330             :      * inserting the sorted tuples into btree pages and (3) building the upper
     331             :      * levels.  Finally, it may also be necessary to end use of parallelism.
     332             :      */
     333       76522 :     _bt_leafbuild(buildstate.spool, buildstate.spool2);
     334       76478 :     _bt_spooldestroy(buildstate.spool);
     335       76478 :     if (buildstate.spool2)
     336          12 :         _bt_spooldestroy(buildstate.spool2);
     337       76478 :     if (buildstate.btleader)
     338          88 :         _bt_end_parallel(buildstate.btleader);
     339             : 
     340       76478 :     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     341             : 
     342       76478 :     result->heap_tuples = reltuples;
     343       76478 :     result->index_tuples = buildstate.indtuples;
     344             : 
     345             : #ifdef BTREE_BUILD_STATS
     346             :     if (log_btree_build_stats)
     347             :     {
     348             :         ShowUsage("BTREE BUILD STATS");
     349             :         ResetUsage();
     350             :     }
     351             : #endif                          /* BTREE_BUILD_STATS */
     352             : 
     353       76478 :     return result;
     354             : }
     355             : 
     356             : /*
     357             :  * Create and initialize one or two spool structures, and save them in caller's
      358             :  * buildstate argument.  May also fill in fields within indexInfo used by index
     359             :  * builds.
     360             :  *
     361             :  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
     362             :  * routine encapsulates all aspects of managing parallelism.  Caller need only
     363             :  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
     364             :  *
     365             :  * Returns the total number of heap tuples scanned.
     366             :  */
     367             : static double
     368       76526 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
     369             :                     IndexInfo *indexInfo)
     370             : {
     371       76526 :     BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
     372       76526 :     SortCoordinate coordinate = NULL;
     373       76526 :     double      reltuples = 0;
     374             : 
     375             :     /*
     376             :      * We size the sort area as maintenance_work_mem rather than work_mem to
     377             :      * speed index creation.  This should be OK since a single backend can't
     378             :      * run multiple index creations in parallel (see also: notes on
     379             :      * parallelism and maintenance_work_mem below).
     380             :      */
     381       76526 :     btspool->heap = heap;
     382       76526 :     btspool->index = index;
     383       76526 :     btspool->isunique = indexInfo->ii_Unique;
     384             : 
     385             :     /* Save as primary spool */
     386       76526 :     buildstate->spool = btspool;
     387             : 
     388             :     /* Report table scan phase started */
     389       76526 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     390             :                                  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
     391             : 
     392             :     /* Attempt to launch parallel worker scan when required */
     393       76526 :     if (indexInfo->ii_ParallelWorkers > 0)
     394          88 :         _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
     395             :                            indexInfo->ii_ParallelWorkers);
     396             : 
     397             :     /*
     398             :      * If parallel build requested and at least one worker process was
     399             :      * successfully launched, set up coordination state
     400             :      */
     401       76526 :     if (buildstate->btleader)
     402             :     {
     403          88 :         coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     404          88 :         coordinate->isWorker = false;
     405          88 :         coordinate->nParticipants =
     406          88 :             buildstate->btleader->nparticipanttuplesorts;
     407          88 :         coordinate->sharedsort = buildstate->btleader->sharedsort;
     408             :     }
     409             : 
     410             :     /*
     411             :      * Begin serial/leader tuplesort.
     412             :      *
     413             :      * In cases where parallelism is involved, the leader receives the same
     414             :      * share of maintenance_work_mem as a serial sort (it is generally treated
     415             :      * in the same way as a serial sort once we return).  Parallel worker
     416             :      * Tuplesortstates will have received only a fraction of
     417             :      * maintenance_work_mem, though.
     418             :      *
     419             :      * We rely on the lifetime of the Leader Tuplesortstate almost not
     420             :      * overlapping with any worker Tuplesortstate's lifetime.  There may be
     421             :      * some small overlap, but that's okay because we rely on leader
     422             :      * Tuplesortstate only allocating a small, fixed amount of memory here.
     423             :      * When its tuplesort_performsort() is called (by our caller), and
     424             :      * significant amounts of memory are likely to be used, all workers must
     425             :      * have already freed almost all memory held by their Tuplesortstates
     426             :      * (they are about to go away completely, too).  The overall effect is
     427             :      * that maintenance_work_mem always represents an absolute high watermark
     428             :      * on the amount of memory used by a CREATE INDEX operation, regardless of
     429             :      * the use of parallelism or any other factor.
     430             :      */
     431      153052 :     buildstate->spool->sortstate =
     432       76526 :         tuplesort_begin_index_btree(heap, index, buildstate->isunique,
     433             :                                     maintenance_work_mem, coordinate,
     434             :                                     false);
     435             : 
     436             :     /*
     437             :      * If building a unique index, put dead tuples in a second spool to keep
     438             :      * them out of the uniqueness check.  We expect that the second spool (for
     439             :      * dead tuples) won't get very full, so we give it only work_mem.
     440             :      */
     441       76526 :     if (indexInfo->ii_Unique)
     442             :     {
     443       68082 :         BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
     444       68082 :         SortCoordinate coordinate2 = NULL;
     445             : 
     446             :         /* Initialize secondary spool */
     447       68082 :         btspool2->heap = heap;
     448       68082 :         btspool2->index = index;
     449       68082 :         btspool2->isunique = false;
     450             :         /* Save as secondary spool */
     451       68082 :         buildstate->spool2 = btspool2;
     452             : 
     453       68082 :         if (buildstate->btleader)
     454             :         {
     455             :             /*
     456             :              * Set up non-private state that is passed to
     457             :              * tuplesort_begin_index_btree() about the basic high level
     458             :              * coordination of a parallel sort.
     459             :              */
     460          36 :             coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     461          36 :             coordinate2->isWorker = false;
     462          36 :             coordinate2->nParticipants =
     463          36 :                 buildstate->btleader->nparticipanttuplesorts;
     464          36 :             coordinate2->sharedsort = buildstate->btleader->sharedsort2;
     465             :         }
     466             : 
     467             :         /*
      468             :          * As noted above, the dead-tuple spool is not expected to get very
      469             :          * full, so it is given only work_mem.
     470             :          */
     471       68082 :         buildstate->spool2->sortstate =
     472       68082 :             tuplesort_begin_index_btree(heap, index, false, work_mem,
     473             :                                         coordinate2, false);
     474             :     }
     475             : 
     476             :     /* Fill spool using either serial or parallel heap scan */
     477       76526 :     if (!buildstate->btleader)
     478       76438 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     479             :                                            _bt_build_callback, (void *) buildstate,
     480             :                                            NULL);
     481             :     else
     482          88 :         reltuples = _bt_parallel_heapscan(buildstate,
     483             :                                           &indexInfo->ii_BrokenHotChain);
     484             : 
     485             :     /*
     486             :      * Set the progress target for the next phase.  Reset the block number
      487             :      * values set by table_index_build_scan.
     488             :      */
     489             :     {
     490       76522 :         const int   index[] = {
     491             :             PROGRESS_CREATEIDX_TUPLES_TOTAL,
     492             :             PROGRESS_SCAN_BLOCKS_TOTAL,
     493             :             PROGRESS_SCAN_BLOCKS_DONE
     494             :         };
     495       76522 :         const int64 val[] = {
     496       76522 :             buildstate->indtuples,
     497             :             0, 0
     498             :         };
     499             : 
     500       76522 :         pgstat_progress_update_multi_param(3, index, val);
     501             :     }
     502             : 
     503             :     /* okay, all heap tuples are spooled */
     504       76522 :     if (buildstate->spool2 && !buildstate->havedead)
     505             :     {
     506             :         /* spool2 turns out to be unnecessary */
     507       68070 :         _bt_spooldestroy(buildstate->spool2);
     508       68070 :         buildstate->spool2 = NULL;
     509             :     }
     510             : 
     511       76522 :     return reltuples;
     512             : }
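
/*
 * To put numbers on the memory discussion above (64MB is just an assumed
 * setting): with maintenance_work_mem = 64MB and three participant
 * tuplesorts, each participant's scan-and-sort is started with about
 * 64MB / 3 ~= 21MB, while the leader's merge Tuplesortstate begun in this
 * function gets the full 64MB.  Because workers free their memory before
 * the leader's merge does significant work, the whole build stays near the
 * 64MB high watermark.  The per-participant split is made in
 * _bt_begin_parallel (outside this excerpt), essentially:
 *
 *     sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
 */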
     513             : 
     514             : /*
     515             :  * clean up a spool structure and its substructures.
     516             :  */
     517             : static void
     518      144560 : _bt_spooldestroy(BTSpool *btspool)
     519             : {
     520      144560 :     tuplesort_end(btspool->sortstate);
     521      144560 :     pfree(btspool);
     522      144560 : }
     523             : 
     524             : /*
     525             :  * spool an index entry into the sort file.
     526             :  */
     527             : static void
     528    15761002 : _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
     529             : {
     530    15761002 :     tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
     531             :                                   self, values, isnull);
     532    15761002 : }
     533             : 
     534             : /*
     535             :  * given a spool loaded by successive calls to _bt_spool,
     536             :  * create an entire btree.
     537             :  */
     538             : static void
     539       76522 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     540             : {
     541             :     BTWriteState wstate;
     542             : 
     543             : #ifdef BTREE_BUILD_STATS
     544             :     if (log_btree_build_stats)
     545             :     {
     546             :         ShowUsage("BTREE BUILD (Spool) STATISTICS");
     547             :         ResetUsage();
     548             :     }
     549             : #endif                          /* BTREE_BUILD_STATS */
     550             : 
     551       76522 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     552             :                                  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
     553       76522 :     tuplesort_performsort(btspool->sortstate);
     554       76478 :     if (btspool2)
     555             :     {
     556          12 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     557             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_2);
     558          12 :         tuplesort_performsort(btspool2->sortstate);
     559             :     }
     560             : 
     561       76478 :     wstate.heap = btspool->heap;
     562       76478 :     wstate.index = btspool->index;
     563       76478 :     wstate.inskey = _bt_mkscankey(wstate.index, NULL);
     564             :     /* _bt_mkscankey() won't set allequalimage without metapage */
     565       76478 :     wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
     566       76478 :     wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
     567             : 
     568             :     /* reserve the metapage */
     569       76478 :     wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
     570       76478 :     wstate.btws_pages_written = 0;
     571       76478 :     wstate.btws_zeropage = NULL;    /* until needed */
     572             : 
     573       76478 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     574             :                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
     575       76478 :     _bt_load(&wstate, btspool, btspool2);
     576       76478 : }
     577             : 
     578             : /*
     579             :  * Per-tuple callback for table_index_build_scan
     580             :  */
     581             : static void
     582    15761002 : _bt_build_callback(Relation index,
     583             :                    ItemPointer tid,
     584             :                    Datum *values,
     585             :                    bool *isnull,
     586             :                    bool tupleIsAlive,
     587             :                    void *state)
     588             : {
     589    15761002 :     BTBuildState *buildstate = (BTBuildState *) state;
     590             : 
     591             :     /*
     592             :      * insert the index tuple into the appropriate spool file for subsequent
     593             :      * processing
     594             :      */
     595    15761002 :     if (tupleIsAlive || buildstate->spool2 == NULL)
     596    15760572 :         _bt_spool(buildstate->spool, tid, values, isnull);
     597             :     else
     598             :     {
     599             :         /* dead tuples are put into spool2 */
     600         430 :         buildstate->havedead = true;
     601         430 :         _bt_spool(buildstate->spool2, tid, values, isnull);
     602             :     }
     603             : 
     604    15761002 :     buildstate->indtuples += 1;
     605    15761002 : }
     606             : 
     607             : /*
     608             :  * allocate workspace for a new, clean btree page, not linked to any siblings.
     609             :  */
     610             : static Page
     611       81108 : _bt_blnewpage(uint32 level)
     612             : {
     613             :     Page        page;
     614             :     BTPageOpaque opaque;
     615             : 
     616       81108 :     page = (Page) palloc(BLCKSZ);
     617             : 
     618             :     /* Zero the page and set up standard page header info */
     619       81108 :     _bt_pageinit(page, BLCKSZ);
     620             : 
     621             :     /* Initialize BT opaque state */
     622       81108 :     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
     623       81108 :     opaque->btpo_prev = opaque->btpo_next = P_NONE;
     624       81108 :     opaque->btpo.level = level;
     625       81108 :     opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     626       81108 :     opaque->btpo_cycleid = 0;
     627             : 
     628             :     /* Make the P_HIKEY line pointer appear allocated */
     629       81108 :     ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
     630             : 
     631       81108 :     return page;
     632             : }
     633             : 
     634             : /*
     635             :  * emit a completed btree page, and release the working storage.
     636             :  */
     637             : static void
     638      157586 : _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
     639             : {
     640             :     /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
     641      157586 :     RelationOpenSmgr(wstate->index);
     642             : 
     643             :     /* XLOG stuff */
     644      157586 :     if (wstate->btws_use_wal)
     645             :     {
     646             :         /* We use the XLOG_FPI record type for this */
     647      136610 :         log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
     648             :     }
     649             : 
     650             :     /*
     651             :      * If we have to write pages nonsequentially, fill in the space with
     652             :      * zeroes until we come back and overwrite.  This is not logically
     653             :      * necessary on standard Unix filesystems (unwritten space will read as
     654             :      * zeroes anyway), but it should help to avoid fragmentation. The dummy
     655             :      * pages aren't WAL-logged though.
     656             :      */
     657      184808 :     while (blkno > wstate->btws_pages_written)
     658             :     {
     659       27222 :         if (!wstate->btws_zeropage)
     660       23196 :             wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
     661             :         /* don't set checksum for all-zero page */
     662       27222 :         smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
     663       27222 :                    wstate->btws_pages_written++,
     664       27222 :                    (char *) wstate->btws_zeropage,
     665             :                    true);
     666             :     }
     667             : 
     668      157586 :     PageSetChecksumInplace(page, blkno);
     669             : 
     670             :     /*
     671             :      * Now write the page.  There's no need for smgr to schedule an fsync for
     672             :      * this write; we'll do it ourselves before ending the build.
     673             :      */
     674      157586 :     if (blkno == wstate->btws_pages_written)
     675             :     {
     676             :         /* extending the file... */
     677      130364 :         smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
     678             :                    (char *) page, true);
     679      130364 :         wstate->btws_pages_written++;
     680             :     }
     681             :     else
     682             :     {
     683             :         /* overwriting a block we zero-filled before */
     684       27222 :         smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
     685             :                   (char *) page, true);
     686             :     }
     687             : 
     688      157586 :     pfree(page);
     689      157586 : }
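
/*
 * A concrete (hypothetical) write sequence: block 0 is reserved for the
 * metapage but written last, so the first finished leaf (block 1) first
 * zero-fills block 0 via the smgrextend() loop above.  If a parent page is
 * then allocated block 2 but the next leaf (block 3) finishes first, block
 * 2 is zero-filled the same way, and the parent's eventual write takes the
 * smgrwrite() overwrite path.
 */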
     690             : 
     691             : /*
     692             :  * allocate and initialize a new BTPageState.  the returned structure
     693             :  * is suitable for immediate use by _bt_buildadd.
     694             :  */
     695             : static BTPageState *
     696       29362 : _bt_pagestate(BTWriteState *wstate, uint32 level)
     697             : {
     698       29362 :     BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
     699             : 
     700             :     /* create initial page for level */
     701       29362 :     state->btps_page = _bt_blnewpage(level);
     702             : 
     703             :     /* and assign it a page position */
     704       29362 :     state->btps_blkno = wstate->btws_pages_alloced++;
     705             : 
     706       29362 :     state->btps_lowkey = NULL;
     707             :     /* initialize lastoff so first item goes into P_FIRSTKEY */
     708       29362 :     state->btps_lastoff = P_HIKEY;
     709       29362 :     state->btps_lastextra = 0;
     710       29362 :     state->btps_level = level;
     711             :     /* set "full" threshold based on level.  See notes at head of file. */
     712       29362 :     if (level > 0)
     713        6166 :         state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
     714             :     else
     715       23196 :         state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
     716             : 
     717             :     /* no parent level, yet */
     718       29362 :     state->btps_next = NULL;
     719             : 
     720       29362 :     return state;
     721             : }
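
/*
 * Plugging in the defaults (assuming BLCKSZ = 8192): an internal page is
 * treated as "full" once free space drops below
 * 8192 * (100 - 70) / 100 = 2457 bytes, and a leaf page, at the default
 * fillfactor of 90, once free space drops below 8192 * 10 / 100 = 819
 * bytes (BTGetTargetPageFreeSpace() derives the latter from the index's
 * fillfactor reloption).
 */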
     722             : 
     723             : /*
     724             :  * slide an array of ItemIds back one slot (from P_FIRSTKEY to
     725             :  * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
     726             :  * that we have built an ItemId array in what has turned out to be a
     727             :  * P_RIGHTMOST page.
     728             :  */
     729             : static void
     730       29362 : _bt_slideleft(Page page)
     731             : {
     732             :     OffsetNumber off;
     733             :     OffsetNumber maxoff;
     734             :     ItemId      previi;
     735             :     ItemId      thisii;
     736             : 
     737       29362 :     if (!PageIsEmpty(page))
     738             :     {
     739       29362 :         maxoff = PageGetMaxOffsetNumber(page);
     740       29362 :         previi = PageGetItemId(page, P_HIKEY);
     741     1919732 :         for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
     742             :         {
     743     1890370 :             thisii = PageGetItemId(page, off);
     744     1890370 :             *previi = *thisii;
     745     1890370 :             previi = thisii;
     746             :         }
     747       29362 :         ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
     748             :     }
     749       29362 : }
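
/*
 * For example, on a hypothetical three-item rightmost page: on entry the
 * line pointer array is [P_HIKEY: unused][P_FIRSTKEY: item1][item2][item3];
 * after the slide it is [P_HIKEY: item1][item2][item3], with pd_lower
 * reduced by one ItemIdData.  Only line pointers move; the tuples
 * themselves stay put.
 */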
     750             : 
     751             : /*
     752             :  * Add an item to a page being built.
     753             :  *
     754             :  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
      755             :  * raises an error directly rather than returning failure to the caller.
     756             :  *
     757             :  * Note that our nbtsort.c caller does not know yet if the page will be
      758             :  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key
      759             :  * by the caller.  A page that turns out to be the rightmost on its level is
      760             :  * fixed by calling _bt_slideleft().
     761             :  */
     762             : static void
     763    15140292 : _bt_sortaddtup(Page page,
     764             :                Size itemsize,
     765             :                IndexTuple itup,
     766             :                OffsetNumber itup_off,
     767             :                bool newfirstdataitem)
     768             : {
     769             :     IndexTupleData trunctuple;
     770             : 
     771    15140292 :     if (newfirstdataitem)
     772             :     {
     773        6256 :         trunctuple = *itup;
     774        6256 :         trunctuple.t_info = sizeof(IndexTupleData);
     775        6256 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
     776        6256 :         itup = &trunctuple;
     777        6256 :         itemsize = sizeof(IndexTupleData);
     778             :     }
     779             : 
     780    15140292 :     if (PageAddItem(page, (Item) itup, itemsize, itup_off,
     781             :                     false, false) == InvalidOffsetNumber)
     782           0 :         elog(ERROR, "failed to add item to the index page");
     783    15140292 : }
     784             : 
     785             : /*----------
     786             :  * Add an item to a disk page from the sort output (or add a posting list
     787             :  * item formed from the sort output).
     788             :  *
     789             :  * We must be careful to observe the page layout conventions of nbtsearch.c:
     790             :  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
     791             :  * - on non-leaf pages, the key portion of the first item need not be
     792             :  *   stored, we should store only the link.
     793             :  *
     794             :  * A leaf page being built looks like:
     795             :  *
     796             :  * +----------------+---------------------------------+
     797             :  * | PageHeaderData | linp0 linp1 linp2 ...           |
     798             :  * +-----------+----+---------------------------------+
     799             :  * | ... linpN |                                      |
     800             :  * +-----------+--------------------------------------+
     801             :  * |     ^ last                                       |
     802             :  * |                                                  |
     803             :  * +-------------+------------------------------------+
     804             :  * |             | itemN ...                          |
     805             :  * +-------------+------------------+-----------------+
     806             :  * |          ... item3 item2 item1 | "special space" |
     807             :  * +--------------------------------+-----------------+
     808             :  *
     809             :  * Contrast this with the diagram in bufpage.h; note the mismatch
     810             :  * between linps and items.  This is because we reserve linp0 as a
     811             :  * placeholder for the pointer to the "high key" item; when we have
     812             :  * filled up the page, we will set linp0 to point to itemN and clear
     813             :  * linpN.  On the other hand, if we find this is the last (rightmost)
     814             :  * page, we leave the items alone and slide the linp array over.  If
     815             :  * the high key is to be truncated, offset 1 is deleted, and we insert
     816             :  * the truncated high key at offset 1.
     817             :  *
     818             :  * 'last' pointer indicates the last offset added to the page.
     819             :  *
     820             :  * 'truncextra' is the size of the posting list in itup, if any.  This
     821             :  * information is stashed for the next call here, when we may benefit
     822             :  * from considering the impact of truncating away the posting list on
     823             :  * the page before deciding to finish the page off.  Posting lists are
     824             :  * often relatively large, so it is worth going to the trouble of
     825             :  * accounting for the saving from truncating away the posting list of
     826             :  * the tuple that becomes the high key (that may be the only way to
     827             :  * get close to target free space on the page).  Note that this is
     828             :  * only used for the soft fillfactor-wise limit, not the critical hard
     829             :  * limit.
     830             :  *----------
     831             :  */
     832             : static void
     833    15088546 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
     834             :              Size truncextra)
     835             : {
     836             :     Page        npage;
     837             :     BlockNumber nblkno;
     838             :     OffsetNumber last_off;
     839             :     Size        last_truncextra;
     840             :     Size        pgspc;
     841             :     Size        itupsz;
     842             :     bool        isleaf;
     843             : 
     844             :     /*
     845             :      * This is a handy place to check for cancel interrupts during the btree
     846             :      * load phase of index creation.
     847             :      */
     848    15088546 :     CHECK_FOR_INTERRUPTS();
     849             : 
     850    15088546 :     npage = state->btps_page;
     851    15088546 :     nblkno = state->btps_blkno;
     852    15088546 :     last_off = state->btps_lastoff;
     853    15088546 :     last_truncextra = state->btps_lastextra;
     854    15088546 :     state->btps_lastextra = truncextra;
     855             : 
     856    15088546 :     pgspc = PageGetFreeSpace(npage);
     857    15088546 :     itupsz = IndexTupleSize(itup);
     858    15088546 :     itupsz = MAXALIGN(itupsz);
     859             :     /* Leaf case has slightly different rules due to suffix truncation */
     860    15088546 :     isleaf = (state->btps_level == 0);
     861             : 
     862             :     /*
     863             :      * Check whether the new item can fit on a btree page on current level at
     864             :      * all.
     865             :      *
     866             :      * Every newly built index will treat heap TID as part of the keyspace,
     867             :      * which imposes the requirement that new high keys must occasionally have
     868             :      * a heap TID appended within _bt_truncate().  That may leave a new pivot
     869             :      * tuple one or two MAXALIGN() quantums larger than the original
     870             :      * firstright tuple it's derived from.  v4 deals with the problem by
     871             :      * decreasing the limit on the size of tuples inserted on the leaf level
     872             :      * by the same small amount.  Enforce the new v4+ limit on the leaf level,
     873             :      * and the old limit on internal levels, since pivot tuples may need to
     874             :      * make use of the reserved space.  This should never fail on internal
     875             :      * pages.
     876             :      */
     877    15088546 :     if (unlikely(itupsz > BTMaxItemSize(npage)))
     878         176 :         _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
     879             :                              itup);
     880             : 
     881             :     /*
     882             :      * Check to see if current page will fit new item, with space left over to
     883             :      * append a heap TID during suffix truncation when page is a leaf page.
     884             :      *
     885             :      * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
     886             :      * high key with heap TID when finishing off a leaf page, since we rely on
     887             :      * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
     888             :      * internal pages we can always fit 3 pivot tuples with larger internal
     889             :      * page tuple limit (includes page high key).
     890             :      *
     891             :      * Most of the time, a page is only "full" in the sense that the soft
     892             :      * fillfactor-wise limit has been exceeded.  However, we must always leave
     893             :      * at least two items plus a high key on each page before starting a new
     894             :      * page.  Disregard fillfactor and insert on "full" current page if we
     895             :      * don't have the minimum number of items yet.  (Note that we deliberately
     896             :      * assume that suffix truncation neither enlarges nor shrinks new high key
     897             :      * when applying soft limit, except when last tuple has a posting list.)
     898             :      */
     899             :     Assert(last_truncextra == 0 || isleaf);
     900    15088546 :     if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
     901    15087826 :         (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
     902             :     {
     903             :         /*
     904             :          * Finish off the page and write it out.
     905             :          */
     906       51746 :         Page        opage = npage;
     907       51746 :         BlockNumber oblkno = nblkno;
     908             :         ItemId      ii;
     909             :         ItemId      hii;
     910             :         IndexTuple  oitup;
     911             : 
     912             :         /* Create new page of same level */
     913       51746 :         npage = _bt_blnewpage(state->btps_level);
     914             : 
     915             :         /* and assign it a page position */
     916       51746 :         nblkno = wstate->btws_pages_alloced++;
     917             : 
     918             :         /*
     919             :          * We copy the last item on the page into the new page, and then
     920             :          * rearrange the old page so that the 'last item' becomes its high key
     921             :          * rather than a true data item.  There had better be at least two
     922             :          * items on the page already, else the page would be empty of useful
     923             :          * data.
     924             :          */
     925             :         Assert(last_off > P_FIRSTKEY);
     926       51746 :         ii = PageGetItemId(opage, last_off);
     927       51746 :         oitup = (IndexTuple) PageGetItem(opage, ii);
     928       51746 :         _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
     929       51746 :                        !isleaf);
     930             : 
     931             :         /*
     932             :          * Move 'last' into the high key position on opage.  _bt_blnewpage()
     933             :          * allocated empty space for a line pointer when opage was first
     934             :          * created, so this is a matter of rearranging already-allocated space
     935             :          * on page, and initializing high key line pointer. (Actually, leaf
     936             :          * pages must also swap oitup with a truncated version of oitup, which
     937             :          * is sometimes larger than oitup, though never by more than the space
     938             :          * needed to append a heap TID.)
     939             :          */
     940       51746 :         hii = PageGetItemId(opage, P_HIKEY);
     941       51746 :         *hii = *ii;
     942       51746 :         ItemIdSetUnused(ii);    /* redundant */
     943       51746 :         ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
     944             : 
     945       51746 :         if (isleaf)
     946             :         {
     947             :             IndexTuple  lastleft;
     948             :             IndexTuple  truncated;
     949             : 
     950             :             /*
     951             :              * Truncate away any unneeded attributes from high key on leaf
     952             :              * level.  This is only done at the leaf level because downlinks
     953             :              * in internal pages are either negative infinity items, or get
     954             :              * their contents from copying from one level down.  See also:
     955             :              * _bt_split().
     956             :              *
     957             :              * We don't try to bias our choice of split point to make it more
     958             :              * likely that _bt_truncate() can truncate away more attributes,
     959             :              * whereas the split point used within _bt_split() is chosen much
      960             :              * more delicately.  Even so, the lastleft and firstright
     961             :              * tuples passed to _bt_truncate() here are at least not fully
     962             :              * equal to each other when deduplication is used, unless there is
     963             :              * a large group of duplicates (also, unique index builds usually
     964             :              * have few or no spool2 duplicates).  When the split point is
     965             :              * between two unequal tuples, _bt_truncate() will avoid including
     966             :              * a heap TID in the new high key, which is the most important
     967             :              * benefit of suffix truncation.
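                      :              *
                      :              * Hypothetical illustration: in an index on (city text, id int4),
                      :              * with lastleft = ('madrid', 100) and oitup = ('rome', 1),
                      :              * _bt_truncate() can return a single-attribute high key ('rome')
                      :              * with no heap TID, because the first attribute alone already
                      :              * separates the two pages.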
     968             :              *
     969             :              * Overwrite the old item with new truncated high key directly.
     970             :              * oitup is already located at the physical beginning of tuple
     971             :              * space, so this should directly reuse the existing tuple space.
     972             :              */
     973       51656 :             ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
     974       51656 :             lastleft = (IndexTuple) PageGetItem(opage, ii);
     975             : 
     976             :             Assert(IndexTupleSize(oitup) > last_truncextra);
     977       51656 :             truncated = _bt_truncate(wstate->index, lastleft, oitup,
     978             :                                      wstate->inskey);
     979       51656 :             if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
     980       51656 :                                          IndexTupleSize(truncated)))
     981           0 :                 elog(ERROR, "failed to add high key to the index page");
     982       51656 :             pfree(truncated);
     983             : 
     984             :             /* oitup should continue to point to the page's high key */
     985       51656 :             hii = PageGetItemId(opage, P_HIKEY);
     986       51656 :             oitup = (IndexTuple) PageGetItem(opage, hii);
     987             :         }
     988             : 
     989             :         /*
     990             :          * Link the old page into its parent, using its low key.  If we don't
     991             :          * have a parent, we have to create one; this adds a new btree level.
     992             :          */
     993       51746 :         if (state->btps_next == NULL)
     994        6166 :             state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
     995             : 
     996             :         Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
     997             :                 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
     998             :                 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
     999             :                P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
    1000             :         Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
    1001             :                !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
    1002       51746 :         BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
    1003       51746 :         _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
    1004       51746 :         pfree(state->btps_lowkey);
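                      : 
                      :         /*
                      :          * (The _bt_buildadd() call above recurses one level up: if the
                      :          * parent page fills, the same code splits it and recurses again,
                      :          * adding tree levels as required.)
                      :          */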
    1005             : 
    1006             :         /*
    1007             :          * Save a copy of the high key from the old page.  It is also the low
    1008             :          * key for the new page.
    1009             :          */
    1010       51746 :         state->btps_lowkey = CopyIndexTuple(oitup);
    1011             : 
    1012             :         /*
    1013             :          * Set the sibling links for both pages.
    1014             :          */
    1015             :         {
    1016       51746 :             BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
    1017       51746 :             BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
    1018             : 
    1019       51746 :             oopaque->btpo_next = nblkno;
    1020       51746 :             nopaque->btpo_prev = oblkno;
    1021       51746 :             nopaque->btpo_next = P_NONE; /* redundant */
    1022             :         }
    1023             : 
    1024             :         /*
    1025             :          * Write out the old page.  We never need to touch it again, so we can
    1026             :          * free the opage workspace too.
    1027             :          */
    1028       51746 :         _bt_blwritepage(wstate, opage, oblkno);
    1029             : 
    1030             :         /*
     1031             :          * Reset last_off to point to the new page
    1032             :          */
    1033       51746 :         last_off = P_FIRSTKEY;
    1034             :     }
    1035             : 
    1036             :     /*
     1037             :      * By here, either the original page is still the current page, or a new
     1038             :      * page was created that became the current page.  Either way, the
     1039             :      * current page definitely has space for the new item.
    1040             :      *
    1041             :      * If the new item is the first for its page, it must also be the first
    1042             :      * item on its entire level.  On later same-level pages, a low key for a
    1043             :      * page will be copied from the prior page in the code above.  Generate a
    1044             :      * minus infinity low key here instead.
    1045             :      */
    1046    15088546 :     if (last_off == P_HIKEY)
    1047             :     {
    1048             :         Assert(state->btps_lowkey == NULL);
    1049       29362 :         state->btps_lowkey = palloc0(sizeof(IndexTupleData));
    1050       29362 :         state->btps_lowkey->t_info = sizeof(IndexTupleData);
    1051       29362 :         BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
    1052             :     }
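                      : 
                      :     /*
                      :      * (The low key built above is a header-only tuple: t_info records just
                      :      * sizeof(IndexTupleData), and zero attributes is the btree
                      :      * representation of "minus infinity".)
                      :      */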
    1053             : 
    1054             :     /*
    1055             :      * Add the new item into the current page.
    1056             :      */
    1057    15088546 :     last_off = OffsetNumberNext(last_off);
    1058    15088546 :     _bt_sortaddtup(npage, itupsz, itup, last_off,
    1059    15088546 :                    !isleaf && last_off == P_FIRSTKEY);
    1060             : 
    1061    15088546 :     state->btps_page = npage;
    1062    15088546 :     state->btps_blkno = nblkno;
    1063    15088546 :     state->btps_lastoff = last_off;
    1064    15088546 : }
    1065             : 
    1066             : /*
    1067             :  * Finalize pending posting list tuple, and add it to the index.  Final tuple
    1068             :  * is based on saved base tuple, and saved list of heap TIDs.
    1069             :  *
    1070             :  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
    1071             :  * using _bt_buildadd().
    1072             :  */
    1073             : static void
    1074     6005926 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
    1075             :                               BTDedupState dstate)
    1076             : {
    1077             :     Assert(dstate->nitems > 0);
    1078             : 
    1079     6005926 :     if (dstate->nitems == 1)
    1080     5994314 :         _bt_buildadd(wstate, state, dstate->base, 0);
    1081             :     else
    1082             :     {
    1083             :         IndexTuple  postingtuple;
    1084             :         Size        truncextra;
    1085             : 
    1086             :         /* form a tuple with a posting list */
    1087       11612 :         postingtuple = _bt_form_posting(dstate->base,
    1088             :                                         dstate->htids,
    1089             :                                         dstate->nhtids);
    1090             :         /* Calculate posting list overhead */
    1091       23224 :         truncextra = IndexTupleSize(postingtuple) -
    1092       11612 :             BTreeTupleGetPostingOffset(postingtuple);
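                      : 
                      :         /*
                      :          * (Illustrative arithmetic: with nhtids = 4, the posting list holds
                      :          * 4 * sizeof(ItemPointerData) = 24 bytes of heap TIDs, so truncextra
                      :          * is 24 plus whatever MAXALIGN padding the total tuple size
                      :          * includes.)
                      :          */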
    1093             : 
    1094       11612 :         _bt_buildadd(wstate, state, postingtuple, truncextra);
    1095       11612 :         pfree(postingtuple);
    1096             :     }
    1097             : 
    1098     6005926 :     dstate->nhtids = 0;
    1099     6005926 :     dstate->nitems = 0;
    1100     6005926 :     dstate->phystupsize = 0;
    1101     6005926 : }
    1102             : 
    1103             : /*
    1104             :  * Finish writing out the completed btree.
    1105             :  */
    1106             : static void
    1107       76478 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
    1108             : {
    1109             :     BTPageState *s;
    1110       76478 :     BlockNumber rootblkno = P_NONE;
    1111       76478 :     uint32      rootlevel = 0;
    1112             :     Page        metapage;
    1113             : 
    1114             :     /*
    1115             :      * Each iteration of this loop completes one more level of the tree.
    1116             :      */
    1117      105840 :     for (s = state; s != NULL; s = s->btps_next)
    1118             :     {
    1119             :         BlockNumber blkno;
    1120             :         BTPageOpaque opaque;
    1121             : 
    1122       29362 :         blkno = s->btps_blkno;
    1123       29362 :         opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
    1124             : 
    1125             :         /*
     1126             :          * We have to link the last page on this level somewhere.
    1127             :          *
    1128             :          * If we're at the top, it's the root, so attach it to the metapage.
    1129             :          * Otherwise, add an entry for it to its parent using its low key.
    1130             :          * This may cause the last page of the parent level to split, but
    1131             :          * that's not a problem -- we haven't gotten to it yet.
    1132             :          */
    1133       29362 :         if (s->btps_next == NULL)
    1134             :         {
    1135       23196 :             opaque->btpo_flags |= BTP_ROOT;
    1136       23196 :             rootblkno = blkno;
    1137       23196 :             rootlevel = s->btps_level;
    1138             :         }
    1139             :         else
    1140             :         {
    1141             :             Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
    1142             :                     IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1143             :                     BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
    1144             :                    P_LEFTMOST(opaque));
    1145             :             Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
    1146             :                    !P_LEFTMOST(opaque));
    1147        6166 :             BTreeTupleSetDownLink(s->btps_lowkey, blkno);
    1148        6166 :             _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
    1149        6166 :             pfree(s->btps_lowkey);
    1150        6166 :             s->btps_lowkey = NULL;
    1151             :         }
    1152             : 
    1153             :         /*
    1154             :          * This is the rightmost page, so the ItemId array needs to be slid
    1155             :          * back one slot.  Then we can dump out the page.
    1156             :          */
    1157       29362 :         _bt_slideleft(s->btps_page);
    1158       29362 :         _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
    1159       29362 :         s->btps_page = NULL; /* writepage freed the workspace */
    1160             :     }
    1161             : 
    1162             :     /*
    1163             :      * As the last step in the process, construct the metapage and make it
    1164             :      * point to the new root (unless we had no data at all, in which case it's
    1165             :      * set to point to "P_NONE").  This changes the index to the "valid" state
    1166             :      * by filling in a valid magic number in the metapage.
    1167             :      */
    1168       76478 :     metapage = (Page) palloc(BLCKSZ);
    1169       76478 :     _bt_initmetapage(metapage, rootblkno, rootlevel,
    1170       76478 :                      wstate->inskey->allequalimage);
    1171       76478 :     _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
    1172       76478 : }
    1173             : 
    1174             : /*
    1175             :  * Read tuples in correct sort order from tuplesort, and load them into
    1176             :  * btree leaves.
    1177             :  */
    1178             : static void
    1179       76478 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
    1180             : {
    1181       76478 :     BTPageState *state = NULL;
    1182       76478 :     bool        merge = (btspool2 != NULL);
    1183             :     IndexTuple  itup,
    1184       76478 :                 itup2 = NULL;
    1185             :     bool        load1;
    1186       76478 :     TupleDesc   tupdes = RelationGetDescr(wstate->index);
    1187             :     int         i,
    1188       76478 :                 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
    1189             :     SortSupport sortKeys;
    1190       76478 :     int64       tuples_done = 0;
    1191             :     bool        deduplicate;
    1192             : 
    1193       84468 :     deduplicate = wstate->inskey->allequalimage &&
    1194        7990 :         BTGetDeduplicateItems(wstate->index);
    1195             : 
    1196       76478 :     if (merge)
    1197             :     {
    1198             :         /*
    1199             :          * Another BTSpool for dead tuples exists. Now we have to merge
    1200             :          * btspool and btspool2.
    1201             :          */
    1202             : 
     1203             :         /* prepare for the merge */
    1204          12 :         itup = tuplesort_getindextuple(btspool->sortstate, true);
    1205          12 :         itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1206             : 
    1207             :         /* Prepare SortSupport data for each column */
    1208          12 :         sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
    1209             : 
    1210          26 :         for (i = 0; i < keysz; i++)
    1211             :         {
    1212          14 :             SortSupport sortKey = sortKeys + i;
    1213          14 :             ScanKey     scanKey = wstate->inskey->scankeys + i;
    1214             :             int16       strategy;
    1215             : 
    1216          14 :             sortKey->ssup_cxt = CurrentMemoryContext;
    1217          14 :             sortKey->ssup_collation = scanKey->sk_collation;
    1218          14 :             sortKey->ssup_nulls_first =
    1219          14 :                 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
    1220          14 :             sortKey->ssup_attno = scanKey->sk_attno;
    1221             :             /* Abbreviation is not supported here */
    1222          14 :             sortKey->abbreviate = false;
    1223             : 
    1224             :             AssertState(sortKey->ssup_attno != 0);
    1225             : 
    1226          14 :             strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
    1227             :                 BTGreaterStrategyNumber : BTLessStrategyNumber;
    1228             : 
    1229          14 :             PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
    1230             :         }
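                      : 
                      :         /*
                      :          * (ApplySortComparator() returns a negative, zero, or positive
                      :          * result for datum1 vs. datum2 under the prepared SortSupport; e.g.
                      :          * for an ascending int4 key, comparing 5 with 9 is negative, so the
                      :          * btspool tuple would be loaded first below.)
                      :          */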
    1231             : 
    1232             :         for (;;)
    1233             :         {
     1234        3000 :             load1 = true;       /* load from btspool next? */
    1235        3012 :             if (itup2 == NULL)
    1236             :             {
    1237         128 :                 if (itup == NULL)
    1238          12 :                     break;
    1239             :             }
    1240        2884 :             else if (itup != NULL)
    1241             :             {
    1242        2716 :                 int32       compare = 0;
    1243             : 
    1244        2920 :                 for (i = 1; i <= keysz; i++)
    1245             :                 {
    1246             :                     SortSupport entry;
    1247             :                     Datum       attrDatum1,
    1248             :                                 attrDatum2;
    1249             :                     bool        isNull1,
    1250             :                                 isNull2;
    1251             : 
    1252        2764 :                     entry = sortKeys + i - 1;
    1253        2764 :                     attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
    1254        2764 :                     attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
    1255             : 
    1256        2764 :                     compare = ApplySortComparator(attrDatum1, isNull1,
    1257             :                                                   attrDatum2, isNull2,
    1258             :                                                   entry);
    1259        2764 :                     if (compare > 0)
    1260             :                     {
    1261         252 :                         load1 = false;
    1262        2560 :                         break;
    1263             :                     }
    1264        2512 :                     else if (compare < 0)
    1265        2308 :                         break;
    1266             :                 }
    1267             : 
    1268             :                 /*
    1269             :                  * If key values are equal, we sort on ItemPointer.  This is
    1270             :                  * required for btree indexes, since heap TID is treated as an
    1271             :                  * implicit last key attribute in order to ensure that all
    1272             :                  * keys in the index are physically unique.
    1273             :                  */
    1274        2716 :                 if (compare == 0)
    1275             :                 {
    1276         156 :                     compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
    1277             :                     Assert(compare != 0);
    1278         156 :                     if (compare > 0)
    1279          10 :                         load1 = false;
    1280             :                 }
    1281             :             }
    1282             :             else
    1283         168 :                 load1 = false;
    1284             : 
     1285             :             /* When we see the first tuple, create the first index page */
    1286        3000 :             if (state == NULL)
    1287          12 :                 state = _bt_pagestate(wstate, 0);
    1288             : 
    1289        3000 :             if (load1)
    1290             :             {
    1291        2570 :                 _bt_buildadd(wstate, state, itup, 0);
    1292        2570 :                 itup = tuplesort_getindextuple(btspool->sortstate, true);
    1293             :             }
    1294             :             else
    1295             :             {
    1296         430 :                 _bt_buildadd(wstate, state, itup2, 0);
    1297         430 :                 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1298             :             }
    1299             : 
    1300             :             /* Report progress */
    1301        3000 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1302             :                                          ++tuples_done);
    1303             :         }
    1304          12 :         pfree(sortKeys);
    1305             :     }
    1306       76466 :     else if (deduplicate)
    1307             :     {
    1308             :         /* merge is unnecessary, deduplicate into posting lists */
    1309             :         BTDedupState dstate;
    1310             : 
    1311        7980 :         dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
    1312        7980 :         dstate->deduplicate = true; /* unused */
    1313        7980 :         dstate->maxpostingsize = 0; /* set later */
    1314             :         /* Metadata about base tuple of current pending posting list */
    1315        7980 :         dstate->base = NULL;
    1316        7980 :         dstate->baseoff = InvalidOffsetNumber;   /* unused */
    1317        7980 :         dstate->basetupsize = 0;
    1318             :         /* Metadata about current pending posting list TIDs */
    1319        7980 :         dstate->htids = NULL;
    1320        7980 :         dstate->nhtids = 0;
    1321        7980 :         dstate->nitems = 0;
    1322        7980 :         dstate->phystupsize = 0; /* unused */
    1323        7980 :         dstate->nintervals = 0; /* unused */
    1324             : 
    1325     6744082 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1326             :                                                true)) != NULL)
    1327             :         {
     1328             :             /* When we see the first tuple, create the first index page */
    1329     6736102 :             if (state == NULL)
    1330             :             {
    1331        1296 :                 state = _bt_pagestate(wstate, 0);
    1332             : 
    1333             :                 /*
     1334             :                  * Limit the size of posting list tuples to 1/10 of the space
     1335             :                  * we want to leave behind on the page, plus space for the final
     1336             :                  * item's line pointer.  This is equal to the space that we'd like to
    1337             :                  * leave behind on each leaf page when fillfactor is 90,
    1338             :                  * allowing us to get close to fillfactor% space utilization
    1339             :                  * when there happen to be a great many duplicates.  (This
    1340             :                  * makes higher leaf fillfactor settings ineffective when
    1341             :                  * building indexes that have many duplicates, but packing
    1342             :                  * leaf pages full with few very large tuples doesn't seem
    1343             :                  * like a useful goal.)
    1344             :                  */
    1345        1296 :                 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
    1346             :                     sizeof(ItemIdData);
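                      : 
                      :                 /*
                      :                  * (With the default BLCKSZ of 8192, and assuming MAXALIGN is
                      :                  * 8 and line pointers are 4 bytes, this works out to
                      :                  * MAXALIGN_DOWN(819) - 4 = 816 - 4 = 812 bytes.)
                      :                  */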
    1347             :                 Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
    1348             :                        dstate->maxpostingsize <= INDEX_SIZE_MASK);
    1349        1296 :                 dstate->htids = palloc(dstate->maxpostingsize);
    1350             : 
    1351             :                 /* start new pending posting list with itup copy */
    1352        1296 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1353             :                                         InvalidOffsetNumber);
    1354             :             }
    1355     6734806 :             else if (_bt_keep_natts_fast(wstate->index, dstate->base,
    1356      734768 :                                          itup) > keysz &&
    1357      734768 :                      _bt_dedup_save_htid(dstate, itup))
    1358             :             {
    1359             :                 /*
    1360             :                  * Tuple is equal to base tuple of pending posting list.  Heap
    1361             :                  * TID from itup has been saved in state.
    1362             :                  */
    1363             :             }
    1364             :             else
    1365             :             {
    1366             :                 /*
    1367             :                  * Tuple is not equal to pending posting list tuple, or
    1368             :                  * _bt_dedup_save_htid() opted to not merge current item into
    1369             :                  * pending posting list.
    1370             :                  */
    1371     6004630 :                 _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1372     6004630 :                 pfree(dstate->base);
    1373             : 
    1374             :                 /* start new pending posting list with itup copy */
    1375     6004630 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1376             :                                         InvalidOffsetNumber);
    1377             :             }
    1378             : 
    1379             :             /* Report progress */
    1380     6736102 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1381             :                                          ++tuples_done);
    1382             :         }
    1383             : 
    1384        7980 :         if (state)
    1385             :         {
    1386             :             /*
    1387             :              * Handle the last item (there must be a last item when the
    1388             :              * tuplesort returned one or more tuples)
    1389             :              */
    1390        1296 :             _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1391        1296 :             pfree(dstate->base);
    1392        1296 :             pfree(dstate->htids);
    1393             :         }
    1394             : 
    1395        7980 :         pfree(dstate);
    1396             :     }
    1397             :     else
    1398             :     {
    1399             :         /* merging and deduplication are both unnecessary */
    1400     9090194 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1401             :                                                true)) != NULL)
    1402             :         {
     1403             :             /* When we see the first tuple, create the first index page */
    1404     9021708 :             if (state == NULL)
    1405       21888 :                 state = _bt_pagestate(wstate, 0);
    1406             : 
    1407     9021708 :             _bt_buildadd(wstate, state, itup, 0);
    1408             : 
    1409             :             /* Report progress */
    1410     9021708 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1411             :                                          ++tuples_done);
    1412             :         }
    1413             :     }
    1414             : 
    1415             :     /* Close down final pages and write the metapage */
    1416       76478 :     _bt_uppershutdown(wstate, state);
    1417             : 
    1418             :     /*
     1419             :      * Even though we WAL-logged index pages, we must nonetheless fsync index files.
    1420             :      * Since we're building outside shared buffers, a CHECKPOINT occurring
    1421             :      * during the build has no way to flush the previously written data to
    1422             :      * disk (indeed it won't know the index even exists).  A crash later on
    1423             :      * would replay WAL from the checkpoint, therefore it wouldn't replay our
    1424             :      * earlier WAL entries. If we do not fsync those pages here, they might
    1425             :      * still not be on disk when the crash occurs.
    1426             :      */
    1427       76478 :     if (wstate->btws_use_wal)
    1428             :     {
    1429       70206 :         RelationOpenSmgr(wstate->index);
    1430       70206 :         smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
    1431             :     }
    1432       76478 : }
    1433             : 
    1434             : /*
    1435             :  * Create parallel context, and launch workers for leader.
    1436             :  *
    1437             :  * buildstate argument should be initialized (with the exception of the
    1438             :  * tuplesort state in spools, which may later be created based on shared
    1439             :  * state initially set up here).
    1440             :  *
     1441             :  * isconcurrent indicates whether the operation is CREATE INDEX CONCURRENTLY.
    1442             :  *
    1443             :  * request is the target number of parallel worker processes to launch.
    1444             :  *
    1445             :  * Sets buildstate's BTLeader, which caller must use to shut down parallel
    1446             :  * mode by passing it to _bt_end_parallel() at the very end of its index
    1447             :  * build.  If not even a single worker process can be launched, this is
    1448             :  * never set, and caller should proceed with a serial index build.
    1449             :  */
    1450             : static void
    1451          88 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
    1452             : {
    1453             :     ParallelContext *pcxt;
    1454             :     int         scantuplesortstates;
    1455             :     Snapshot    snapshot;
    1456             :     Size        estbtshared;
    1457             :     Size        estsort;
    1458             :     BTShared   *btshared;
    1459             :     Sharedsort *sharedsort;
    1460             :     Sharedsort *sharedsort2;
    1461          88 :     BTSpool    *btspool = buildstate->spool;
    1462          88 :     BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
    1463             :     WalUsage   *walusage;
    1464             :     BufferUsage *bufferusage;
    1465          88 :     bool        leaderparticipates = true;
    1466             :     char       *sharedquery;
    1467             :     int         querylen;
    1468             : 
    1469             : #ifdef DISABLE_LEADER_PARTICIPATION
    1470             :     leaderparticipates = false;
    1471             : #endif
    1472             : 
    1473             :     /*
    1474             :      * Enter parallel mode, and create context for parallel build of btree
    1475             :      * index
    1476             :      */
    1477          88 :     EnterParallelMode();
    1478             :     Assert(request > 0);
    1479          88 :     pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
    1480             :                                  request);
    1481             : 
    1482          88 :     scantuplesortstates = leaderparticipates ? request + 1 : request;
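                      :     /* (E.g. request = 2 with a participating leader gives 3 states.) */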
    1483             : 
    1484             :     /*
    1485             :      * Prepare for scan of the base relation.  In a normal index build, we use
    1486             :      * SnapshotAny because we must retrieve all tuples and do our own time
    1487             :      * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
    1488             :      * concurrent build, we take a regular MVCC snapshot and index whatever's
    1489             :      * live according to that.
    1490             :      */
    1491          88 :     if (!isconcurrent)
    1492          88 :         snapshot = SnapshotAny;
    1493             :     else
    1494           0 :         snapshot = RegisterSnapshot(GetTransactionSnapshot());
    1495             : 
    1496             :     /*
    1497             :      * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
    1498             :      * PARALLEL_KEY_TUPLESORT tuplesort workspace
    1499             :      */
    1500          88 :     estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
    1501          88 :     shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    1502          88 :     estsort = tuplesort_estimate_shared(scantuplesortstates);
    1503          88 :     shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1504             : 
    1505             :     /*
    1506             :      * Unique case requires a second spool, and so we may have to account for
    1507             :      * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
    1508             :      */
    1509          88 :     if (!btspool->isunique)
    1510          52 :         shm_toc_estimate_keys(&pcxt->estimator, 2);
    1511             :     else
    1512             :     {
    1513          36 :         shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1514          36 :         shm_toc_estimate_keys(&pcxt->estimator, 3);
    1515             :     }
    1516             : 
    1517             :     /*
    1518             :      * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
    1519             :      * and PARALLEL_KEY_BUFFER_USAGE.
    1520             :      *
    1521             :      * If there are no extensions loaded that care, we could skip this.  We
    1522             :      * have no way of knowing whether anyone's looking at pgWalUsage or
    1523             :      * pgBufferUsage, so do it unconditionally.
    1524             :      */
    1525          88 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1526             :                            mul_size(sizeof(WalUsage), pcxt->nworkers));
    1527          88 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1528          88 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1529             :                            mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1530          88 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1531             : 
    1532             :     /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    1533          88 :     querylen = strlen(debug_query_string);
    1534          88 :     shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
    1535          88 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1536             : 
    1537             :     /* Everyone's had a chance to ask for space, so now create the DSM */
    1538          88 :     InitializeParallelDSM(pcxt);
    1539             : 
    1540             :     /* If no DSM segment was available, back out (do serial build) */
    1541          88 :     if (pcxt->seg == NULL)
    1542             :     {
    1543           0 :         if (IsMVCCSnapshot(snapshot))
    1544           0 :             UnregisterSnapshot(snapshot);
    1545           0 :         DestroyParallelContext(pcxt);
    1546           0 :         ExitParallelMode();
    1547           0 :         return;
    1548             :     }
    1549             : 
    1550             :     /* Store shared build state, for which we reserved space */
    1551          88 :     btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    1552             :     /* Initialize immutable state */
    1553          88 :     btshared->heaprelid = RelationGetRelid(btspool->heap);
    1554          88 :     btshared->indexrelid = RelationGetRelid(btspool->index);
    1555          88 :     btshared->isunique = btspool->isunique;
    1556          88 :     btshared->isconcurrent = isconcurrent;
    1557          88 :     btshared->scantuplesortstates = scantuplesortstates;
    1558          88 :     ConditionVariableInit(&btshared->workersdonecv);
    1559          88 :     SpinLockInit(&btshared->mutex);
    1560             :     /* Initialize mutable state */
    1561          88 :     btshared->nparticipantsdone = 0;
    1562          88 :     btshared->reltuples = 0.0;
    1563          88 :     btshared->havedead = false;
    1564          88 :     btshared->indtuples = 0.0;
    1565          88 :     btshared->brokenhotchain = false;
    1566          88 :     table_parallelscan_initialize(btspool->heap,
    1567             :                                   ParallelTableScanFromBTShared(btshared),
    1568             :                                   snapshot);
    1569             : 
    1570             :     /*
    1571             :      * Store shared tuplesort-private state, for which we reserved space.
    1572             :      * Then, initialize opaque state using tuplesort routine.
    1573             :      */
    1574          88 :     sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1575          88 :     tuplesort_initialize_shared(sharedsort, scantuplesortstates,
    1576             :                                 pcxt->seg);
    1577             : 
    1578          88 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    1579          88 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
    1580             : 
    1581             :     /* Unique case requires a second spool, and associated shared state */
    1582          88 :     if (!btspool->isunique)
    1583          52 :         sharedsort2 = NULL;
    1584             :     else
    1585             :     {
    1586             :         /*
    1587             :          * Store additional shared tuplesort-private state, for which we
    1588             :          * reserved space.  Then, initialize opaque state using tuplesort
    1589             :          * routine.
    1590             :          */
    1591          36 :         sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1592          36 :         tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
    1593             :                                     pcxt->seg);
    1594             : 
    1595          36 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    1596             :     }
    1597             : 
    1598             :     /* Store query string for workers */
    1599          88 :     sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
    1600          88 :     memcpy(sharedquery, debug_query_string, querylen + 1);
    1601          88 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    1602             : 
    1603             :     /*
    1604             :      * Allocate space for each worker's WalUsage and BufferUsage; no need to
    1605             :      * initialize.
    1606             :      */
    1607          88 :     walusage = shm_toc_allocate(pcxt->toc,
    1608          88 :                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    1609          88 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    1610          88 :     bufferusage = shm_toc_allocate(pcxt->toc,
    1611          88 :                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1612          88 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
    1613             : 
    1614             :     /* Launch workers, saving status for leader/caller */
    1615          88 :     LaunchParallelWorkers(pcxt);
    1616          88 :     btleader->pcxt = pcxt;
    1617          88 :     btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    1618          88 :     if (leaderparticipates)
    1619          88 :         btleader->nparticipanttuplesorts++;
    1620          88 :     btleader->btshared = btshared;
    1621          88 :     btleader->sharedsort = sharedsort;
    1622          88 :     btleader->sharedsort2 = sharedsort2;
    1623          88 :     btleader->snapshot = snapshot;
    1624          88 :     btleader->walusage = walusage;
    1625          88 :     btleader->bufferusage = bufferusage;
    1626             : 
    1627             :     /* If no workers were successfully launched, back out (do serial build) */
    1628          88 :     if (pcxt->nworkers_launched == 0)
    1629             :     {
    1630           0 :         _bt_end_parallel(btleader);
    1631           0 :         return;
    1632             :     }
    1633             : 
    1634             :     /* Save leader state now that it's clear build will be parallel */
    1635          88 :     buildstate->btleader = btleader;
    1636             : 
    1637             :     /* Join heap scan ourselves */
    1638          88 :     if (leaderparticipates)
    1639          88 :         _bt_leader_participate_as_worker(buildstate);
    1640             : 
    1641             :     /*
    1642             :      * Caller needs to wait for all launched workers when we return.  Make
    1643             :      * sure that the failure-to-start case will not hang forever.
    1644             :      */
    1645          88 :     WaitForParallelWorkersToAttach(pcxt);
    1646             : }
    1647             : 
    1648             : /*
    1649             :  * Shut down workers, destroy parallel context, and end parallel mode.
    1650             :  */
    1651             : static void
    1652          88 : _bt_end_parallel(BTLeader *btleader)
    1653             : {
    1654             :     int         i;
    1655             : 
    1656             :     /* Shutdown worker processes */
    1657          88 :     WaitForParallelWorkersToFinish(btleader->pcxt);
    1658             : 
    1659             :     /*
     1660             :      * Next, accumulate WAL and buffer usage.  (This must wait for the
     1661             :      * workers to finish, or we might get incomplete data.)
    1662             :      */
    1663         176 :     for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
    1664          88 :         InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
    1665             : 
    1666             :     /* Free last reference to MVCC snapshot, if one was used */
    1667          88 :     if (IsMVCCSnapshot(btleader->snapshot))
    1668           0 :         UnregisterSnapshot(btleader->snapshot);
    1669          88 :     DestroyParallelContext(btleader->pcxt);
    1670          88 :     ExitParallelMode();
    1671          88 : }
    1672             : 
    1673             : /*
    1674             :  * Returns size of shared memory required to store state for a parallel
    1675             :  * btree index build based on the snapshot its parallel scan will use.
    1676             :  */
    1677             : static Size
    1678          88 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    1679             : {
    1680             :     /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    1681          88 :     return add_size(BUFFERALIGN(sizeof(BTShared)),
    1682             :                     table_parallelscan_estimate(heap, snapshot));
    1683             : }
    1684             : 
    1685             : /*
    1686             :  * Within leader, wait for end of heap scan.
    1687             :  *
    1688             :  * When called, parallel heap scan started by _bt_begin_parallel() will
    1689             :  * already be underway within worker processes (when leader participates
    1690             :  * as a worker, we should end up here just as workers are finishing).
    1691             :  *
     1692             :  * Fills in fields needed for ambuild statistics, and lets the caller set
     1693             :  * the field indicating that some worker encountered a broken HOT chain.
    1694             :  *
    1695             :  * Returns the total number of heap tuples scanned.
    1696             :  */
    1697             : static double
    1698          88 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
    1699             : {
    1700          88 :     BTShared   *btshared = buildstate->btleader->btshared;
    1701             :     int         nparticipanttuplesorts;
    1702             :     double      reltuples;
    1703             : 
    1704          88 :     nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    1705             :     for (;;)
    1706             :     {
    1707         224 :         SpinLockAcquire(&btshared->mutex);
    1708         224 :         if (btshared->nparticipantsdone == nparticipanttuplesorts)
    1709             :         {
    1710          88 :             buildstate->havedead = btshared->havedead;
    1711          88 :             buildstate->indtuples = btshared->indtuples;
    1712          88 :             *brokenhotchain = btshared->brokenhotchain;
    1713          88 :             reltuples = btshared->reltuples;
    1714          88 :             SpinLockRelease(&btshared->mutex);
    1715          88 :             break;
    1716             :         }
    1717         136 :         SpinLockRelease(&btshared->mutex);
    1718             : 
    1719         136 :         ConditionVariableSleep(&btshared->workersdonecv,
    1720             :                                WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    1721             :     }
    1722             : 
    1723          88 :     ConditionVariableCancelSleep();
    1724             : 
    1725          88 :     return reltuples;
    1726             : }
    1727             : 
    1728             : /*
    1729             :  * Within leader, participate as a parallel worker.
    1730             :  */
    1731             : static void
    1732          88 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
    1733             : {
    1734          88 :     BTLeader   *btleader = buildstate->btleader;
    1735             :     BTSpool    *leaderworker;
    1736             :     BTSpool    *leaderworker2;
    1737             :     int         sortmem;
    1738             : 
    1739             :     /* Allocate memory and initialize private spool */
    1740          88 :     leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
    1741          88 :     leaderworker->heap = buildstate->spool->heap;
    1742          88 :     leaderworker->index = buildstate->spool->index;
    1743          88 :     leaderworker->isunique = buildstate->spool->isunique;
    1744             : 
    1745             :     /* Initialize second spool, if required */
    1746          88 :     if (!btleader->btshared->isunique)
    1747          52 :         leaderworker2 = NULL;
    1748             :     else
    1749             :     {
    1750             :         /* Allocate memory for worker's own private secondary spool */
    1751          36 :         leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1752             : 
    1753             :         /* Initialize worker's own secondary spool */
    1754          36 :         leaderworker2->heap = leaderworker->heap;
    1755          36 :         leaderworker2->index = leaderworker->index;
    1756          36 :         leaderworker2->isunique = false;
    1757             :     }
    1758             : 
    1759             :     /*
     1760             :      * Might as well use a reliable figure when doling out maintenance_work_mem
     1761             :      * (when the requested number of workers was not launched, this will be
    1762             :      * somewhat higher than it is for other workers).
    1763             :      */
    1764          88 :     sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
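                      :     /*
                      :      * (Example: with maintenance_work_mem = 65536kB and three
                      :      * participants, each participant's tuplesort gets 21845kB.)
                      :      */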
    1765             : 
    1766             :     /* Perform work common to all participants */
    1767          88 :     _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
    1768             :                                btleader->sharedsort, btleader->sharedsort2,
    1769             :                                sortmem, true);
    1770             : 
    1771             : #ifdef BTREE_BUILD_STATS
    1772             :     if (log_btree_build_stats)
    1773             :     {
    1774             :         ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
    1775             :         ResetUsage();
    1776             :     }
    1777             : #endif                          /* BTREE_BUILD_STATS */
    1778          88 : }
    1779             : 
    1780             : /*
    1781             :  * Perform work within a launched parallel process.
    1782             :  */
    1783             : void
    1784          88 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
    1785             : {
    1786             :     char       *sharedquery;
    1787             :     BTSpool    *btspool;
    1788             :     BTSpool    *btspool2;
    1789             :     BTShared   *btshared;
    1790             :     Sharedsort *sharedsort;
    1791             :     Sharedsort *sharedsort2;
    1792             :     Relation    heapRel;
    1793             :     Relation    indexRel;
    1794             :     LOCKMODE    heapLockmode;
    1795             :     LOCKMODE    indexLockmode;
    1796             :     WalUsage   *walusage;
    1797             :     BufferUsage *bufferusage;
    1798             :     int         sortmem;
    1799             : 
    1800             : #ifdef BTREE_BUILD_STATS
    1801             :     if (log_btree_build_stats)
    1802             :         ResetUsage();
    1803             : #endif                          /* BTREE_BUILD_STATS */
    1804             : 
    1805             :     /* Set debug_query_string for individual workers first */
    1806          88 :     sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, false);
    1807          88 :     debug_query_string = sharedquery;
    1808             : 
    1809             :     /* Report the query string from leader */
    1810          88 :     pgstat_report_activity(STATE_RUNNING, debug_query_string);
    1811             : 
    1812             :     /* Look up nbtree shared state */
    1813          88 :     btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    1814             : 
    1815             :     /* Open relations using lock modes known to be obtained by index.c */
    1816          88 :     if (!btshared->isconcurrent)
    1817             :     {
    1818          88 :         heapLockmode = ShareLock;
    1819          88 :         indexLockmode = AccessExclusiveLock;
    1820             :     }
    1821             :     else
    1822             :     {
    1823           0 :         heapLockmode = ShareUpdateExclusiveLock;
    1824           0 :         indexLockmode = RowExclusiveLock;
    1825             :     }
    1826             : 
    1827             :     /* Open relations within worker */
    1828          88 :     heapRel = table_open(btshared->heaprelid, heapLockmode);
    1829          88 :     indexRel = index_open(btshared->indexrelid, indexLockmode);
    1830             : 
    1831             :     /* Initialize worker's own spool */
    1832          88 :     btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    1833          88 :     btspool->heap = heapRel;
    1834          88 :     btspool->index = indexRel;
    1835          88 :     btspool->isunique = btshared->isunique;
    1836             : 
    1837             :     /* Look up shared state private to tuplesort.c */
    1838          88 :     sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    1839          88 :     tuplesort_attach_shared(sharedsort, seg);
    1840          88 :     if (!btshared->isunique)
    1841             :     {
    1842          52 :         btspool2 = NULL;
    1843          52 :         sharedsort2 = NULL;
    1844             :     }
    1845             :     else
    1846             :     {
    1847             :         /* Allocate memory for worker's own private secondary spool */
    1848          36 :         btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1849             : 
    1850             :         /* Initialize worker's own secondary spool */
    1851          36 :         btspool2->heap = btspool->heap;
    1852          36 :         btspool2->index = btspool->index;
    1853          36 :         btspool2->isunique = false;
    1854             :         /* Look up shared state private to tuplesort.c */
    1855          36 :         sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
    1856          36 :         tuplesort_attach_shared(sharedsort2, seg);
    1857             :     }
    1858             : 
    1859             :     /* Prepare to track buffer usage during parallel execution */
    1860          88 :     InstrStartParallelQuery();
    1861             : 
    1862             :     /* Perform sorting of spool, and possibly a spool2 */
    1863          88 :     sortmem = maintenance_work_mem / btshared->scantuplesortstates;
    1864          88 :     _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
    1865             :                                sharedsort2, sortmem, false);
    1866             : 
    1867             :     /* Report WAL/buffer usage during parallel execution */
    1868          88 :     bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    1869          88 :     walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    1870          88 :     InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
    1871          88 :                           &walusage[ParallelWorkerNumber]);
    1872             : 
    1873             : #ifdef BTREE_BUILD_STATS
    1874             :     if (log_btree_build_stats)
    1875             :     {
    1876             :         ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
    1877             :         ResetUsage();
    1878             :     }
    1879             : #endif                          /* BTREE_BUILD_STATS */
    1880             : 
    1881          88 :     index_close(indexRel, indexLockmode);
    1882          88 :     table_close(heapRel, heapLockmode);
    1883          88 : }
    1884             : 
    1885             : /*
    1886             :  * Perform a worker's portion of a parallel sort.
    1887             :  *
     1888             :  * This generates a tuplesort for the passed btspool, and a second tuplesort
     1889             :  * state if a second btspool is needed (i.e. for unique index builds).  All
    1890             :  * other spool fields should already be set when this is called.
    1891             :  *
    1892             :  * sortmem is the amount of working memory to use within each worker,
     1893             :  * expressed in kilobytes.
    1894             :  *
    1895             :  * When this returns, workers are done, and need only release resources.
    1896             :  */
    1897             : static void
    1898         176 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
    1899             :                            BTShared *btshared, Sharedsort *sharedsort,
    1900             :                            Sharedsort *sharedsort2, int sortmem, bool progress)
    1901             : {
    1902             :     SortCoordinate coordinate;
    1903             :     BTBuildState buildstate;
    1904             :     TableScanDesc scan;
    1905             :     double      reltuples;
    1906             :     IndexInfo  *indexInfo;
    1907             : 
    1908             :     /* Initialize local tuplesort coordination state */
    1909         176 :     coordinate = palloc0(sizeof(SortCoordinateData));
    1910         176 :     coordinate->isWorker = true;
    1911         176 :     coordinate->nParticipants = -1;
    1912         176 :     coordinate->sharedsort = sharedsort;
    1913             : 
    1914             :     /* Begin "partial" tuplesort */
    1915         352 :     btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
    1916             :                                                      btspool->index,
    1917         176 :                                                      btspool->isunique,
    1918             :                                                      sortmem, coordinate,
    1919             :                                                      false);
    1920             : 
    1921             :     /*
    1922             :      * Just as in the serial case, there may be a second spool.  If so, a
    1923             :      * second, dedicated spool2 partial tuplesort is required.
    1924             :      */
    1925         176 :     if (btspool2)
    1926             :     {
    1927             :         SortCoordinate coordinate2;
    1928             : 
    1929             :         /*
    1930             :          * We expect that the second one (for dead tuples) won't get very
    1931             :          * full, so we give it only work_mem (unless sortmem is less for the
    1932             :          * worker).  Worker processes are generally permitted to allocate
    1933             :          * work_mem independently.
    1934             :          */
    1935          72 :         coordinate2 = palloc0(sizeof(SortCoordinateData));
    1936          72 :         coordinate2->isWorker = true;
    1937          72 :         coordinate2->nParticipants = -1;
    1938          72 :         coordinate2->sharedsort = sharedsort2;
    1939          72 :         btspool2->sortstate =
    1940          72 :             tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
    1941             :                                         Min(sortmem, work_mem), coordinate2,
    1942             :                                         false);
    1943             :     }
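/*
 * Worked example of the budget above (assuming the stock settings of
 * maintenance_work_mem = 64MB and work_mem = 4MB, and two participant
 * tuplesorts): sortmem is 65536 / 2 = 32768 kB per worker, so the main
 * spool sorts in 32768 kB while spool2 is capped at
 * Min(32768, 4096) = 4096 kB.
 */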
    1944             : 
    1945             :     /* Fill in buildstate for _bt_build_callback() */
    1946         176 :     buildstate.isunique = btshared->isunique;
    1947         176 :     buildstate.havedead = false;
    1948         176 :     buildstate.heap = btspool->heap;
    1949         176 :     buildstate.spool = btspool;
    1950         176 :     buildstate.spool2 = btspool2;
    1951         176 :     buildstate.indtuples = 0;
    1952         176 :     buildstate.btleader = NULL;
    1953             : 
    1954             :     /* Join parallel scan */
    1955         176 :     indexInfo = BuildIndexInfo(btspool->index);
    1956         176 :     indexInfo->ii_Concurrent = btshared->isconcurrent;
    1957         176 :     scan = table_beginscan_parallel(btspool->heap,
    1958             :                                     ParallelTableScanFromBTShared(btshared));
    1959         176 :     reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
    1960             :                                        true, progress, _bt_build_callback,
    1961             :                                        (void *) &buildstate, scan);
    1962             : 
    1963             :     /*
    1964             :      * Execute this worker's part of the sort.
    1965             :      *
    1966             :      * Unlike the leader and serial cases, we cannot avoid calling
    1967             :      * tuplesort_performsort() for spool2 if it ends up containing no dead
    1968             :      * tuples (this is disallowed for workers by tuplesort).
    1969             :      */
    1970         176 :     tuplesort_performsort(btspool->sortstate);
    1971         176 :     if (btspool2)
    1972          72 :         tuplesort_performsort(btspool2->sortstate);
    1973             : 
    1974             :     /*
    1975             :      * Done.  Record ambuild statistics, and whether we encountered a broken
    1976             :      * HOT chain.
    1977             :      */
    1978         176 :     SpinLockAcquire(&btshared->mutex);
    1979         176 :     btshared->nparticipantsdone++;
    1980         176 :     btshared->reltuples += reltuples;
    1981         176 :     if (buildstate.havedead)
    1982           0 :         btshared->havedead = true;
    1983         176 :     btshared->indtuples += buildstate.indtuples;
    1984         176 :     if (indexInfo->ii_BrokenHotChain)
    1985           0 :         btshared->brokenhotchain = true;
    1986         176 :     SpinLockRelease(&btshared->mutex);
    1987             : 
    1988             :     /* Notify leader */
    1989         176 :     ConditionVariableSignal(&btshared->workersdonecv);
    1990             : 
    1991             :     /* We can end tuplesorts immediately */
    1992         176 :     tuplesort_end(btspool->sortstate);
    1993         176 :     if (btspool2)
    1994          72 :         tuplesort_end(btspool2->sortstate);
    1995         176 : }
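/*
 * The ConditionVariableSignal() above wakes the leader, which cannot merge
 * the partial sorts until every participant has reported.  A condensed
 * sketch of the usual leader-side wait loop that consumes workersdonecv
 * (nparticipanttuplesorts stands for the expected participant count):
 */
for (;;)
{
    SpinLockAcquire(&btshared->mutex);
    if (btshared->nparticipantsdone == nparticipanttuplesorts)
    {
        SpinLockRelease(&btshared->mutex);
        break;                  /* all partial sorts are complete */
    }
    SpinLockRelease(&btshared->mutex);

    /* Sleep until some worker signals workersdonecv, then re-check */
    ConditionVariableSleep(&btshared->workersdonecv,
                           WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
}
ConditionVariableCancelSleep();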

Generated by: LCOV version 1.13