LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsort.c (source / functions)
Test: PostgreSQL 18devel
Date: 2025-04-01 16:15:31
Coverage:           Hit    Total    Coverage
  Lines:            535      552      96.9 %
  Functions:         22       22     100.0 %

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsort.c
       4             :  *      Build a btree from sorted input by loading leaf pages sequentially.
       5             :  *
       6             :  * NOTES
       7             :  *
       8             :  * We use tuplesort.c to sort the given index tuples into order.
       9             :  * Then we scan the index tuples in order and build the btree pages
      10             :  * for each level.  We load source tuples into leaf-level pages.
      11             :  * Whenever we fill a page at one level, we add a link to it to its
      12             :  * parent level (starting a new parent level if necessary).  When
      13             :  * done, we write out each final page on each level, adding it to
      14             :  * its parent level.  When we have only one page on a level, it must be
      15             :  * the root -- it can be attached to the btree metapage and we are done.
      16             :  *
      17             :  * It is not wise to pack the pages entirely full, since then *any*
      18             :  * insertion would cause a split (and not only of the leaf page; the need
      19             :  * for a split would cascade right up the tree).  The steady-state load
      20             :  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
      21             :  * pages to the user-controllable fill factor (default 90%) while upper pages
      22             :  * are always packed to 70%.  This gives us reasonable density (there aren't
      23             :  * many upper pages if the keys are reasonable-size) without risking a lot of
      24             :  * cascading splits during early insertions.
      25             :  *
      26             :  * We use the bulk smgr loading facility to bypass the buffer cache and
      27             :  * WAL-log the pages efficiently.
      28             :  *
      29             :  * This code isn't concerned about the FSM at all. The caller is responsible
      30             :  * for initializing that.
      31             :  *
      32             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      33             :  * Portions Copyright (c) 1994, Regents of the University of California
      34             :  *
      35             :  * IDENTIFICATION
      36             :  *    src/backend/access/nbtree/nbtsort.c
      37             :  *
      38             :  *-------------------------------------------------------------------------
      39             :  */
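
[Editor's note] To make the packing policy in the NOTES above concrete, here is a minimal standalone sketch (not part of nbtsort.c) of the "full" thresholds it implies, assuming the default 8 kB block size and the default leaf fillfactor of 90. The constants are redefined locally so the sketch compiles on its own; _bt_pagestate() below derives btps_full the same way, via BTGetTargetPageFreeSpace() for leaf pages and BTREE_NONLEAF_FILLFACTOR for upper pages.

    /* Hypothetical illustration only; constants mirror nbtree.h defaults. */
    #include <stdio.h>

    #define BLCKSZ                   8192   /* default PostgreSQL page size */
    #define BTREE_DEFAULT_FILLFACTOR   90   /* leaf pages, user-controllable */
    #define BTREE_NONLEAF_FILLFACTOR   70   /* upper pages, fixed */

    int
    main(void)
    {
        /* a page counts as "full" once free space drops below the threshold */
        int     leaf_full = BLCKSZ * (100 - BTREE_DEFAULT_FILLFACTOR) / 100;
        int     upper_full = BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100;

        printf("leaf threshold: %d bytes, upper threshold: %d bytes\n",
               leaf_full, upper_full);      /* 819 and 2457 bytes */
        return 0;
    }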
      40             : 
      41             : #include "postgres.h"
      42             : 
      43             : #include "access/nbtree.h"
      44             : #include "access/parallel.h"
      45             : #include "access/relscan.h"
      46             : #include "access/table.h"
      47             : #include "access/xact.h"
      48             : #include "catalog/index.h"
      49             : #include "commands/progress.h"
      50             : #include "executor/instrument.h"
      51             : #include "miscadmin.h"
      52             : #include "pgstat.h"
      53             : #include "storage/bulk_write.h"
      54             : #include "tcop/tcopprot.h"
      55             : #include "utils/rel.h"
      56             : #include "utils/sortsupport.h"
      57             : #include "utils/tuplesort.h"
      58             : 
      59             : 
      60             : /* Magic numbers for parallel state sharing */
      61             : #define PARALLEL_KEY_BTREE_SHARED       UINT64CONST(0xA000000000000001)
      62             : #define PARALLEL_KEY_TUPLESORT          UINT64CONST(0xA000000000000002)
      63             : #define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
      64             : #define PARALLEL_KEY_QUERY_TEXT         UINT64CONST(0xA000000000000004)
      65             : #define PARALLEL_KEY_WAL_USAGE          UINT64CONST(0xA000000000000005)
      66             : #define PARALLEL_KEY_BUFFER_USAGE       UINT64CONST(0xA000000000000006)
      67             : 
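[Editor's note] A hedged sketch of how keys like these are typically used with the shared-memory table-of-contents API (shm_toc) from the parallel infrastructure: the leader allocates a chunk in the DSM segment and registers it under a key, and each worker later looks the same key up. The helper names and the estbtshared size argument below are illustrative only, not the module's actual _bt_begin_parallel()/worker code.

    /* Sketch only: pairing a shm_toc allocation with its lookup by key. */
    static void
    sketch_leader_publish(ParallelContext *pcxt, Size estbtshared)
    {
        BTShared   *btshared;

        btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
        /* ... initialize *btshared here ... */
        shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    }

    static BTShared *
    sketch_worker_attach(shm_toc *toc)
    {
        /* false: raise an error if the key was never inserted */
        return (BTShared *) shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    }
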
      68             : /*
      69             :  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
      70             :  * parallel index builds.  This may be useful as a debugging aid.
      71             : #undef DISABLE_LEADER_PARTICIPATION
      72             :  */
      73             : 
      74             : /*
      75             :  * Status record for spooling/sorting phase.  (Note we may have two of
      76             :  * these due to the special requirements for uniqueness-checking with
      77             :  * dead tuples.)
      78             :  */
      79             : typedef struct BTSpool
      80             : {
      81             :     Tuplesortstate *sortstate;  /* state data for tuplesort.c */
      82             :     Relation    heap;
      83             :     Relation    index;
      84             :     bool        isunique;
      85             :     bool        nulls_not_distinct;
      86             : } BTSpool;
      87             : 
      88             : /*
      89             :  * Status for index builds performed in parallel.  This is allocated in a
      90             :  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
      91             :  * entry, private to tuplesort.c but allocated by this module on its behalf.
      92             :  */
      93             : typedef struct BTShared
      94             : {
      95             :     /*
      96             :      * These fields are not modified during the sort.  They primarily exist
      97             :      * for the benefit of worker processes that need to create BTSpool state
      98             :      * corresponding to that used by the leader.
      99             :      */
     100             :     Oid         heaprelid;
     101             :     Oid         indexrelid;
     102             :     bool        isunique;
     103             :     bool        nulls_not_distinct;
     104             :     bool        isconcurrent;
     105             :     int         scantuplesortstates;
     106             : 
     107             :     /* Query ID, for report in worker processes */
     108             :     uint64      queryid;
     109             : 
     110             :     /*
     111             :      * workersdonecv is used to monitor the progress of workers.  All parallel
     112             :      * participants must indicate that they are done before leader can use
     113             :      * mutable state that workers maintain during scan (and before leader can
     114             :      * proceed to tuplesort_performsort()).
     115             :      */
     116             :     ConditionVariable workersdonecv;
     117             : 
     118             :     /*
     119             :      * mutex protects all fields before heapdesc.
     120             :      *
     121             :      * These fields contain status information of interest to B-Tree index
     122             :      * builds that must work just the same when an index is built in parallel.
     123             :      */
     124             :     slock_t     mutex;
     125             : 
     126             :     /*
     127             :      * Mutable state that is maintained by workers, and reported back to
     128             :      * leader at end of parallel scan.
     129             :      *
     130             :      * nparticipantsdone is number of worker processes finished.
     131             :      *
     132             :      * reltuples is the total number of input heap tuples.
     133             :      *
     134             :      * havedead indicates if RECENTLY_DEAD tuples were encountered during
     135             :      * build.
     136             :      *
     137             :      * indtuples is the total number of tuples that made it into the index.
     138             :      *
     139             :      * brokenhotchain indicates if any worker detected a broken HOT chain
     140             :      * during build.
     141             :      */
     142             :     int         nparticipantsdone;
     143             :     double      reltuples;
     144             :     bool        havedead;
     145             :     double      indtuples;
     146             :     bool        brokenhotchain;
     147             : 
     148             :     /*
     149             :      * ParallelTableScanDescData data follows. Can't directly embed here, as
     150             :      * implementations of the parallel table scan desc interface might need
     151             :      * stronger alignment.
     152             :      */
     153             : } BTShared;
     154             : 
     155             : /*
     156             :  * Return pointer to a BTShared's parallel table scan.
     157             :  *
     158             :  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
     159             :  * MAXALIGN.
     160             :  */
     161             : #define ParallelTableScanFromBTShared(shared) \
     162             :     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
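
[Editor's note] A hedged sketch of the single-chunk layout this macro assumes: the shared allocation holds BTShared first and the parallel table scan descriptor at a buffer-aligned offset right after it, which is presumably what _bt_parallel_estimate_shared() sizes. The helper name below is illustrative, not the actual function.

    /* Sketch: size of one chunk holding BTShared plus the scan descriptor. */
    static Size
    sketch_btshared_chunk_size(Relation heap, Snapshot snapshot)
    {
        /* BTShared first; scan descriptor starts at a BUFFERALIGN'd offset */
        return add_size(BUFFERALIGN(sizeof(BTShared)),
                        table_parallelscan_estimate(heap, snapshot));
    }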
     163             : 
     164             : /*
     165             :  * Status for leader in parallel index build.
     166             :  */
     167             : typedef struct BTLeader
     168             : {
     169             :     /* parallel context itself */
     170             :     ParallelContext *pcxt;
     171             : 
     172             :     /*
     173             :      * nparticipanttuplesorts is the exact number of worker processes
     174             :      * successfully launched, plus one leader process if it participates as a
     175             :      * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     176             :      * participating as a worker).
     177             :      */
     178             :     int         nparticipanttuplesorts;
     179             : 
     180             :     /*
     181             :      * Leader process convenience pointers to shared state (leader avoids TOC
     182             :      * lookups).
     183             :      *
     184             :      * btshared is the shared state for entire build.  sharedsort is the
     185             :      * shared, tuplesort-managed state passed to each process tuplesort.
     186             :      * sharedsort2 is the corresponding btspool2 shared state, used only when
     187             :      * building unique indexes.  snapshot is the snapshot used by the scan iff
     188             :      * an MVCC snapshot is required.
     189             :      */
     190             :     BTShared   *btshared;
     191             :     Sharedsort *sharedsort;
     192             :     Sharedsort *sharedsort2;
     193             :     Snapshot    snapshot;
     194             :     WalUsage   *walusage;
     195             :     BufferUsage *bufferusage;
     196             : } BTLeader;
     197             : 
     198             : /*
     199             :  * Working state for btbuild and its callback.
     200             :  *
     201             :  * When parallel CREATE INDEX is used, there is a BTBuildState for each
     202             :  * participant.
     203             :  */
     204             : typedef struct BTBuildState
     205             : {
     206             :     bool        isunique;
     207             :     bool        nulls_not_distinct;
     208             :     bool        havedead;
     209             :     Relation    heap;
     210             :     BTSpool    *spool;
     211             : 
     212             :     /*
     213             :      * spool2 is needed only when the index is a unique index. Dead tuples are
     214             :      * put into spool2 instead of spool in order to avoid uniqueness check.
     215             :      */
     216             :     BTSpool    *spool2;
     217             :     double      indtuples;
     218             : 
     219             :     /*
     220             :      * btleader is only present when a parallel index build is performed, and
     221             :      * only in the leader process. (Actually, only the leader has a
     222             :      * BTBuildState.  Workers have their own spool and spool2, though.)
     223             :      */
     224             :     BTLeader   *btleader;
     225             : } BTBuildState;
     226             : 
     227             : /*
     228             :  * Status record for a btree page being built.  We have one of these
     229             :  * for each active tree level.
     230             :  */
     231             : typedef struct BTPageState
     232             : {
     233             :     BulkWriteBuffer btps_buf;   /* workspace for page building */
     234             :     BlockNumber btps_blkno;     /* block # to write this page at */
     235             :     IndexTuple  btps_lowkey;    /* page's strict lower bound pivot tuple */
     236             :     OffsetNumber btps_lastoff;  /* last item offset loaded */
     237             :     Size        btps_lastextra; /* last item's extra posting list space */
     238             :     uint32      btps_level;     /* tree level (0 = leaf) */
     239             :     Size        btps_full;      /* "full" if less than this much free space */
     240             :     struct BTPageState *btps_next;  /* link to parent level, if any */
     241             : } BTPageState;
     242             : 
     243             : /*
     244             :  * Overall status record for index writing phase.
     245             :  */
     246             : typedef struct BTWriteState
     247             : {
     248             :     Relation    heap;
     249             :     Relation    index;
     250             :     BulkWriteState *bulkstate;
     251             :     BTScanInsert inskey;        /* generic insertion scankey */
     252             :     BlockNumber btws_pages_alloced; /* # pages allocated */
     253             : } BTWriteState;
     254             : 
     255             : 
     256             : static double _bt_spools_heapscan(Relation heap, Relation index,
     257             :                                   BTBuildState *buildstate, IndexInfo *indexInfo);
     258             : static void _bt_spooldestroy(BTSpool *btspool);
     259             : static void _bt_spool(BTSpool *btspool, ItemPointer self,
     260             :                       Datum *values, bool *isnull);
     261             : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
     262             : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
     263             :                                bool *isnull, bool tupleIsAlive, void *state);
     264             : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
     265             : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
     266             : static void _bt_slideleft(Page rightmostpage);
     267             : static void _bt_sortaddtup(Page page, Size itemsize,
     268             :                            IndexTuple itup, OffsetNumber itup_off,
     269             :                            bool newfirstdataitem);
     270             : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
     271             :                          IndexTuple itup, Size truncextra);
     272             : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
     273             :                                           BTPageState *state,
     274             :                                           BTDedupState dstate);
     275             : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
     276             : static void _bt_load(BTWriteState *wstate,
     277             :                      BTSpool *btspool, BTSpool *btspool2);
     278             : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
     279             :                                int request);
     280             : static void _bt_end_parallel(BTLeader *btleader);
     281             : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
     282             : static double _bt_parallel_heapscan(BTBuildState *buildstate,
     283             :                                     bool *brokenhotchain);
     284             : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
     285             : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
     286             :                                        BTShared *btshared, Sharedsort *sharedsort,
     287             :                                        Sharedsort *sharedsort2, int sortmem,
     288             :                                        bool progress);
     289             : 
     290             : 
     291             : /*
     292             :  *  btbuild() -- build a new btree index.
     293             :  */
     294             : IndexBuildResult *
     295       48444 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     296             : {
     297             :     IndexBuildResult *result;
     298             :     BTBuildState buildstate;
     299             :     double      reltuples;
     300             : 
     301             : #ifdef BTREE_BUILD_STATS
     302             :     if (log_btree_build_stats)
     303             :         ResetUsage();
     304             : #endif                          /* BTREE_BUILD_STATS */
     305             : 
     306       48444 :     buildstate.isunique = indexInfo->ii_Unique;
     307       48444 :     buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     308       48444 :     buildstate.havedead = false;
     309       48444 :     buildstate.heap = heap;
     310       48444 :     buildstate.spool = NULL;
     311       48444 :     buildstate.spool2 = NULL;
     312       48444 :     buildstate.indtuples = 0;
     313       48444 :     buildstate.btleader = NULL;
     314             : 
     315             :     /*
     316             :      * We expect to be called exactly once for any index relation. If that's
     317             :      * not the case, big trouble's what we have.
     318             :      */
     319       48444 :     if (RelationGetNumberOfBlocks(index) != 0)
     320           0 :         elog(ERROR, "index \"%s\" already contains data",
     321             :              RelationGetRelationName(index));
     322             : 
     323       48444 :     reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
     324             : 
     325             :     /*
     326             :      * Finish the build by (1) completing the sort of the spool file, (2)
     327             :      * inserting the sorted tuples into btree pages and (3) building the upper
     328             :      * levels.  Finally, it may also be necessary to end use of parallelism.
     329             :      */
     330       48432 :     _bt_leafbuild(buildstate.spool, buildstate.spool2);
     331       48348 :     _bt_spooldestroy(buildstate.spool);
     332       48348 :     if (buildstate.spool2)
     333          22 :         _bt_spooldestroy(buildstate.spool2);
     334       48348 :     if (buildstate.btleader)
     335         148 :         _bt_end_parallel(buildstate.btleader);
     336             : 
     337       48348 :     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     338             : 
     339       48348 :     result->heap_tuples = reltuples;
     340       48348 :     result->index_tuples = buildstate.indtuples;
     341             : 
     342             : #ifdef BTREE_BUILD_STATS
     343             :     if (log_btree_build_stats)
     344             :     {
     345             :         ShowUsage("BTREE BUILD STATS");
     346             :         ResetUsage();
     347             :     }
     348             : #endif                          /* BTREE_BUILD_STATS */
     349             : 
     350       48348 :     return result;
     351             : }
     352             : 
     353             : /*
     354             :  * Create and initialize one or two spool structures, and save them in caller's
     355             :  * buildstate argument.  May also fill-in fields within indexInfo used by index
     356             :  * builds.
     357             :  *
     358             :  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
     359             :  * routine encapsulates all aspects of managing parallelism.  Caller need only
     360             :  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
     361             :  *
     362             :  * Returns the total number of heap tuples scanned.
     363             :  */
     364             : static double
     365       48444 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
     366             :                     IndexInfo *indexInfo)
     367             : {
     368       48444 :     BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
     369       48444 :     SortCoordinate coordinate = NULL;
     370       48444 :     double      reltuples = 0;
     371             : 
     372             :     /*
     373             :      * We size the sort area as maintenance_work_mem rather than work_mem to
     374             :      * speed index creation.  This should be OK since a single backend can't
     375             :      * run multiple index creations in parallel (see also: notes on
     376             :      * parallelism and maintenance_work_mem below).
     377             :      */
     378       48444 :     btspool->heap = heap;
     379       48444 :     btspool->index = index;
     380       48444 :     btspool->isunique = indexInfo->ii_Unique;
     381       48444 :     btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     382             : 
     383             :     /* Save as primary spool */
     384       48444 :     buildstate->spool = btspool;
     385             : 
     386             :     /* Report table scan phase started */
     387       48444 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     388             :                                  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
     389             : 
     390             :     /* Attempt to launch parallel worker scan when required */
     391       48444 :     if (indexInfo->ii_ParallelWorkers > 0)
     392         148 :         _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
     393             :                            indexInfo->ii_ParallelWorkers);
     394             : 
     395             :     /*
     396             :      * If parallel build requested and at least one worker process was
     397             :      * successfully launched, set up coordination state
     398             :      */
     399       48444 :     if (buildstate->btleader)
     400             :     {
     401         148 :         coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     402         148 :         coordinate->isWorker = false;
     403         148 :         coordinate->nParticipants =
     404         148 :             buildstate->btleader->nparticipanttuplesorts;
     405         148 :         coordinate->sharedsort = buildstate->btleader->sharedsort;
     406             :     }
     407             : 
     408             :     /*
     409             :      * Begin serial/leader tuplesort.
     410             :      *
     411             :      * In cases where parallelism is involved, the leader receives the same
     412             :      * share of maintenance_work_mem as a serial sort (it is generally treated
     413             :      * in the same way as a serial sort once we return).  Parallel worker
     414             :      * Tuplesortstates will have received only a fraction of
     415             :      * maintenance_work_mem, though.
     416             :      *
     417             :      * We rely on the lifetime of the Leader Tuplesortstate almost not
     418             :      * overlapping with any worker Tuplesortstate's lifetime.  There may be
     419             :      * some small overlap, but that's okay because we rely on leader
     420             :      * Tuplesortstate only allocating a small, fixed amount of memory here.
     421             :      * When its tuplesort_performsort() is called (by our caller), and
     422             :      * significant amounts of memory are likely to be used, all workers must
     423             :      * have already freed almost all memory held by their Tuplesortstates
     424             :      * (they are about to go away completely, too).  The overall effect is
     425             :      * that maintenance_work_mem always represents an absolute high watermark
     426             :      * on the amount of memory used by a CREATE INDEX operation, regardless of
     427             :      * the use of parallelism or any other factor.
     428             :      */
     429       96888 :     buildstate->spool->sortstate =
     430       48444 :         tuplesort_begin_index_btree(heap, index, buildstate->isunique,
     431       48444 :                                     buildstate->nulls_not_distinct,
     432             :                                     maintenance_work_mem, coordinate,
     433             :                                     TUPLESORT_NONE);
     434             : 
     435             :     /*
     436             :      * If building a unique index, put dead tuples in a second spool to keep
     437             :      * them out of the uniqueness check.  We expect that the second spool (for
     438             :      * dead tuples) won't get very full, so we give it only work_mem.
     439             :      */
     440       48444 :     if (indexInfo->ii_Unique)
     441             :     {
     442       39534 :         BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
     443       39534 :         SortCoordinate coordinate2 = NULL;
     444             : 
     445             :         /* Initialize secondary spool */
     446       39534 :         btspool2->heap = heap;
     447       39534 :         btspool2->index = index;
     448       39534 :         btspool2->isunique = false;
     449             :         /* Save as secondary spool */
     450       39534 :         buildstate->spool2 = btspool2;
     451             : 
     452       39534 :         if (buildstate->btleader)
     453             :         {
     454             :             /*
     455             :              * Set up non-private state that is passed to
     456             :              * tuplesort_begin_index_btree() about the basic high level
     457             :              * coordination of a parallel sort.
     458             :              */
     459          64 :             coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     460          64 :             coordinate2->isWorker = false;
     461          64 :             coordinate2->nParticipants =
     462          64 :                 buildstate->btleader->nparticipanttuplesorts;
     463          64 :             coordinate2->sharedsort = buildstate->btleader->sharedsort2;
     464             :         }
     465             : 
     466             :         /*
     467             :          * We expect that the second one (for dead tuples) won't get very
     468             :          * full, so we give it only work_mem
     469             :          */
     470       39534 :         buildstate->spool2->sortstate =
     471       39534 :             tuplesort_begin_index_btree(heap, index, false, false, work_mem,
     472             :                                         coordinate2, TUPLESORT_NONE);
     473             :     }
     474             : 
     475             :     /* Fill spool using either serial or parallel heap scan */
     476       48444 :     if (!buildstate->btleader)
     477       48296 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     478             :                                            _bt_build_callback, buildstate,
     479             :                                            NULL);
     480             :     else
     481         148 :         reltuples = _bt_parallel_heapscan(buildstate,
     482             :                                           &indexInfo->ii_BrokenHotChain);
     483             : 
     484             :     /*
     485             :      * Set the progress target for the next phase.  Reset the block number
     486             :      * values set by table_index_build_scan
     487             :      */
     488             :     {
     489       48432 :         const int   progress_index[] = {
     490             :             PROGRESS_CREATEIDX_TUPLES_TOTAL,
     491             :             PROGRESS_SCAN_BLOCKS_TOTAL,
     492             :             PROGRESS_SCAN_BLOCKS_DONE
     493             :         };
     494       48432 :         const int64 progress_vals[] = {
     495       48432 :             buildstate->indtuples,
     496             :             0, 0
     497             :         };
     498             : 
     499       48432 :         pgstat_progress_update_multi_param(3, progress_index, progress_vals);
     500             :     }
     501             : 
     502             :     /* okay, all heap tuples are spooled */
     503       48432 :     if (buildstate->spool2 && !buildstate->havedead)
     504             :     {
     505             :         /* spool2 turns out to be unnecessary */
     506       39512 :         _bt_spooldestroy(buildstate->spool2);
     507       39512 :         buildstate->spool2 = NULL;
     508             :     }
     509             : 
     510       48432 :     return reltuples;
     511             : }
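
[Editor's note] To illustrate the memory policy described above the tuplesort_begin_index_btree() call in this function: in a parallel build each participant's scan-phase sort presumably receives an even fraction of maintenance_work_mem, while the leader's merging Tuplesortstate created here keeps the full budget, which it only uses in earnest after the workers have released theirs. A hypothetical standalone arithmetic sketch; the numbers are examples, and the even split is an assumption, not code quoted from this file.

    /* Hypothetical illustration of the maintenance_work_mem high watermark. */
    #include <stdio.h>

    int
    main(void)
    {
        int     maintenance_work_mem = 65536;   /* kB, i.e. 64 MB */
        int     nparticipanttuplesorts = 3;     /* 2 workers + leader-as-worker */

        /* each scan-phase participant sort gets only a fraction of the budget */
        int     worker_sortmem = maintenance_work_mem / nparticipanttuplesorts;

        printf("per-participant sortmem: %d kB, leader merge sort: %d kB\n",
               worker_sortmem, maintenance_work_mem);   /* 21845 kB and 65536 kB */
        return 0;
    }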
     512             : 
     513             : /*
     514             :  * clean up a spool structure and its substructures.
     515             :  */
     516             : static void
     517       87882 : _bt_spooldestroy(BTSpool *btspool)
     518             : {
     519       87882 :     tuplesort_end(btspool->sortstate);
     520       87882 :     pfree(btspool);
     521       87882 : }
     522             : 
     523             : /*
     524             :  * spool an index entry into the sort file.
     525             :  */
     526             : static void
     527    12512678 : _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
     528             : {
     529    12512678 :     tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
     530             :                                   self, values, isnull);
     531    12512678 : }
     532             : 
     533             : /*
     534             :  * given a spool loaded by successive calls to _bt_spool,
     535             :  * create an entire btree.
     536             :  */
     537             : static void
     538       48432 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     539             : {
     540             :     BTWriteState wstate;
     541             : 
     542             : #ifdef BTREE_BUILD_STATS
     543             :     if (log_btree_build_stats)
     544             :     {
     545             :         ShowUsage("BTREE BUILD (Spool) STATISTICS");
     546             :         ResetUsage();
     547             :     }
     548             : #endif                          /* BTREE_BUILD_STATS */
     549             : 
     550             :     /* Execute the sort */
     551       48432 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     552             :                                  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
     553       48432 :     tuplesort_performsort(btspool->sortstate);
     554       48348 :     if (btspool2)
     555             :     {
     556          22 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     557             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_2);
     558          22 :         tuplesort_performsort(btspool2->sortstate);
     559             :     }
     560             : 
     561       48348 :     wstate.heap = btspool->heap;
     562       48348 :     wstate.index = btspool->index;
     563       48348 :     wstate.inskey = _bt_mkscankey(wstate.index, NULL);
     564             :     /* _bt_mkscankey() won't set allequalimage without metapage */
     565       48348 :     wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
     566             : 
     567             :     /* reserve the metapage */
     568       48348 :     wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
     569             : 
     570       48348 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     571             :                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
     572       48348 :     _bt_load(&wstate, btspool, btspool2);
     573       48348 : }
     574             : 
     575             : /*
     576             :  * Per-tuple callback for table_index_build_scan
     577             :  */
     578             : static void
     579    12512678 : _bt_build_callback(Relation index,
     580             :                    ItemPointer tid,
     581             :                    Datum *values,
     582             :                    bool *isnull,
     583             :                    bool tupleIsAlive,
     584             :                    void *state)
     585             : {
     586    12512678 :     BTBuildState *buildstate = (BTBuildState *) state;
     587             : 
     588             :     /*
     589             :      * insert the index tuple into the appropriate spool file for subsequent
     590             :      * processing
     591             :      */
     592    12512678 :     if (tupleIsAlive || buildstate->spool2 == NULL)
     593    12512176 :         _bt_spool(buildstate->spool, tid, values, isnull);
     594             :     else
     595             :     {
     596             :         /* dead tuples are put into spool2 */
     597         502 :         buildstate->havedead = true;
     598         502 :         _bt_spool(buildstate->spool2, tid, values, isnull);
     599             :     }
     600             : 
     601    12512678 :     buildstate->indtuples += 1;
     602    12512678 : }
     603             : 
     604             : /*
     605             :  * allocate workspace for a new, clean btree page, not linked to any siblings.
     606             :  */
     607             : static BulkWriteBuffer
     608       51382 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
     609             : {
     610             :     BulkWriteBuffer buf;
     611             :     Page        page;
     612             :     BTPageOpaque opaque;
     613             : 
     614       51382 :     buf = smgr_bulk_get_buf(wstate->bulkstate);
     615       51382 :     page = (Page) buf;
     616             : 
     617             :     /* Zero the page and set up standard page header info */
     618       51382 :     _bt_pageinit(page, BLCKSZ);
     619             : 
     620             :     /* Initialize BT opaque state */
     621       51382 :     opaque = BTPageGetOpaque(page);
     622       51382 :     opaque->btpo_prev = opaque->btpo_next = P_NONE;
     623       51382 :     opaque->btpo_level = level;
     624       51382 :     opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     625       51382 :     opaque->btpo_cycleid = 0;
     626             : 
     627             :     /* Make the P_HIKEY line pointer appear allocated */
     628       51382 :     ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
     629             : 
     630       51382 :     return buf;
     631             : }
     632             : 
     633             : /*
     634             :  * emit a completed btree page, and release the working storage.
     635             :  */
     636             : static void
     637       99730 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
     638             : {
     639       99730 :     smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
     640             :     /* smgr_bulk_write took ownership of 'buf' */
     641       99730 : }
     642             : 
     643             : /*
     644             :  * allocate and initialize a new BTPageState.  the returned structure
     645             :  * is suitable for immediate use by _bt_buildadd.
     646             :  */
     647             : static BTPageState *
     648       12114 : _bt_pagestate(BTWriteState *wstate, uint32 level)
     649             : {
     650       12114 :     BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
     651             : 
     652             :     /* create initial page for level */
     653       12114 :     state->btps_buf = _bt_blnewpage(wstate, level);
     654             : 
     655             :     /* and assign it a page position */
     656       12114 :     state->btps_blkno = wstate->btws_pages_alloced++;
     657             : 
     658       12114 :     state->btps_lowkey = NULL;
     659             :     /* initialize lastoff so first item goes into P_FIRSTKEY */
     660       12114 :     state->btps_lastoff = P_HIKEY;
     661       12114 :     state->btps_lastextra = 0;
     662       12114 :     state->btps_level = level;
     663             :     /* set "full" threshold based on level.  See notes at head of file. */
     664       12114 :     if (level > 0)
     665        2552 :         state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
     666             :     else
     667        9562 :         state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
     668             : 
     669             :     /* no parent level, yet */
     670       12114 :     state->btps_next = NULL;
     671             : 
     672       12114 :     return state;
     673             : }
     674             : 
     675             : /*
     676             :  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
     677             :  * P_HIKEY, overwriting P_HIKEY).
     678             :  *
     679             :  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
     680             :  * rightmost page on its level is not supposed to get a high key.  Now that
     681             :  * it's clear that this page is a rightmost page, remove the unneeded empty
     682             :  * P_HIKEY line pointer space.
     683             :  */
     684             : static void
     685       12114 : _bt_slideleft(Page rightmostpage)
     686             : {
     687             :     OffsetNumber off;
     688             :     OffsetNumber maxoff;
     689             :     ItemId      previi;
     690             : 
     691       12114 :     maxoff = PageGetMaxOffsetNumber(rightmostpage);
     692             :     Assert(maxoff >= P_FIRSTKEY);
     693       12114 :     previi = PageGetItemId(rightmostpage, P_HIKEY);
     694      702310 :     for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
     695             :     {
     696      690196 :         ItemId      thisii = PageGetItemId(rightmostpage, off);
     697             : 
     698      690196 :         *previi = *thisii;
     699      690196 :         previi = thisii;
     700             :     }
     701       12114 :     ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
     702       12114 : }
     703             : 
     704             : /*
     705             :  * Add an item to a page being built.
     706             :  *
     707             :  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
     708             :  * raises an error directly.
     709             :  *
     710             :  * Note that our nbtsort.c caller does not know yet if the page will be
     711             :  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
     712             :  * caller.  Page that turns out to be the rightmost on its level is fixed by
     713             :  * calling _bt_slideleft().
     714             :  */
     715             : static void
     716    11249406 : _bt_sortaddtup(Page page,
     717             :                Size itemsize,
     718             :                IndexTuple itup,
     719             :                OffsetNumber itup_off,
     720             :                bool newfirstdataitem)
     721             : {
     722             :     IndexTupleData trunctuple;
     723             : 
     724    11249406 :     if (newfirstdataitem)
     725             :     {
     726        2676 :         trunctuple = *itup;
     727        2676 :         trunctuple.t_info = sizeof(IndexTupleData);
     728        2676 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
     729        2676 :         itup = &trunctuple;
     730        2676 :         itemsize = sizeof(IndexTupleData);
     731             :     }
     732             : 
     733    11249406 :     if (PageAddItem(page, (Item) itup, itemsize, itup_off,
     734             :                     false, false) == InvalidOffsetNumber)
     735           0 :         elog(ERROR, "failed to add item to the index page");
     736    11249406 : }
     737             : 
     738             : /*----------
     739             :  * Add an item to a disk page from the sort output (or add a posting list
     740             :  * item formed from the sort output).
     741             :  *
     742             :  * We must be careful to observe the page layout conventions of nbtsearch.c:
     743             :  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
     744             :  * - on non-leaf pages, the key portion of the first item need not be
     745             :  *   stored, we should store only the link.
     746             :  *
     747             :  * A leaf page being built looks like:
     748             :  *
     749             :  * +----------------+---------------------------------+
     750             :  * | PageHeaderData | linp0 linp1 linp2 ...           |
     751             :  * +-----------+----+---------------------------------+
     752             :  * | ... linpN |                                      |
     753             :  * +-----------+--------------------------------------+
     754             :  * |     ^ last                                       |
     755             :  * |                                                  |
     756             :  * +-------------+------------------------------------+
     757             :  * |             | itemN ...                          |
     758             :  * +-------------+------------------+-----------------+
     759             :  * |          ... item3 item2 item1 | "special space" |
     760             :  * +--------------------------------+-----------------+
     761             :  *
     762             :  * Contrast this with the diagram in bufpage.h; note the mismatch
     763             :  * between linps and items.  This is because we reserve linp0 as a
     764             :  * placeholder for the pointer to the "high key" item; when we have
     765             :  * filled up the page, we will set linp0 to point to itemN and clear
     766             :  * linpN.  On the other hand, if we find this is the last (rightmost)
     767             :  * page, we leave the items alone and slide the linp array over.  If
     768             :  * the high key is to be truncated, offset 1 is deleted, and we insert
     769             :  * the truncated high key at offset 1.
     770             :  *
     771             :  * 'last' pointer indicates the last offset added to the page.
     772             :  *
     773             :  * 'truncextra' is the size of the posting list in itup, if any.  This
     774             :  * information is stashed for the next call here, when we may benefit
     775             :  * from considering the impact of truncating away the posting list on
     776             :  * the page before deciding to finish the page off.  Posting lists are
     777             :  * often relatively large, so it is worth going to the trouble of
     778             :  * accounting for the saving from truncating away the posting list of
     779             :  * the tuple that becomes the high key (that may be the only way to
     780             :  * get close to target free space on the page).  Note that this is
     781             :  * only used for the soft fillfactor-wise limit, not the critical hard
     782             :  * limit.
     783             :  *----------
     784             :  */
     785             : static void
     786    11210138 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
     787             :              Size truncextra)
     788             : {
     789             :     BulkWriteBuffer nbuf;
     790             :     Page        npage;
     791             :     BlockNumber nblkno;
     792             :     OffsetNumber last_off;
     793             :     Size        last_truncextra;
     794             :     Size        pgspc;
     795             :     Size        itupsz;
     796             :     bool        isleaf;
     797             : 
     798             :     /*
     799             :      * This is a handy place to check for cancel interrupts during the btree
     800             :      * load phase of index creation.
     801             :      */
     802    11210138 :     CHECK_FOR_INTERRUPTS();
     803             : 
     804    11210138 :     nbuf = state->btps_buf;
     805    11210138 :     npage = (Page) nbuf;
     806    11210138 :     nblkno = state->btps_blkno;
     807    11210138 :     last_off = state->btps_lastoff;
     808    11210138 :     last_truncextra = state->btps_lastextra;
     809    11210138 :     state->btps_lastextra = truncextra;
     810             : 
     811    11210138 :     pgspc = PageGetFreeSpace(npage);
     812    11210138 :     itupsz = IndexTupleSize(itup);
     813    11210138 :     itupsz = MAXALIGN(itupsz);
     814             :     /* Leaf case has slightly different rules due to suffix truncation */
     815    11210138 :     isleaf = (state->btps_level == 0);
     816             : 
     817             :     /*
     818             :      * Check whether the new item can fit on a btree page on current level at
     819             :      * all.
     820             :      *
     821             :      * Every newly built index will treat heap TID as part of the keyspace,
     822             :      * which imposes the requirement that new high keys must occasionally have
     823             :      * a heap TID appended within _bt_truncate().  That may leave a new pivot
     824             :      * tuple one or two MAXALIGN() quantums larger than the original
     825             :      * firstright tuple it's derived from.  v4 deals with the problem by
     826             :      * decreasing the limit on the size of tuples inserted on the leaf level
     827             :      * by the same small amount.  Enforce the new v4+ limit on the leaf level,
     828             :      * and the old limit on internal levels, since pivot tuples may need to
     829             :      * make use of the reserved space.  This should never fail on internal
     830             :      * pages.
     831             :      */
     832    11210138 :     if (unlikely(itupsz > BTMaxItemSize))
     833         264 :         _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
     834             :                              itup);
     835             : 
     836             :     /*
     837             :      * Check to see if current page will fit new item, with space left over to
     838             :      * append a heap TID during suffix truncation when page is a leaf page.
     839             :      *
     840             :      * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
     841             :      * high key with heap TID when finishing off a leaf page, since we rely on
     842             :      * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
     843             :      * internal pages we can always fit 3 pivot tuples with larger internal
     844             :      * page tuple limit (includes page high key).
     845             :      *
     846             :      * Most of the time, a page is only "full" in the sense that the soft
     847             :      * fillfactor-wise limit has been exceeded.  However, we must always leave
     848             :      * at least two items plus a high key on each page before starting a new
     849             :      * page.  Disregard fillfactor and insert on "full" current page if we
     850             :      * don't have the minimum number of items yet.  (Note that we deliberately
     851             :      * assume that suffix truncation neither enlarges nor shrinks new high key
     852             :      * when applying soft limit, except when last tuple has a posting list.)
     853             :      */
     854             :     Assert(last_truncextra == 0 || isleaf);
     855    11210138 :     if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
     856    11208982 :         (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
     857             :     {
     858             :         /*
     859             :          * Finish off the page and write it out.
     860             :          */
     861       39268 :         BulkWriteBuffer obuf = nbuf;
     862       39268 :         Page        opage = npage;
     863       39268 :         BlockNumber oblkno = nblkno;
     864             :         ItemId      ii;
     865             :         ItemId      hii;
     866             :         IndexTuple  oitup;
     867             : 
     868             :         /* Create new page of same level */
     869       39268 :         nbuf = _bt_blnewpage(wstate, state->btps_level);
     870       39268 :         npage = (Page) nbuf;
     871             : 
     872             :         /* and assign it a page position */
     873       39268 :         nblkno = wstate->btws_pages_alloced++;
     874             : 
     875             :         /*
     876             :          * We copy the last item on the page into the new page, and then
     877             :          * rearrange the old page so that the 'last item' becomes its high key
     878             :          * rather than a true data item.  There had better be at least two
     879             :          * items on the page already, else the page would be empty of useful
     880             :          * data.
     881             :          */
     882             :         Assert(last_off > P_FIRSTKEY);
     883       39268 :         ii = PageGetItemId(opage, last_off);
     884       39268 :         oitup = (IndexTuple) PageGetItem(opage, ii);
     885       39268 :         _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
     886       39268 :                        !isleaf);
     887             : 
     888             :         /*
     889             :          * Move 'last' into the high key position on opage.  _bt_blnewpage()
     890             :          * allocated empty space for a line pointer when opage was first
     891             :          * created, so this is a matter of rearranging already-allocated space
     892             :          * on page, and initializing high key line pointer. (Actually, leaf
     893             :          * pages must also swap oitup with a truncated version of oitup, which
     894             :          * is sometimes larger than oitup, though never by more than the space
     895             :          * needed to append a heap TID.)
     896             :          */
     897       39268 :         hii = PageGetItemId(opage, P_HIKEY);
     898       39268 :         *hii = *ii;
     899       39268 :         ItemIdSetUnused(ii);    /* redundant */
     900       39268 :         ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
     901             : 
     902       39268 :         if (isleaf)
     903             :         {
     904             :             IndexTuple  lastleft;
     905             :             IndexTuple  truncated;
     906             : 
     907             :             /*
     908             :              * Truncate away any unneeded attributes from high key on leaf
     909             :              * level.  This is only done at the leaf level because downlinks
     910             :              * in internal pages are either negative infinity items, or get
     911             :              * their contents from copying from one level down.  See also:
     912             :              * _bt_split().
     913             :              *
     914             :              * We don't try to bias our choice of split point to make it more
     915             :              * likely that _bt_truncate() can truncate away more attributes,
     916             :              * whereas the split point used within _bt_split() is chosen much
     917             :              * more delicately.  Even still, the lastleft and firstright
     918             :              * tuples passed to _bt_truncate() here are at least not fully
     919             :              * equal to each other when deduplication is used, unless there is
     920             :              * a large group of duplicates (also, unique index builds usually
     921             :              * have few or no spool2 duplicates).  When the split point is
     922             :              * between two unequal tuples, _bt_truncate() will avoid including
     923             :              * a heap TID in the new high key, which is the most important
     924             :              * benefit of suffix truncation.
     925             :              *
     926             :              * Overwrite the old item with new truncated high key directly.
     927             :              * oitup is already located at the physical beginning of tuple
     928             :              * space, so this should directly reuse the existing tuple space.
     929             :              */
     930       39144 :             ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
     931       39144 :             lastleft = (IndexTuple) PageGetItem(opage, ii);
     932             : 
     933             :             Assert(IndexTupleSize(oitup) > last_truncextra);
     934       39144 :             truncated = _bt_truncate(wstate->index, lastleft, oitup,
     935             :                                      wstate->inskey);
     936       39144 :             if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
     937             :                                          IndexTupleSize(truncated)))
     938           0 :                 elog(ERROR, "failed to add high key to the index page");
     939       39144 :             pfree(truncated);
     940             : 
     941             :             /* oitup should continue to point to the page's high key */
     942       39144 :             hii = PageGetItemId(opage, P_HIKEY);
     943       39144 :             oitup = (IndexTuple) PageGetItem(opage, hii);
     944             :         }
     945             : 
     946             :         /*
     947             :          * Link the old page into its parent, using its low key.  If we don't
     948             :          * have a parent, we have to create one; this adds a new btree level.
     949             :          */
     950       39268 :         if (state->btps_next == NULL)
     951        2552 :             state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
     952             : 
     953             :         Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
     954             :                 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
     955             :                 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
     956             :                P_LEFTMOST(BTPageGetOpaque(opage)));
     957             :         Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
     958             :                !P_LEFTMOST(BTPageGetOpaque(opage)));
     959       39268 :         BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
     960       39268 :         _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
     961       39268 :         pfree(state->btps_lowkey);
     962             : 
     963             :         /*
     964             :          * Save a copy of the high key from the old page.  It is also the low
     965             :          * key for the new page.
     966             :          */
     967       39268 :         state->btps_lowkey = CopyIndexTuple(oitup);
     968             : 
     969             :         /*
     970             :          * Set the sibling links for both pages.
     971             :          */
     972             :         {
     973       39268 :             BTPageOpaque oopaque = BTPageGetOpaque(opage);
     974       39268 :             BTPageOpaque nopaque = BTPageGetOpaque(npage);
     975             : 
     976       39268 :             oopaque->btpo_next = nblkno;
     977       39268 :             nopaque->btpo_prev = oblkno;
     978       39268 :             nopaque->btpo_next = P_NONE; /* redundant */
     979             :         }
     980             : 
     981             :         /*
     982             :          * Write out the old page. _bt_blwritepage takes ownership of the
     983             :          * 'opage' buffer.
     984             :          */
     985       39268 :         _bt_blwritepage(wstate, obuf, oblkno);
     986             : 
     987             :         /*
     988             :          * Reset last_off to point to new page
     989             :          */
     990       39268 :         last_off = P_FIRSTKEY;
     991             :     }
     992             : 
     993             :     /*
      994             :      * By here, either the original page is still the current page, or a new
      995             :      * page was created that became the current page.  Either way, the current
      996             :      * page definitely has space for the new item.
     997             :      *
     998             :      * If the new item is the first for its page, it must also be the first
     999             :      * item on its entire level.  On later same-level pages, a low key for a
    1000             :      * page will be copied from the prior page in the code above.  Generate a
    1001             :      * minus infinity low key here instead.
    1002             :      */
    1003    11210138 :     if (last_off == P_HIKEY)
    1004             :     {
    1005             :         Assert(state->btps_lowkey == NULL);
    1006       12114 :         state->btps_lowkey = palloc0(sizeof(IndexTupleData));
    1007       12114 :         state->btps_lowkey->t_info = sizeof(IndexTupleData);
    1008       12114 :         BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
    1009             :     }
    1010             : 
    1011             :     /*
    1012             :      * Add the new item into the current page.
    1013             :      */
    1014    11210138 :     last_off = OffsetNumberNext(last_off);
    1015    11210138 :     _bt_sortaddtup(npage, itupsz, itup, last_off,
    1016    11210138 :                    !isleaf && last_off == P_FIRSTKEY);
    1017             : 
    1018    11210138 :     state->btps_buf = nbuf;
    1019    11210138 :     state->btps_blkno = nblkno;
    1020    11210138 :     state->btps_lastoff = last_off;
    1021    11210138 : }
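
The page-overflow path above is easier to follow in isolation.  The following is a minimal standalone sketch (not PostgreSQL code) of the bookkeeping it performs: when the current page on a level fills up, a separator for the finished page is pushed into the parent level (creating that level on demand), and the finished page's last item stands in for the high key that becomes the next page's low key.  Pages are tiny int arrays, and every name below is invented for illustration.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_CAPACITY 4

typedef struct Level
{
    int     items[PAGE_CAPACITY];
    int     nitems;
    int     curpage;            /* number of the page being filled */
    int     lowkey;             /* low key of the page being filled */
    struct Level *parent;
} Level;

static Level *
make_level(void)
{
    Level  *lvl = calloc(1, sizeof(Level));

    lvl->lowkey = -1;           /* "minus infinity" for a level's first page */
    return lvl;
}

static void
level_add(Level *lvl, int key)
{
    if (lvl->nitems == PAGE_CAPACITY)
    {
        /* last item of the filled page stands in for its high key */
        int     highkey = lvl->items[lvl->nitems - 1];

        /* link the finished page into its parent, creating it if needed */
        if (lvl->parent == NULL)
            lvl->parent = make_level();
        level_add(lvl->parent, lvl->lowkey);

        /* old page's high key becomes the new page's low key */
        lvl->lowkey = highkey;
        lvl->nitems = 0;
        lvl->curpage++;
    }
    lvl->items[lvl->nitems++] = key;
}

int
main(void)
{
    Level  *leaf = make_level();

    for (int key = 1; key <= 20; key++)
        level_add(leaf, key);

    for (Level *lvl = leaf; lvl != NULL; lvl = lvl->parent)
        printf("level: filling page %d (%d items), low key %d\n",
               lvl->curpage, lvl->nitems, lvl->lowkey);
    return 0;
}
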
    1022             : 
    1023             : /*
    1024             :  * Finalize pending posting list tuple, and add it to the index.  Final tuple
    1025             :  * is based on saved base tuple, and saved list of heap TIDs.
    1026             :  *
    1027             :  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
    1028             :  * using _bt_buildadd().
    1029             :  */
    1030             : static void
    1031     4730614 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
    1032             :                               BTDedupState dstate)
    1033             : {
    1034             :     Assert(dstate->nitems > 0);
    1035             : 
    1036     4730614 :     if (dstate->nitems == 1)
    1037     4692072 :         _bt_buildadd(wstate, state, dstate->base, 0);
    1038             :     else
    1039             :     {
    1040             :         IndexTuple  postingtuple;
    1041             :         Size        truncextra;
    1042             : 
    1043             :         /* form a tuple with a posting list */
    1044       38542 :         postingtuple = _bt_form_posting(dstate->base,
    1045             :                                         dstate->htids,
    1046             :                                         dstate->nhtids);
    1047             :         /* Calculate posting list overhead */
    1048       38542 :         truncextra = IndexTupleSize(postingtuple) -
    1049       38542 :             BTreeTupleGetPostingOffset(postingtuple);
    1050             : 
    1051       38542 :         _bt_buildadd(wstate, state, postingtuple, truncextra);
    1052       38542 :         pfree(postingtuple);
    1053             :     }
    1054             : 
    1055     4730614 :     dstate->nmaxitems = 0;
    1056     4730614 :     dstate->nhtids = 0;
    1057     4730614 :     dstate->nitems = 0;
    1058     4730614 :     dstate->phystupsize = 0;
    1059     4730614 : }
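
For a sense of the truncextra value computed above, here is a rough standalone sketch (not PostgreSQL code): everything past the posting offset of a posting list tuple is its heap TID array, so the extra space the tuple carries relative to a plain key is simply nhtids times the TID width.  The 16-byte key portion and 6-byte TID width below are illustrative assumptions only.

#include <stdio.h>

int
main(void)
{
    size_t  posting_offset = 16;    /* header + key data (assumed) */
    size_t  tid_width = 6;          /* width of one heap TID (assumed) */
    size_t  nhtids = 100;

    size_t  tuple_size = posting_offset + nhtids * tid_width;
    size_t  truncextra = tuple_size - posting_offset;

    printf("posting tuple: %zu bytes, of which %zu bytes are the TID list\n",
           tuple_size, truncextra);
    return 0;
}
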
    1060             : 
    1061             : /*
    1062             :  * Finish writing out the completed btree.
    1063             :  */
    1064             : static void
    1065       48348 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
    1066             : {
    1067             :     BTPageState *s;
    1068       48348 :     BlockNumber rootblkno = P_NONE;
    1069       48348 :     uint32      rootlevel = 0;
    1070             :     BulkWriteBuffer metabuf;
    1071             : 
    1072             :     /*
    1073             :      * Each iteration of this loop completes one more level of the tree.
    1074             :      */
    1075       60462 :     for (s = state; s != NULL; s = s->btps_next)
    1076             :     {
    1077             :         BlockNumber blkno;
    1078             :         BTPageOpaque opaque;
    1079             : 
    1080       12114 :         blkno = s->btps_blkno;
    1081       12114 :         opaque = BTPageGetOpaque((Page) s->btps_buf);
    1082             : 
    1083             :         /*
    1084             :          * We have to link the last page on this level to somewhere.
    1085             :          *
    1086             :          * If we're at the top, it's the root, so attach it to the metapage.
    1087             :          * Otherwise, add an entry for it to its parent using its low key.
    1088             :          * This may cause the last page of the parent level to split, but
    1089             :          * that's not a problem -- we haven't gotten to it yet.
    1090             :          */
    1091       12114 :         if (s->btps_next == NULL)
    1092             :         {
    1093        9562 :             opaque->btpo_flags |= BTP_ROOT;
    1094        9562 :             rootblkno = blkno;
    1095        9562 :             rootlevel = s->btps_level;
    1096             :         }
    1097             :         else
    1098             :         {
    1099             :             Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
    1100             :                     IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1101             :                     BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
    1102             :                    P_LEFTMOST(opaque));
    1103             :             Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
    1104             :                    !P_LEFTMOST(opaque));
    1105        2552 :             BTreeTupleSetDownLink(s->btps_lowkey, blkno);
    1106        2552 :             _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
    1107        2552 :             pfree(s->btps_lowkey);
    1108        2552 :             s->btps_lowkey = NULL;
    1109             :         }
    1110             : 
    1111             :         /*
    1112             :          * This is the rightmost page, so the ItemId array needs to be slid
    1113             :          * back one slot.  Then we can dump out the page.
    1114             :          */
    1115       12114 :         _bt_slideleft((Page) s->btps_buf);
    1116       12114 :         _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
    1117       12114 :         s->btps_buf = NULL;      /* writepage took ownership of the buffer */
    1118             :     }
    1119             : 
    1120             :     /*
    1121             :      * As the last step in the process, construct the metapage and make it
    1122             :      * point to the new root (unless we had no data at all, in which case it's
    1123             :      * set to point to "P_NONE").  This changes the index to the "valid" state
    1124             :      * by filling in a valid magic number in the metapage.
    1125             :      */
    1126       48348 :     metabuf = smgr_bulk_get_buf(wstate->bulkstate);
    1127       48348 :     _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
    1128       48348 :                      wstate->inskey->allequalimage);
    1129       48348 :     _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
    1130       48348 : }
    1131             : 
    1132             : /*
    1133             :  * Read tuples in correct sort order from tuplesort, and load them into
    1134             :  * btree leaves.
    1135             :  */
    1136             : static void
    1137       48348 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
    1138             : {
    1139       48348 :     BTPageState *state = NULL;
    1140       48348 :     bool        merge = (btspool2 != NULL);
    1141             :     IndexTuple  itup,
    1142       48348 :                 itup2 = NULL;
    1143             :     bool        load1;
    1144       48348 :     TupleDesc   tupdes = RelationGetDescr(wstate->index);
    1145             :     int         i,
    1146       48348 :                 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
    1147             :     SortSupport sortKeys;
    1148       48348 :     int64       tuples_done = 0;
    1149             :     bool        deduplicate;
    1150             : 
    1151       48348 :     wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
    1152             : 
    1153       56996 :     deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
    1154        8648 :         BTGetDeduplicateItems(wstate->index);
    1155             : 
    1156       48348 :     if (merge)
    1157             :     {
    1158             :         /*
    1159             :          * Another BTSpool for dead tuples exists. Now we have to merge
    1160             :          * btspool and btspool2.
    1161             :          */
    1162             : 
    1163             :         /* the preparation of merge */
    1164          22 :         itup = tuplesort_getindextuple(btspool->sortstate, true);
    1165          22 :         itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1166             : 
    1167             :         /* Prepare SortSupport data for each column */
    1168          22 :         sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
    1169             : 
    1170          46 :         for (i = 0; i < keysz; i++)
    1171             :         {
    1172          24 :             SortSupport sortKey = sortKeys + i;
    1173          24 :             ScanKey     scanKey = wstate->inskey->scankeys + i;
    1174             :             bool        reverse;
    1175             : 
    1176          24 :             sortKey->ssup_cxt = CurrentMemoryContext;
    1177          24 :             sortKey->ssup_collation = scanKey->sk_collation;
    1178          24 :             sortKey->ssup_nulls_first =
    1179          24 :                 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
    1180          24 :             sortKey->ssup_attno = scanKey->sk_attno;
    1181             :             /* Abbreviation is not supported here */
    1182          24 :             sortKey->abbreviate = false;
    1183             : 
    1184             :             Assert(sortKey->ssup_attno != 0);
    1185             : 
    1186          24 :             reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
    1187             : 
    1188          24 :             PrepareSortSupportFromIndexRel(wstate->index, reverse, sortKey);
    1189             :         }
    1190             : 
    1191             :         for (;;)
    1192             :         {
     1193        3324 :             load1 = true;       /* load BTSpool next? */
    1194        3324 :             if (itup2 == NULL)
    1195             :             {
    1196         154 :                 if (itup == NULL)
    1197          22 :                     break;
    1198             :             }
    1199        3170 :             else if (itup != NULL)
    1200             :             {
    1201        2976 :                 int32       compare = 0;
    1202             : 
    1203        3222 :                 for (i = 1; i <= keysz; i++)
    1204             :                 {
    1205             :                     SortSupport entry;
    1206             :                     Datum       attrDatum1,
    1207             :                                 attrDatum2;
    1208             :                     bool        isNull1,
    1209             :                                 isNull2;
    1210             : 
    1211        3028 :                     entry = sortKeys + i - 1;
    1212        3028 :                     attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
    1213        3028 :                     attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
    1214             : 
    1215        3028 :                     compare = ApplySortComparator(attrDatum1, isNull1,
    1216             :                                                   attrDatum2, isNull2,
    1217             :                                                   entry);
    1218        3028 :                     if (compare > 0)
    1219             :                     {
    1220         278 :                         load1 = false;
    1221        2782 :                         break;
    1222             :                     }
    1223        2750 :                     else if (compare < 0)
    1224        2504 :                         break;
    1225             :                 }
    1226             : 
    1227             :                 /*
    1228             :                  * If key values are equal, we sort on ItemPointer.  This is
    1229             :                  * required for btree indexes, since heap TID is treated as an
    1230             :                  * implicit last key attribute in order to ensure that all
    1231             :                  * keys in the index are physically unique.
    1232             :                  */
    1233        2976 :                 if (compare == 0)
    1234             :                 {
    1235         194 :                     compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
    1236             :                     Assert(compare != 0);
    1237         194 :                     if (compare > 0)
    1238          30 :                         load1 = false;
    1239             :                 }
    1240             :             }
    1241             :             else
    1242         194 :                 load1 = false;
    1243             : 
    1244             :             /* When we see first tuple, create first index page */
    1245        3302 :             if (state == NULL)
    1246          22 :                 state = _bt_pagestate(wstate, 0);
    1247             : 
    1248        3302 :             if (load1)
    1249             :             {
    1250        2800 :                 _bt_buildadd(wstate, state, itup, 0);
    1251        2800 :                 itup = tuplesort_getindextuple(btspool->sortstate, true);
    1252             :             }
    1253             :             else
    1254             :             {
    1255         502 :                 _bt_buildadd(wstate, state, itup2, 0);
    1256         502 :                 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1257             :             }
    1258             : 
    1259             :             /* Report progress */
    1260        3302 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1261             :                                          ++tuples_done);
    1262             :         }
    1263          22 :         pfree(sortKeys);
    1264             :     }
    1265       48326 :     else if (deduplicate)
    1266             :     {
    1267             :         /* merge is unnecessary, deduplicate into posting lists */
    1268             :         BTDedupState dstate;
    1269             : 
    1270        8648 :         dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
    1271        8648 :         dstate->deduplicate = true; /* unused */
    1272        8648 :         dstate->nmaxitems = 0;   /* unused */
    1273        8648 :         dstate->maxpostingsize = 0; /* set later */
    1274             :         /* Metadata about base tuple of current pending posting list */
    1275        8648 :         dstate->base = NULL;
    1276        8648 :         dstate->baseoff = InvalidOffsetNumber;   /* unused */
    1277        8648 :         dstate->basetupsize = 0;
    1278             :         /* Metadata about current pending posting list TIDs */
    1279        8648 :         dstate->htids = NULL;
    1280        8648 :         dstate->nhtids = 0;
    1281        8648 :         dstate->nitems = 0;
    1282        8648 :         dstate->phystupsize = 0; /* unused */
    1283        8648 :         dstate->nintervals = 0; /* unused */
    1284             : 
    1285     6083244 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1286             :                                                true)) != NULL)
    1287             :         {
    1288             :             /* When we see first tuple, create first index page */
    1289     6074596 :             if (state == NULL)
    1290             :             {
    1291        2352 :                 state = _bt_pagestate(wstate, 0);
    1292             : 
    1293             :                 /*
    1294             :                  * Limit size of posting list tuples to 1/10 space we want to
    1295             :                  * leave behind on the page, plus space for final item's line
    1296             :                  * pointer.  This is equal to the space that we'd like to
    1297             :                  * leave behind on each leaf page when fillfactor is 90,
    1298             :                  * allowing us to get close to fillfactor% space utilization
    1299             :                  * when there happen to be a great many duplicates.  (This
    1300             :                  * makes higher leaf fillfactor settings ineffective when
    1301             :                  * building indexes that have many duplicates, but packing
    1302             :                  * leaf pages full with few very large tuples doesn't seem
    1303             :                  * like a useful goal.)
    1304             :                  */
    1305        2352 :                 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
    1306             :                     sizeof(ItemIdData);
    1307             :                 Assert(dstate->maxpostingsize <= BTMaxItemSize &&
    1308             :                        dstate->maxpostingsize <= INDEX_SIZE_MASK);
    1309        2352 :                 dstate->htids = palloc(dstate->maxpostingsize);
    1310             : 
    1311             :                 /* start new pending posting list with itup copy */
    1312        2352 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1313             :                                         InvalidOffsetNumber);
    1314             :             }
    1315     6072244 :             else if (_bt_keep_natts_fast(wstate->index, dstate->base,
    1316     1351852 :                                          itup) > keysz &&
    1317     1351852 :                      _bt_dedup_save_htid(dstate, itup))
    1318             :             {
    1319             :                 /*
    1320             :                  * Tuple is equal to base tuple of pending posting list.  Heap
    1321             :                  * TID from itup has been saved in state.
    1322             :                  */
    1323             :             }
    1324             :             else
    1325             :             {
    1326             :                 /*
    1327             :                  * Tuple is not equal to pending posting list tuple, or
    1328             :                  * _bt_dedup_save_htid() opted to not merge current item into
    1329             :                  * pending posting list.
    1330             :                  */
    1331     4728262 :                 _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1332     4728262 :                 pfree(dstate->base);
    1333             : 
    1334             :                 /* start new pending posting list with itup copy */
    1335     4728262 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1336             :                                         InvalidOffsetNumber);
    1337             :             }
    1338             : 
    1339             :             /* Report progress */
    1340     6074596 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1341             :                                          ++tuples_done);
    1342             :         }
    1343             : 
    1344        8648 :         if (state)
    1345             :         {
    1346             :             /*
    1347             :              * Handle the last item (there must be a last item when the
    1348             :              * tuplesort returned one or more tuples)
    1349             :              */
    1350        2352 :             _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1351        2352 :             pfree(dstate->base);
    1352        2352 :             pfree(dstate->htids);
    1353             :         }
    1354             : 
    1355        8648 :         pfree(dstate);
    1356             :     }
    1357             :     else
    1358             :     {
    1359             :         /* merging and deduplication are both unnecessary */
    1360     6474080 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1361             :                                                true)) != NULL)
    1362             :         {
    1363             :             /* When we see first tuple, create first index page */
    1364     6434402 :             if (state == NULL)
    1365        7188 :                 state = _bt_pagestate(wstate, 0);
    1366             : 
    1367     6434402 :             _bt_buildadd(wstate, state, itup, 0);
    1368             : 
    1369             :             /* Report progress */
    1370     6434402 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1371             :                                          ++tuples_done);
    1372             :         }
    1373             :     }
    1374             : 
    1375             :     /* Close down final pages and write the metapage */
    1376       48348 :     _bt_uppershutdown(wstate, state);
    1377       48348 :     smgr_bulk_finish(wstate->bulkstate);
    1378       48348 : }
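
The spool2 merge loop above boils down to a two-way merge with a heap TID tiebreak.  Below is a minimal standalone sketch (not PostgreSQL code) of that control flow; plain ints stand in both for the key comparison that really goes through SortSupport and for the ItemPointer comparison.

#include <stdio.h>

typedef struct
{
    int     key;
    int     tid;
} Tup;

int
main(void)
{
    Tup     a[] = {{1, 10}, {3, 5}, {3, 9}};    /* "btspool" input */
    Tup     b[] = {{2, 7}, {3, 6}};             /* "btspool2" input */
    int     na = 3, nb = 2, i = 0, j = 0;

    while (i < na || j < nb)
    {
        int     load1;          /* take from the first input? */

        if (j >= nb)
            load1 = 1;
        else if (i >= na)
            load1 = 0;
        else if (a[i].key != b[j].key)
            load1 = (a[i].key < b[j].key);
        else
            load1 = (a[i].tid < b[j].tid);      /* keys equal: compare TIDs */

        if (load1)
        {
            printf("(%d,%d) from input 1\n", a[i].key, a[i].tid);
            i++;
        }
        else
        {
            printf("(%d,%d) from input 2\n", b[j].key, b[j].tid);
            j++;
        }
    }
    return 0;
}
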
    1379             : 
    1380             : /*
    1381             :  * Create parallel context, and launch workers for leader.
    1382             :  *
    1383             :  * buildstate argument should be initialized (with the exception of the
    1384             :  * tuplesort state in spools, which may later be created based on shared
    1385             :  * state initially set up here).
    1386             :  *
    1387             :  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
    1388             :  *
    1389             :  * request is the target number of parallel worker processes to launch.
    1390             :  *
    1391             :  * Sets buildstate's BTLeader, which caller must use to shut down parallel
    1392             :  * mode by passing it to _bt_end_parallel() at the very end of its index
    1393             :  * build.  If not even a single worker process can be launched, this is
    1394             :  * never set, and caller should proceed with a serial index build.
    1395             :  */
    1396             : static void
    1397         148 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
    1398             : {
    1399             :     ParallelContext *pcxt;
    1400             :     int         scantuplesortstates;
    1401             :     Snapshot    snapshot;
    1402             :     Size        estbtshared;
    1403             :     Size        estsort;
    1404             :     BTShared   *btshared;
    1405             :     Sharedsort *sharedsort;
    1406             :     Sharedsort *sharedsort2;
    1407         148 :     BTSpool    *btspool = buildstate->spool;
    1408         148 :     BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
    1409             :     WalUsage   *walusage;
    1410             :     BufferUsage *bufferusage;
    1411         148 :     bool        leaderparticipates = true;
    1412             :     int         querylen;
    1413             : 
    1414             : #ifdef DISABLE_LEADER_PARTICIPATION
    1415             :     leaderparticipates = false;
    1416             : #endif
    1417             : 
    1418             :     /*
    1419             :      * Enter parallel mode, and create context for parallel build of btree
    1420             :      * index
    1421             :      */
    1422         148 :     EnterParallelMode();
    1423             :     Assert(request > 0);
    1424         148 :     pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
    1425             :                                  request);
    1426             : 
    1427         148 :     scantuplesortstates = leaderparticipates ? request + 1 : request;
    1428             : 
    1429             :     /*
    1430             :      * Prepare for scan of the base relation.  In a normal index build, we use
    1431             :      * SnapshotAny because we must retrieve all tuples and do our own time
    1432             :      * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
    1433             :      * concurrent build, we take a regular MVCC snapshot and index whatever's
    1434             :      * live according to that.
    1435             :      */
    1436         148 :     if (!isconcurrent)
    1437         148 :         snapshot = SnapshotAny;
    1438             :     else
    1439           0 :         snapshot = RegisterSnapshot(GetTransactionSnapshot());
    1440             : 
    1441             :     /*
    1442             :      * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
    1443             :      * PARALLEL_KEY_TUPLESORT tuplesort workspace
    1444             :      */
    1445         148 :     estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
    1446         148 :     shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    1447         148 :     estsort = tuplesort_estimate_shared(scantuplesortstates);
    1448         148 :     shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1449             : 
    1450             :     /*
    1451             :      * Unique case requires a second spool, and so we may have to account for
    1452             :      * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
    1453             :      */
    1454         148 :     if (!btspool->isunique)
    1455          84 :         shm_toc_estimate_keys(&pcxt->estimator, 2);
    1456             :     else
    1457             :     {
    1458          64 :         shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1459          64 :         shm_toc_estimate_keys(&pcxt->estimator, 3);
    1460             :     }
    1461             : 
    1462             :     /*
    1463             :      * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
    1464             :      * and PARALLEL_KEY_BUFFER_USAGE.
    1465             :      *
    1466             :      * If there are no extensions loaded that care, we could skip this.  We
    1467             :      * have no way of knowing whether anyone's looking at pgWalUsage or
    1468             :      * pgBufferUsage, so do it unconditionally.
    1469             :      */
    1470         148 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1471             :                            mul_size(sizeof(WalUsage), pcxt->nworkers));
    1472         148 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1473         148 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1474             :                            mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1475         148 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1476             : 
    1477             :     /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    1478         148 :     if (debug_query_string)
    1479             :     {
    1480         148 :         querylen = strlen(debug_query_string);
    1481         148 :         shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
    1482         148 :         shm_toc_estimate_keys(&pcxt->estimator, 1);
    1483             :     }
    1484             :     else
    1485           0 :         querylen = 0;           /* keep compiler quiet */
    1486             : 
    1487             :     /* Everyone's had a chance to ask for space, so now create the DSM */
    1488         148 :     InitializeParallelDSM(pcxt);
    1489             : 
    1490             :     /* If no DSM segment was available, back out (do serial build) */
    1491         148 :     if (pcxt->seg == NULL)
    1492             :     {
    1493           0 :         if (IsMVCCSnapshot(snapshot))
    1494           0 :             UnregisterSnapshot(snapshot);
    1495           0 :         DestroyParallelContext(pcxt);
    1496           0 :         ExitParallelMode();
    1497           0 :         return;
    1498             :     }
    1499             : 
    1500             :     /* Store shared build state, for which we reserved space */
    1501         148 :     btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    1502             :     /* Initialize immutable state */
    1503         148 :     btshared->heaprelid = RelationGetRelid(btspool->heap);
    1504         148 :     btshared->indexrelid = RelationGetRelid(btspool->index);
    1505         148 :     btshared->isunique = btspool->isunique;
    1506         148 :     btshared->nulls_not_distinct = btspool->nulls_not_distinct;
    1507         148 :     btshared->isconcurrent = isconcurrent;
    1508         148 :     btshared->scantuplesortstates = scantuplesortstates;
    1509         148 :     btshared->queryid = pgstat_get_my_query_id();
    1510         148 :     ConditionVariableInit(&btshared->workersdonecv);
    1511         148 :     SpinLockInit(&btshared->mutex);
    1512             :     /* Initialize mutable state */
    1513         148 :     btshared->nparticipantsdone = 0;
    1514         148 :     btshared->reltuples = 0.0;
    1515         148 :     btshared->havedead = false;
    1516         148 :     btshared->indtuples = 0.0;
    1517         148 :     btshared->brokenhotchain = false;
    1518         148 :     table_parallelscan_initialize(btspool->heap,
    1519             :                                   ParallelTableScanFromBTShared(btshared),
    1520             :                                   snapshot);
    1521             : 
    1522             :     /*
    1523             :      * Store shared tuplesort-private state, for which we reserved space.
    1524             :      * Then, initialize opaque state using tuplesort routine.
    1525             :      */
    1526         148 :     sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1527         148 :     tuplesort_initialize_shared(sharedsort, scantuplesortstates,
    1528             :                                 pcxt->seg);
    1529             : 
    1530         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    1531         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
    1532             : 
    1533             :     /* Unique case requires a second spool, and associated shared state */
    1534         148 :     if (!btspool->isunique)
    1535          84 :         sharedsort2 = NULL;
    1536             :     else
    1537             :     {
    1538             :         /*
    1539             :          * Store additional shared tuplesort-private state, for which we
    1540             :          * reserved space.  Then, initialize opaque state using tuplesort
    1541             :          * routine.
    1542             :          */
    1543          64 :         sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1544          64 :         tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
    1545             :                                     pcxt->seg);
    1546             : 
    1547          64 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    1548             :     }
    1549             : 
    1550             :     /* Store query string for workers */
    1551         148 :     if (debug_query_string)
    1552             :     {
    1553             :         char       *sharedquery;
    1554             : 
    1555         148 :         sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
    1556         148 :         memcpy(sharedquery, debug_query_string, querylen + 1);
    1557         148 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    1558             :     }
    1559             : 
    1560             :     /*
    1561             :      * Allocate space for each worker's WalUsage and BufferUsage; no need to
    1562             :      * initialize.
    1563             :      */
    1564         148 :     walusage = shm_toc_allocate(pcxt->toc,
    1565         148 :                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    1566         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    1567         148 :     bufferusage = shm_toc_allocate(pcxt->toc,
    1568         148 :                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1569         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
    1570             : 
    1571             :     /* Launch workers, saving status for leader/caller */
    1572         148 :     LaunchParallelWorkers(pcxt);
    1573         148 :     btleader->pcxt = pcxt;
    1574         148 :     btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    1575         148 :     if (leaderparticipates)
    1576         148 :         btleader->nparticipanttuplesorts++;
    1577         148 :     btleader->btshared = btshared;
    1578         148 :     btleader->sharedsort = sharedsort;
    1579         148 :     btleader->sharedsort2 = sharedsort2;
    1580         148 :     btleader->snapshot = snapshot;
    1581         148 :     btleader->walusage = walusage;
    1582         148 :     btleader->bufferusage = bufferusage;
    1583             : 
    1584             :     /* If no workers were successfully launched, back out (do serial build) */
    1585         148 :     if (pcxt->nworkers_launched == 0)
    1586             :     {
    1587           0 :         _bt_end_parallel(btleader);
    1588           0 :         return;
    1589             :     }
    1590             : 
    1591             :     /* Save leader state now that it's clear build will be parallel */
    1592         148 :     buildstate->btleader = btleader;
    1593             : 
    1594             :     /* Join heap scan ourselves */
    1595         148 :     if (leaderparticipates)
    1596         148 :         _bt_leader_participate_as_worker(buildstate);
    1597             : 
    1598             :     /*
    1599             :      * Caller needs to wait for all launched workers when we return.  Make
    1600             :      * sure that the failure-to-start case will not hang forever.
    1601             :      */
    1602         148 :     WaitForParallelWorkersToAttach(pcxt);
    1603             : }
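
The shm_toc estimate/insert dance above follows a common pattern: every participant declares the chunks and keys it will need before the DSM segment is sized and created.  The following toy standalone sketch (not PostgreSQL code) shows the shape of that accounting; the per-key overhead constant and all names are invented for illustration.

#include <stdio.h>

typedef struct
{
    size_t  chunk_bytes;
    int     nkeys;
} Estimator;

#define PER_KEY_OVERHEAD 64     /* assumed bookkeeping cost per TOC entry */

static void
estimate_chunk(Estimator *e, size_t len)
{
    e->chunk_bytes += len;
}

static void
estimate_keys(Estimator *e, int n)
{
    e->nkeys += n;
}

int
main(void)
{
    Estimator   est = {0, 0};

    estimate_chunk(&est, 1024);     /* stand-in for the shared build state */
    estimate_chunk(&est, 4096);     /* stand-in for the tuplesort workspace */
    estimate_keys(&est, 2);

    printf("segment size: %zu bytes\n",
           est.chunk_bytes + (size_t) est.nkeys * PER_KEY_OVERHEAD);
    return 0;
}
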
    1604             : 
    1605             : /*
    1606             :  * Shut down workers, destroy parallel context, and end parallel mode.
    1607             :  */
    1608             : static void
    1609         148 : _bt_end_parallel(BTLeader *btleader)
    1610             : {
    1611             :     int         i;
    1612             : 
    1613             :     /* Shutdown worker processes */
    1614         148 :     WaitForParallelWorkersToFinish(btleader->pcxt);
    1615             : 
    1616             :     /*
    1617             :      * Next, accumulate WAL usage.  (This must wait for the workers to finish,
    1618             :      * or we might get incomplete data.)
    1619             :      */
    1620         296 :     for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
    1621         148 :         InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
    1622             : 
    1623             :     /* Free last reference to MVCC snapshot, if one was used */
    1624         148 :     if (IsMVCCSnapshot(btleader->snapshot))
    1625           0 :         UnregisterSnapshot(btleader->snapshot);
    1626         148 :     DestroyParallelContext(btleader->pcxt);
    1627         148 :     ExitParallelMode();
    1628         148 : }
    1629             : 
    1630             : /*
    1631             :  * Returns size of shared memory required to store state for a parallel
    1632             :  * btree index build based on the snapshot its parallel scan will use.
    1633             :  */
    1634             : static Size
    1635         148 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    1636             : {
    1637             :     /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    1638         148 :     return add_size(BUFFERALIGN(sizeof(BTShared)),
    1639             :                     table_parallelscan_estimate(heap, snapshot));
    1640             : }
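
The BUFFERALIGN in the estimate above just rounds the fixed-size header up to an alignment boundary, so the parallel scan descriptor that follows it starts on an aligned address.  A small standalone sketch (not PostgreSQL code), assuming a 64-byte boundary for illustration:

#include <stdio.h>

#define ALIGN_UP(len, boundary) (((len) + (boundary) - 1) & ~((size_t) (boundary) - 1))

int
main(void)
{
    size_t  header = 200;       /* stand-in for sizeof(BTShared) */
    size_t  scan = 1000;        /* stand-in for the parallel scan estimate */

    printf("estimate = %zu bytes\n", ALIGN_UP(header, 64) + scan);
    return 0;
}
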
    1641             : 
    1642             : /*
    1643             :  * Within leader, wait for end of heap scan.
    1644             :  *
    1645             :  * When called, parallel heap scan started by _bt_begin_parallel() will
    1646             :  * already be underway within worker processes (when leader participates
    1647             :  * as a worker, we should end up here just as workers are finishing).
    1648             :  *
    1649             :  * Fills in fields needed for ambuild statistics, and lets caller set
    1650             :  * field indicating that some worker encountered a broken HOT chain.
    1651             :  *
    1652             :  * Returns the total number of heap tuples scanned.
    1653             :  */
    1654             : static double
    1655         148 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
    1656             : {
    1657         148 :     BTShared   *btshared = buildstate->btleader->btshared;
    1658             :     int         nparticipanttuplesorts;
    1659             :     double      reltuples;
    1660             : 
    1661         148 :     nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    1662             :     for (;;)
    1663             :     {
    1664         380 :         SpinLockAcquire(&btshared->mutex);
    1665         380 :         if (btshared->nparticipantsdone == nparticipanttuplesorts)
    1666             :         {
    1667         148 :             buildstate->havedead = btshared->havedead;
    1668         148 :             buildstate->indtuples = btshared->indtuples;
    1669         148 :             *brokenhotchain = btshared->brokenhotchain;
    1670         148 :             reltuples = btshared->reltuples;
    1671         148 :             SpinLockRelease(&btshared->mutex);
    1672         148 :             break;
    1673             :         }
    1674         232 :         SpinLockRelease(&btshared->mutex);
    1675             : 
    1676         232 :         ConditionVariableSleep(&btshared->workersdonecv,
    1677             :                                WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    1678             :     }
    1679             : 
    1680         148 :     ConditionVariableCancelSleep();
    1681             : 
    1682         148 :     return reltuples;
    1683             : }
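
The wait loop above is the classic condition-variable pattern: re-check the shared counter under the lock, and sleep until a worker wakes us.  The standalone sketch below (not PostgreSQL code) shows the same shape using POSIX threads in place of PostgreSQL's spinlock and ConditionVariable; compile with -pthread.

#include <pthread.h>
#include <stdio.h>

#define NWORKERS 3

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
static int nparticipantsdone = 0;

static void *
worker(void *arg)
{
    (void) arg;
    pthread_mutex_lock(&mtx);
    nparticipantsdone++;            /* report completion */
    pthread_cond_signal(&done_cv);
    pthread_mutex_unlock(&mtx);
    return NULL;
}

int
main(void)
{
    pthread_t   tids[NWORKERS];

    for (int i = 0; i < NWORKERS; i++)
        pthread_create(&tids[i], NULL, worker, NULL);

    pthread_mutex_lock(&mtx);
    while (nparticipantsdone < NWORKERS)        /* leader re-checks and waits */
        pthread_cond_wait(&done_cv, &mtx);
    pthread_mutex_unlock(&mtx);

    printf("all %d workers reported done\n", NWORKERS);

    for (int i = 0; i < NWORKERS; i++)
        pthread_join(tids[i], NULL);
    return 0;
}
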
    1684             : 
    1685             : /*
    1686             :  * Within leader, participate as a parallel worker.
    1687             :  */
    1688             : static void
    1689         148 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
    1690             : {
    1691         148 :     BTLeader   *btleader = buildstate->btleader;
    1692             :     BTSpool    *leaderworker;
    1693             :     BTSpool    *leaderworker2;
    1694             :     int         sortmem;
    1695             : 
    1696             :     /* Allocate memory and initialize private spool */
    1697         148 :     leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
    1698         148 :     leaderworker->heap = buildstate->spool->heap;
    1699         148 :     leaderworker->index = buildstate->spool->index;
    1700         148 :     leaderworker->isunique = buildstate->spool->isunique;
    1701         148 :     leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
    1702             : 
    1703             :     /* Initialize second spool, if required */
    1704         148 :     if (!btleader->btshared->isunique)
    1705          84 :         leaderworker2 = NULL;
    1706             :     else
    1707             :     {
    1708             :         /* Allocate memory for worker's own private secondary spool */
    1709          64 :         leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1710             : 
    1711             :         /* Initialize worker's own secondary spool */
    1712          64 :         leaderworker2->heap = leaderworker->heap;
    1713          64 :         leaderworker2->index = leaderworker->index;
    1714          64 :         leaderworker2->isunique = false;
    1715             :     }
    1716             : 
    1717             :     /*
     1718             :      * Might as well use a reliable figure when doling out
     1719             :      * maintenance_work_mem (when the requested number of workers was not
     1720             :      * launched, this will be somewhat higher than it is for other workers).
    1721             :      */
    1722         148 :     sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
    1723             : 
    1724             :     /* Perform work common to all participants */
    1725         148 :     _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
    1726             :                                btleader->sharedsort, btleader->sharedsort2,
    1727             :                                sortmem, true);
    1728             : 
    1729             : #ifdef BTREE_BUILD_STATS
    1730             :     if (log_btree_build_stats)
    1731             :     {
    1732             :         ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
    1733             :         ResetUsage();
    1734             :     }
    1735             : #endif                          /* BTREE_BUILD_STATS */
    1736         148 : }
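
The sortmem figure used above is simply maintenance_work_mem split evenly across all participating tuplesorts, with the leader counted as one participant when it joins the scan itself.  A trivial standalone sketch (not PostgreSQL code), using the stock 64MB default for illustration:

#include <stdio.h>

int
main(void)
{
    int     maintenance_work_mem = 65536;   /* kB, the default setting */
    int     nworkers_launched = 2;
    int     leader_participates = 1;
    int     nparticipants = nworkers_launched + (leader_participates ? 1 : 0);

    printf("each participant sorts with %d kB\n",
           maintenance_work_mem / nparticipants);
    return 0;
}
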
    1737             : 
    1738             : /*
    1739             :  * Perform work within a launched parallel process.
    1740             :  */
    1741             : void
    1742         148 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
    1743             : {
    1744             :     char       *sharedquery;
    1745             :     BTSpool    *btspool;
    1746             :     BTSpool    *btspool2;
    1747             :     BTShared   *btshared;
    1748             :     Sharedsort *sharedsort;
    1749             :     Sharedsort *sharedsort2;
    1750             :     Relation    heapRel;
    1751             :     Relation    indexRel;
    1752             :     LOCKMODE    heapLockmode;
    1753             :     LOCKMODE    indexLockmode;
    1754             :     WalUsage   *walusage;
    1755             :     BufferUsage *bufferusage;
    1756             :     int         sortmem;
    1757             : 
    1758             : #ifdef BTREE_BUILD_STATS
    1759             :     if (log_btree_build_stats)
    1760             :         ResetUsage();
    1761             : #endif                          /* BTREE_BUILD_STATS */
    1762             : 
    1763             :     /*
     1764             :      * The only possible status flag that can be set for the parallel worker is
    1765             :      * PROC_IN_SAFE_IC.
    1766             :      */
    1767             :     Assert((MyProc->statusFlags == 0) ||
    1768             :            (MyProc->statusFlags == PROC_IN_SAFE_IC));
    1769             : 
    1770             :     /* Set debug_query_string for individual workers first */
    1771         148 :     sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    1772         148 :     debug_query_string = sharedquery;
    1773             : 
    1774             :     /* Report the query string from leader */
    1775         148 :     pgstat_report_activity(STATE_RUNNING, debug_query_string);
    1776             : 
    1777             :     /* Look up nbtree shared state */
    1778         148 :     btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    1779             : 
    1780             :     /* Open relations using lock modes known to be obtained by index.c */
    1781         148 :     if (!btshared->isconcurrent)
    1782             :     {
    1783         148 :         heapLockmode = ShareLock;
    1784         148 :         indexLockmode = AccessExclusiveLock;
    1785             :     }
    1786             :     else
    1787             :     {
    1788           0 :         heapLockmode = ShareUpdateExclusiveLock;
    1789           0 :         indexLockmode = RowExclusiveLock;
    1790             :     }
    1791             : 
    1792             :     /* Track query ID */
    1793         148 :     pgstat_report_query_id(btshared->queryid, false);
    1794             : 
    1795             :     /* Open relations within worker */
    1796         148 :     heapRel = table_open(btshared->heaprelid, heapLockmode);
    1797         148 :     indexRel = index_open(btshared->indexrelid, indexLockmode);
    1798             : 
    1799             :     /* Initialize worker's own spool */
    1800         148 :     btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    1801         148 :     btspool->heap = heapRel;
    1802         148 :     btspool->index = indexRel;
    1803         148 :     btspool->isunique = btshared->isunique;
    1804         148 :     btspool->nulls_not_distinct = btshared->nulls_not_distinct;
    1805             : 
    1806             :     /* Look up shared state private to tuplesort.c */
    1807         148 :     sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    1808         148 :     tuplesort_attach_shared(sharedsort, seg);
    1809         148 :     if (!btshared->isunique)
    1810             :     {
    1811          84 :         btspool2 = NULL;
    1812          84 :         sharedsort2 = NULL;
    1813             :     }
    1814             :     else
    1815             :     {
    1816             :         /* Allocate memory for worker's own private secondary spool */
    1817          64 :         btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1818             : 
    1819             :         /* Initialize worker's own secondary spool */
    1820          64 :         btspool2->heap = btspool->heap;
    1821          64 :         btspool2->index = btspool->index;
    1822          64 :         btspool2->isunique = false;
    1823             :         /* Look up shared state private to tuplesort.c */
    1824          64 :         sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
    1825          64 :         tuplesort_attach_shared(sharedsort2, seg);
    1826             :     }
    1827             : 
    1828             :     /* Prepare to track buffer usage during parallel execution */
    1829         148 :     InstrStartParallelQuery();
    1830             : 
    1831             :     /* Perform sorting of spool, and possibly a spool2 */
    1832         148 :     sortmem = maintenance_work_mem / btshared->scantuplesortstates;
    1833         148 :     _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
    1834             :                                sharedsort2, sortmem, false);
    1835             : 
    1836             :     /* Report WAL/buffer usage during parallel execution */
    1837         148 :     bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    1838         148 :     walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    1839         148 :     InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
    1840         148 :                           &walusage[ParallelWorkerNumber]);
    1841             : 
    1842             : #ifdef BTREE_BUILD_STATS
    1843             :     if (log_btree_build_stats)
    1844             :     {
    1845             :         ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
    1846             :         ResetUsage();
    1847             :     }
    1848             : #endif                          /* BTREE_BUILD_STATS */
    1849             : 
    1850         148 :     index_close(indexRel, indexLockmode);
    1851         148 :     table_close(heapRel, heapLockmode);
    1852         148 : }
    1853             : 
    1854             : /*
    1855             :  * Perform a worker's portion of a parallel sort.
    1856             :  *
     1857             :  * This generates a tuplesort for the passed btspool, and a second tuplesort
     1858             :  * state if a second btspool is needed (i.e. for unique index builds).  All
    1859             :  * other spool fields should already be set when this is called.
    1860             :  *
    1861             :  * sortmem is the amount of working memory to use within each worker,
    1862             :  * expressed in KBs.
    1863             :  *
    1864             :  * When this returns, workers are done, and need only release resources.
    1865             :  */
    1866             : static void
    1867         296 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
    1868             :                            BTShared *btshared, Sharedsort *sharedsort,
    1869             :                            Sharedsort *sharedsort2, int sortmem, bool progress)
    1870             : {
    1871             :     SortCoordinate coordinate;
    1872             :     BTBuildState buildstate;
    1873             :     TableScanDesc scan;
    1874             :     double      reltuples;
    1875             :     IndexInfo  *indexInfo;
    1876             : 
    1877             :     /* Initialize local tuplesort coordination state */
    1878         296 :     coordinate = palloc0(sizeof(SortCoordinateData));
    1879         296 :     coordinate->isWorker = true;
    1880         296 :     coordinate->nParticipants = -1;
    1881         296 :     coordinate->sharedsort = sharedsort;
    1882             : 
    1883             :     /* Begin "partial" tuplesort */
    1884         592 :     btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
    1885             :                                                      btspool->index,
    1886         296 :                                                      btspool->isunique,
    1887         296 :                                                      btspool->nulls_not_distinct,
    1888             :                                                      sortmem, coordinate,
    1889             :                                                      TUPLESORT_NONE);
    1890             : 
    1891             :     /*
    1892             :      * Just as with serial case, there may be a second spool.  If so, a
    1893             :      * second, dedicated spool2 partial tuplesort is required.
    1894             :      */
    1895         296 :     if (btspool2)
    1896             :     {
    1897             :         SortCoordinate coordinate2;
    1898             : 
    1899             :         /*
    1900             :          * We expect that the second one (for dead tuples) won't get very
    1901             :          * full, so we give it only work_mem (unless sortmem is less for
    1902             :          * worker).  Worker processes are generally permitted to allocate
    1903             :          * work_mem independently.
    1904             :          */
    1905         128 :         coordinate2 = palloc0(sizeof(SortCoordinateData));
    1906         128 :         coordinate2->isWorker = true;
    1907         128 :         coordinate2->nParticipants = -1;
    1908         128 :         coordinate2->sharedsort = sharedsort2;
    1909         128 :         btspool2->sortstate =
    1910         128 :             tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
    1911             :                                         Min(sortmem, work_mem), coordinate2,
    1912             :                                         false);
    1913             :     }
    1914             : 
    1915             :     /* Fill in buildstate for _bt_build_callback() */
    1916         296 :     buildstate.isunique = btshared->isunique;
    1917         296 :     buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
    1918         296 :     buildstate.havedead = false;
    1919         296 :     buildstate.heap = btspool->heap;
    1920         296 :     buildstate.spool = btspool;
    1921         296 :     buildstate.spool2 = btspool2;
    1922         296 :     buildstate.indtuples = 0;
    1923         296 :     buildstate.btleader = NULL;
    1924             : 
    1925             :     /* Join parallel scan */
    1926         296 :     indexInfo = BuildIndexInfo(btspool->index);
    1927         296 :     indexInfo->ii_Concurrent = btshared->isconcurrent;
    1928         296 :     scan = table_beginscan_parallel(btspool->heap,
    1929             :                                     ParallelTableScanFromBTShared(btshared));
    1930         296 :     reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
    1931             :                                        true, progress, _bt_build_callback,
    1932             :                                        &buildstate, scan);
    1933             : 
    1934             :     /* Execute this worker's part of the sort */
    1935         296 :     if (progress)
    1936         148 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1937             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_1);
    1938         296 :     tuplesort_performsort(btspool->sortstate);
    1939         296 :     if (btspool2)
    1940             :     {
    1941         128 :         if (progress)
    1942          64 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1943             :                                          PROGRESS_BTREE_PHASE_PERFORMSORT_2);
    1944         128 :         tuplesort_performsort(btspool2->sortstate);
    1945             :     }
    1946             : 
    1947             :     /*
    1948             :      * Done.  Record ambuild statistics, and whether we encountered a broken
    1949             :      * HOT chain.
    1950             :      */
    1951         296 :     SpinLockAcquire(&btshared->mutex);
    1952         296 :     btshared->nparticipantsdone++;
    1953         296 :     btshared->reltuples += reltuples;
    1954         296 :     if (buildstate.havedead)
    1955           0 :         btshared->havedead = true;
    1956         296 :     btshared->indtuples += buildstate.indtuples;
    1957         296 :     if (indexInfo->ii_BrokenHotChain)
    1958           0 :         btshared->brokenhotchain = true;
    1959         296 :     SpinLockRelease(&btshared->mutex);
    1960             : 
    1961             :     /* Notify leader */
    1962         296 :     ConditionVariableSignal(&btshared->workersdonecv);
    1963             : 
    1964             :     /* We can end tuplesorts immediately */
    1965         296 :     tuplesort_end(btspool->sortstate);
    1966         296 :     if (btspool2)
    1967         128 :         tuplesort_end(btspool2->sortstate);
    1968         296 : }

Generated by: LCOV version 1.14