LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsort.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 536 553 96.9 %
Date: 2025-01-18 04:15:08 Functions: 22 22 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsort.c
       4             :  *      Build a btree from sorted input by loading leaf pages sequentially.
       5             :  *
       6             :  * NOTES
       7             :  *
       8             :  * We use tuplesort.c to sort the given index tuples into order.
       9             :  * Then we scan the index tuples in order and build the btree pages
      10             :  * for each level.  We load source tuples into leaf-level pages.
      11             :  * Whenever we fill a page at one level, we add a link to it to its
      12             :  * parent level (starting a new parent level if necessary).  When
      13             :  * done, we write out each final page on each level, adding it to
      14             :  * its parent level.  When we have only one page on a level, it must be
      15             :  * the root -- it can be attached to the btree metapage and we are done.
      16             :  *
      17             :  * It is not wise to pack the pages entirely full, since then *any*
      18             :  * insertion would cause a split (and not only of the leaf page; the need
      19             :  * for a split would cascade right up the tree).  The steady-state load
      20             :  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
      21             :  * pages to the user-controllable fill factor (default 90%) while upper pages
      22             :  * are always packed to 70%.  This gives us reasonable density (there aren't
      23             :  * many upper pages if the keys are reasonable-size) without risking a lot of
      24             :  * cascading splits during early insertions.
      25             :  *
      26             :  * We use the bulk smgr loading facility to bypass the buffer cache and
      27             :  * WAL-log the pages efficiently.
      28             :  *
      29             :  * This code isn't concerned about the FSM at all. The caller is responsible
      30             :  * for initializing that.
      31             :  *
      32             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      33             :  * Portions Copyright (c) 1994, Regents of the University of California
      34             :  *
      35             :  * IDENTIFICATION
      36             :  *    src/backend/access/nbtree/nbtsort.c
      37             :  *
      38             :  *-------------------------------------------------------------------------
      39             :  */
      40             : 
      41             : #include "postgres.h"
      42             : 
      43             : #include "access/nbtree.h"
      44             : #include "access/parallel.h"
      45             : #include "access/relscan.h"
      46             : #include "access/table.h"
      47             : #include "access/xact.h"
      48             : #include "catalog/index.h"
      49             : #include "commands/progress.h"
      50             : #include "executor/instrument.h"
      51             : #include "miscadmin.h"
      52             : #include "pgstat.h"
      53             : #include "storage/bulk_write.h"
      54             : #include "tcop/tcopprot.h"
      55             : #include "utils/rel.h"
      56             : #include "utils/sortsupport.h"
      57             : #include "utils/tuplesort.h"
      58             : 
      59             : 
      60             : /* Magic numbers for parallel state sharing */
      61             : #define PARALLEL_KEY_BTREE_SHARED       UINT64CONST(0xA000000000000001)
      62             : #define PARALLEL_KEY_TUPLESORT          UINT64CONST(0xA000000000000002)
      63             : #define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
      64             : #define PARALLEL_KEY_QUERY_TEXT         UINT64CONST(0xA000000000000004)
      65             : #define PARALLEL_KEY_WAL_USAGE          UINT64CONST(0xA000000000000005)
      66             : #define PARALLEL_KEY_BUFFER_USAGE       UINT64CONST(0xA000000000000006)
      67             : 
      68             : /*
      69             :  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
      70             :  * parallel index builds.  This may be useful as a debugging aid.
      71             : #undef DISABLE_LEADER_PARTICIPATION
      72             :  */
      73             : 
      74             : /*
      75             :  * Status record for spooling/sorting phase.  (Note we may have two of
      76             :  * these due to the special requirements for uniqueness-checking with
      77             :  * dead tuples.)
      78             :  */
      79             : typedef struct BTSpool
      80             : {
      81             :     Tuplesortstate *sortstate;  /* state data for tuplesort.c */
      82             :     Relation    heap;
      83             :     Relation    index;
      84             :     bool        isunique;
      85             :     bool        nulls_not_distinct;
      86             : } BTSpool;
      87             : 
      88             : /*
      89             :  * Status for index builds performed in parallel.  This is allocated in a
      90             :  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
      91             :  * entry, private to tuplesort.c but allocated by this module on its behalf.
      92             :  */
      93             : typedef struct BTShared
      94             : {
      95             :     /*
      96             :      * These fields are not modified during the sort.  They primarily exist
      97             :      * for the benefit of worker processes that need to create BTSpool state
      98             :      * corresponding to that used by the leader.
      99             :      */
     100             :     Oid         heaprelid;
     101             :     Oid         indexrelid;
     102             :     bool        isunique;
     103             :     bool        nulls_not_distinct;
     104             :     bool        isconcurrent;
     105             :     int         scantuplesortstates;
     106             : 
     107             :     /* Query ID, for report in worker processes */
     108             :     uint64      queryid;
     109             : 
     110             :     /*
     111             :      * workersdonecv is used to monitor the progress of workers.  All parallel
     112             :      * participants must indicate that they are done before leader can use
     113             :      * mutable state that workers maintain during scan (and before leader can
     114             :      * proceed to tuplesort_performsort()).
     115             :      */
     116             :     ConditionVariable workersdonecv;
     117             : 
     118             :     /*
     119             :      * mutex protects all fields before heapdesc.
     120             :      *
     121             :      * These fields contain status information of interest to B-Tree index
     122             :      * builds that must work just the same when an index is built in parallel.
     123             :      */
     124             :     slock_t     mutex;
     125             : 
     126             :     /*
     127             :      * Mutable state that is maintained by workers, and reported back to
     128             :      * leader at end of parallel scan.
     129             :      *
     130             :      * nparticipantsdone is number of worker processes finished.
     131             :      *
     132             :      * reltuples is the total number of input heap tuples.
     133             :      *
     134             :      * havedead indicates if RECENTLY_DEAD tuples were encountered during
     135             :      * build.
     136             :      *
     137             :      * indtuples is the total number of tuples that made it into the index.
     138             :      *
     139             :      * brokenhotchain indicates if any worker detected a broken HOT chain
     140             :      * during build.
     141             :      */
     142             :     int         nparticipantsdone;
     143             :     double      reltuples;
     144             :     bool        havedead;
     145             :     double      indtuples;
     146             :     bool        brokenhotchain;
     147             : 
     148             :     /*
     149             :      * ParallelTableScanDescData data follows. Can't directly embed here, as
     150             :      * implementations of the parallel table scan desc interface might need
     151             :      * stronger alignment.
     152             :      */
     153             : } BTShared;
     154             : 
     155             : /*
     156             :  * Return pointer to a BTShared's parallel table scan.
     157             :  *
     158             :  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
     159             :  * MAXALIGN.
     160             :  */
     161             : #define ParallelTableScanFromBTShared(shared) \
     162             :     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
     163             : 
     164             : /*
     165             :  * Status for leader in parallel index build.
     166             :  */
     167             : typedef struct BTLeader
     168             : {
     169             :     /* parallel context itself */
     170             :     ParallelContext *pcxt;
     171             : 
     172             :     /*
     173             :      * nparticipanttuplesorts is the exact number of worker processes
     174             :      * successfully launched, plus one leader process if it participates as a
     175             :      * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     176             :      * participating as a worker).
     177             :      */
     178             :     int         nparticipanttuplesorts;
     179             : 
     180             :     /*
     181             :      * Leader process convenience pointers to shared state (leader avoids TOC
     182             :      * lookups).
     183             :      *
     184             :      * btshared is the shared state for entire build.  sharedsort is the
     185             :      * shared, tuplesort-managed state passed to each process tuplesort.
     186             :      * sharedsort2 is the corresponding btspool2 shared state, used only when
     187             :      * building unique indexes.  snapshot is the snapshot used by the scan iff
     188             :      * an MVCC snapshot is required.
     189             :      */
     190             :     BTShared   *btshared;
     191             :     Sharedsort *sharedsort;
     192             :     Sharedsort *sharedsort2;
     193             :     Snapshot    snapshot;
     194             :     WalUsage   *walusage;
     195             :     BufferUsage *bufferusage;
     196             : } BTLeader;
     197             : 
     198             : /*
     199             :  * Working state for btbuild and its callback.
     200             :  *
     201             :  * When parallel CREATE INDEX is used, there is a BTBuildState for each
     202             :  * participant.
     203             :  */
     204             : typedef struct BTBuildState
     205             : {
     206             :     bool        isunique;
     207             :     bool        nulls_not_distinct;
     208             :     bool        havedead;
     209             :     Relation    heap;
     210             :     BTSpool    *spool;
     211             : 
     212             :     /*
     213             :      * spool2 is needed only when the index is a unique index. Dead tuples are
     214             :      * put into spool2 instead of spool in order to avoid uniqueness check.
     215             :      */
     216             :     BTSpool    *spool2;
     217             :     double      indtuples;
     218             : 
     219             :     /*
     220             :      * btleader is only present when a parallel index build is performed, and
     221             :      * only in the leader process. (Actually, only the leader has a
     222             :      * BTBuildState.  Workers have their own spool and spool2, though.)
     223             :      */
     224             :     BTLeader   *btleader;
     225             : } BTBuildState;
     226             : 
     227             : /*
     228             :  * Status record for a btree page being built.  We have one of these
     229             :  * for each active tree level.
     230             :  */
     231             : typedef struct BTPageState
     232             : {
     233             :     BulkWriteBuffer btps_buf;   /* workspace for page building */
     234             :     BlockNumber btps_blkno;     /* block # to write this page at */
     235             :     IndexTuple  btps_lowkey;    /* page's strict lower bound pivot tuple */
     236             :     OffsetNumber btps_lastoff;  /* last item offset loaded */
     237             :     Size        btps_lastextra; /* last item's extra posting list space */
     238             :     uint32      btps_level;     /* tree level (0 = leaf) */
     239             :     Size        btps_full;      /* "full" if less than this much free space */
     240             :     struct BTPageState *btps_next;  /* link to parent level, if any */
     241             : } BTPageState;
     242             : 
     243             : /*
     244             :  * Overall status record for index writing phase.
     245             :  */
     246             : typedef struct BTWriteState
     247             : {
     248             :     Relation    heap;
     249             :     Relation    index;
     250             :     BulkWriteState *bulkstate;
     251             :     BTScanInsert inskey;        /* generic insertion scankey */
     252             :     BlockNumber btws_pages_alloced; /* # pages allocated */
     253             : } BTWriteState;
     254             : 
     255             : 
     256             : static double _bt_spools_heapscan(Relation heap, Relation index,
     257             :                                   BTBuildState *buildstate, IndexInfo *indexInfo);
     258             : static void _bt_spooldestroy(BTSpool *btspool);
     259             : static void _bt_spool(BTSpool *btspool, ItemPointer self,
     260             :                       Datum *values, bool *isnull);
     261             : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
     262             : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
     263             :                                bool *isnull, bool tupleIsAlive, void *state);
     264             : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
     265             : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
     266             : static void _bt_slideleft(Page rightmostpage);
     267             : static void _bt_sortaddtup(Page page, Size itemsize,
     268             :                            IndexTuple itup, OffsetNumber itup_off,
     269             :                            bool newfirstdataitem);
     270             : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
     271             :                          IndexTuple itup, Size truncextra);
     272             : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
     273             :                                           BTPageState *state,
     274             :                                           BTDedupState dstate);
     275             : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
     276             : static void _bt_load(BTWriteState *wstate,
     277             :                      BTSpool *btspool, BTSpool *btspool2);
     278             : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
     279             :                                int request);
     280             : static void _bt_end_parallel(BTLeader *btleader);
     281             : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
     282             : static double _bt_parallel_heapscan(BTBuildState *buildstate,
     283             :                                     bool *brokenhotchain);
     284             : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
     285             : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
     286             :                                        BTShared *btshared, Sharedsort *sharedsort,
     287             :                                        Sharedsort *sharedsort2, int sortmem,
     288             :                                        bool progress);
     289             : 
     290             : 
     291             : /*
     292             :  *  btbuild() -- build a new btree index.
     293             :  */
     294             : IndexBuildResult *
     295       47764 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     296             : {
     297             :     IndexBuildResult *result;
     298             :     BTBuildState buildstate;
     299             :     double      reltuples;
     300             : 
     301             : #ifdef BTREE_BUILD_STATS
     302             :     if (log_btree_build_stats)
     303             :         ResetUsage();
     304             : #endif                          /* BTREE_BUILD_STATS */
     305             : 
     306       47764 :     buildstate.isunique = indexInfo->ii_Unique;
     307       47764 :     buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     308       47764 :     buildstate.havedead = false;
     309       47764 :     buildstate.heap = heap;
     310       47764 :     buildstate.spool = NULL;
     311       47764 :     buildstate.spool2 = NULL;
     312       47764 :     buildstate.indtuples = 0;
     313       47764 :     buildstate.btleader = NULL;
     314             : 
     315             :     /*
     316             :      * We expect to be called exactly once for any index relation. If that's
     317             :      * not the case, big trouble's what we have.
     318             :      */
     319       47764 :     if (RelationGetNumberOfBlocks(index) != 0)
     320           0 :         elog(ERROR, "index \"%s\" already contains data",
     321             :              RelationGetRelationName(index));
     322             : 
     323       47764 :     reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
     324             : 
     325             :     /*
     326             :      * Finish the build by (1) completing the sort of the spool file, (2)
     327             :      * inserting the sorted tuples into btree pages and (3) building the upper
     328             :      * levels.  Finally, it may also be necessary to end use of parallelism.
     329             :      */
     330       47752 :     _bt_leafbuild(buildstate.spool, buildstate.spool2);
     331       47668 :     _bt_spooldestroy(buildstate.spool);
     332       47668 :     if (buildstate.spool2)
     333          20 :         _bt_spooldestroy(buildstate.spool2);
     334       47668 :     if (buildstate.btleader)
     335         148 :         _bt_end_parallel(buildstate.btleader);
     336             : 
     337       47668 :     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     338             : 
     339       47668 :     result->heap_tuples = reltuples;
     340       47668 :     result->index_tuples = buildstate.indtuples;
     341             : 
     342             : #ifdef BTREE_BUILD_STATS
     343             :     if (log_btree_build_stats)
     344             :     {
     345             :         ShowUsage("BTREE BUILD STATS");
     346             :         ResetUsage();
     347             :     }
     348             : #endif                          /* BTREE_BUILD_STATS */
     349             : 
     350       47668 :     return result;
     351             : }
     352             : 
     353             : /*
     354             :  * Create and initialize one or two spool structures, and save them in caller's
     355             :  * buildstate argument.  May also fill-in fields within indexInfo used by index
     356             :  * builds.
     357             :  *
     358             :  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
     359             :  * routine encapsulates all aspects of managing parallelism.  Caller need only
     360             :  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
     361             :  *
     362             :  * Returns the total number of heap tuples scanned.
     363             :  */
     364             : static double
     365       47764 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
     366             :                     IndexInfo *indexInfo)
     367             : {
     368       47764 :     BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
     369       47764 :     SortCoordinate coordinate = NULL;
     370       47764 :     double      reltuples = 0;
     371             : 
     372             :     /*
     373             :      * We size the sort area as maintenance_work_mem rather than work_mem to
     374             :      * speed index creation.  This should be OK since a single backend can't
     375             :      * run multiple index creations in parallel (see also: notes on
     376             :      * parallelism and maintenance_work_mem below).
     377             :      */
     378       47764 :     btspool->heap = heap;
     379       47764 :     btspool->index = index;
     380       47764 :     btspool->isunique = indexInfo->ii_Unique;
     381       47764 :     btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     382             : 
     383             :     /* Save as primary spool */
     384       47764 :     buildstate->spool = btspool;
     385             : 
     386             :     /* Report table scan phase started */
     387       47764 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     388             :                                  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
     389             : 
     390             :     /* Attempt to launch parallel worker scan when required */
     391       47764 :     if (indexInfo->ii_ParallelWorkers > 0)
     392         148 :         _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
     393             :                            indexInfo->ii_ParallelWorkers);
     394             : 
     395             :     /*
     396             :      * If parallel build requested and at least one worker process was
     397             :      * successfully launched, set up coordination state
     398             :      */
     399       47764 :     if (buildstate->btleader)
     400             :     {
     401         148 :         coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     402         148 :         coordinate->isWorker = false;
     403         148 :         coordinate->nParticipants =
     404         148 :             buildstate->btleader->nparticipanttuplesorts;
     405         148 :         coordinate->sharedsort = buildstate->btleader->sharedsort;
     406             :     }
     407             : 
     408             :     /*
     409             :      * Begin serial/leader tuplesort.
     410             :      *
     411             :      * In cases where parallelism is involved, the leader receives the same
     412             :      * share of maintenance_work_mem as a serial sort (it is generally treated
     413             :      * in the same way as a serial sort once we return).  Parallel worker
     414             :      * Tuplesortstates will have received only a fraction of
     415             :      * maintenance_work_mem, though.
     416             :      *
     417             :      * We rely on the lifetime of the Leader Tuplesortstate almost not
     418             :      * overlapping with any worker Tuplesortstate's lifetime.  There may be
     419             :      * some small overlap, but that's okay because we rely on leader
     420             :      * Tuplesortstate only allocating a small, fixed amount of memory here.
     421             :      * When its tuplesort_performsort() is called (by our caller), and
     422             :      * significant amounts of memory are likely to be used, all workers must
     423             :      * have already freed almost all memory held by their Tuplesortstates
     424             :      * (they are about to go away completely, too).  The overall effect is
     425             :      * that maintenance_work_mem always represents an absolute high watermark
     426             :      * on the amount of memory used by a CREATE INDEX operation, regardless of
     427             :      * the use of parallelism or any other factor.
     428             :      */
     429       95528 :     buildstate->spool->sortstate =
     430       47764 :         tuplesort_begin_index_btree(heap, index, buildstate->isunique,
     431       47764 :                                     buildstate->nulls_not_distinct,
     432             :                                     maintenance_work_mem, coordinate,
     433             :                                     TUPLESORT_NONE);
     434             : 
     435             :     /*
     436             :      * If building a unique index, put dead tuples in a second spool to keep
     437             :      * them out of the uniqueness check.  We expect that the second spool (for
     438             :      * dead tuples) won't get very full, so we give it only work_mem.
     439             :      */
     440       47764 :     if (indexInfo->ii_Unique)
     441             :     {
     442       38916 :         BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
     443       38916 :         SortCoordinate coordinate2 = NULL;
     444             : 
     445             :         /* Initialize secondary spool */
     446       38916 :         btspool2->heap = heap;
     447       38916 :         btspool2->index = index;
     448       38916 :         btspool2->isunique = false;
     449             :         /* Save as secondary spool */
     450       38916 :         buildstate->spool2 = btspool2;
     451             : 
     452       38916 :         if (buildstate->btleader)
     453             :         {
     454             :             /*
     455             :              * Set up non-private state that is passed to
     456             :              * tuplesort_begin_index_btree() about the basic high level
     457             :              * coordination of a parallel sort.
     458             :              */
     459          64 :             coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     460          64 :             coordinate2->isWorker = false;
     461          64 :             coordinate2->nParticipants =
     462          64 :                 buildstate->btleader->nparticipanttuplesorts;
     463          64 :             coordinate2->sharedsort = buildstate->btleader->sharedsort2;
     464             :         }
     465             : 
     466             :         /*
     467             :          * We expect that the second one (for dead tuples) won't get very
     468             :          * full, so we give it only work_mem
     469             :          */
     470       38916 :         buildstate->spool2->sortstate =
     471       38916 :             tuplesort_begin_index_btree(heap, index, false, false, work_mem,
     472             :                                         coordinate2, TUPLESORT_NONE);
     473             :     }
     474             : 
     475             :     /* Fill spool using either serial or parallel heap scan */
     476       47764 :     if (!buildstate->btleader)
     477       47616 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     478             :                                            _bt_build_callback, buildstate,
     479             :                                            NULL);
     480             :     else
     481         148 :         reltuples = _bt_parallel_heapscan(buildstate,
     482             :                                           &indexInfo->ii_BrokenHotChain);
     483             : 
     484             :     /*
     485             :      * Set the progress target for the next phase.  Reset the block number
     486             :      * values set by table_index_build_scan
     487             :      */
     488             :     {
     489       47752 :         const int   progress_index[] = {
     490             :             PROGRESS_CREATEIDX_TUPLES_TOTAL,
     491             :             PROGRESS_SCAN_BLOCKS_TOTAL,
     492             :             PROGRESS_SCAN_BLOCKS_DONE
     493             :         };
     494       47752 :         const int64 progress_vals[] = {
     495       47752 :             buildstate->indtuples,
     496             :             0, 0
     497             :         };
     498             : 
     499       47752 :         pgstat_progress_update_multi_param(3, progress_index, progress_vals);
     500             :     }
     501             : 
     502             :     /* okay, all heap tuples are spooled */
     503       47752 :     if (buildstate->spool2 && !buildstate->havedead)
     504             :     {
     505             :         /* spool2 turns out to be unnecessary */
     506       38896 :         _bt_spooldestroy(buildstate->spool2);
     507       38896 :         buildstate->spool2 = NULL;
     508             :     }
     509             : 
     510       47752 :     return reltuples;
     511             : }
     512             : 
     513             : /*
     514             :  * clean up a spool structure and its substructures.
     515             :  */
     516             : static void
     517       86584 : _bt_spooldestroy(BTSpool *btspool)
     518             : {
     519       86584 :     tuplesort_end(btspool->sortstate);
     520       86584 :     pfree(btspool);
     521       86584 : }
     522             : 
     523             : /*
     524             :  * spool an index entry into the sort file.
     525             :  */
     526             : static void
     527    12495744 : _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
     528             : {
     529    12495744 :     tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
     530             :                                   self, values, isnull);
     531    12495744 : }
     532             : 
     533             : /*
     534             :  * given a spool loaded by successive calls to _bt_spool,
     535             :  * create an entire btree.
     536             :  */
     537             : static void
     538       47752 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     539             : {
     540             :     BTWriteState wstate;
     541             : 
     542             : #ifdef BTREE_BUILD_STATS
     543             :     if (log_btree_build_stats)
     544             :     {
     545             :         ShowUsage("BTREE BUILD (Spool) STATISTICS");
     546             :         ResetUsage();
     547             :     }
     548             : #endif                          /* BTREE_BUILD_STATS */
     549             : 
     550             :     /* Execute the sort */
     551       47752 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     552             :                                  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
     553       47752 :     tuplesort_performsort(btspool->sortstate);
     554       47668 :     if (btspool2)
     555             :     {
     556          20 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     557             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_2);
     558          20 :         tuplesort_performsort(btspool2->sortstate);
     559             :     }
     560             : 
     561       47668 :     wstate.heap = btspool->heap;
     562       47668 :     wstate.index = btspool->index;
     563       47668 :     wstate.inskey = _bt_mkscankey(wstate.index, NULL);
     564             :     /* _bt_mkscankey() won't set allequalimage without metapage */
     565       47668 :     wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
     566             : 
     567             :     /* reserve the metapage */
     568       47668 :     wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
     569             : 
     570       47668 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     571             :                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
     572       47668 :     _bt_load(&wstate, btspool, btspool2);
     573       47668 : }
     574             : 
     575             : /*
     576             :  * Per-tuple callback for table_index_build_scan
     577             :  */
     578             : static void
     579    12495744 : _bt_build_callback(Relation index,
     580             :                    ItemPointer tid,
     581             :                    Datum *values,
     582             :                    bool *isnull,
     583             :                    bool tupleIsAlive,
     584             :                    void *state)
     585             : {
     586    12495744 :     BTBuildState *buildstate = (BTBuildState *) state;
     587             : 
     588             :     /*
     589             :      * insert the index tuple into the appropriate spool file for subsequent
     590             :      * processing
     591             :      */
     592    12495744 :     if (tupleIsAlive || buildstate->spool2 == NULL)
     593    12495254 :         _bt_spool(buildstate->spool, tid, values, isnull);
     594             :     else
     595             :     {
     596             :         /* dead tuples are put into spool2 */
     597         490 :         buildstate->havedead = true;
     598         490 :         _bt_spool(buildstate->spool2, tid, values, isnull);
     599             :     }
     600             : 
     601    12495744 :     buildstate->indtuples += 1;
     602    12495744 : }
     603             : 
     604             : /*
     605             :  * allocate workspace for a new, clean btree page, not linked to any siblings.
     606             :  */
     607             : static BulkWriteBuffer
     608       51174 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
     609             : {
     610             :     BulkWriteBuffer buf;
     611             :     Page        page;
     612             :     BTPageOpaque opaque;
     613             : 
     614       51174 :     buf = smgr_bulk_get_buf(wstate->bulkstate);
     615       51174 :     page = (Page) buf;
     616             : 
     617             :     /* Zero the page and set up standard page header info */
     618       51174 :     _bt_pageinit(page, BLCKSZ);
     619             : 
     620             :     /* Initialize BT opaque state */
     621       51174 :     opaque = BTPageGetOpaque(page);
     622       51174 :     opaque->btpo_prev = opaque->btpo_next = P_NONE;
     623       51174 :     opaque->btpo_level = level;
     624       51174 :     opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     625       51174 :     opaque->btpo_cycleid = 0;
     626             : 
     627             :     /* Make the P_HIKEY line pointer appear allocated */
     628       51174 :     ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
     629             : 
     630       51174 :     return buf;
     631             : }
     632             : 
     633             : /*
     634             :  * emit a completed btree page, and release the working storage.
     635             :  */
     636             : static void
     637       98842 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
     638             : {
     639       98842 :     smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
     640             :     /* smgr_bulk_write took ownership of 'buf' */
     641       98842 : }
     642             : 
     643             : /*
     644             :  * allocate and initialize a new BTPageState.  the returned structure
     645             :  * is suitable for immediate use by _bt_buildadd.
     646             :  */
     647             : static BTPageState *
     648       12030 : _bt_pagestate(BTWriteState *wstate, uint32 level)
     649             : {
     650       12030 :     BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
     651             : 
     652             :     /* create initial page for level */
     653       12030 :     state->btps_buf = _bt_blnewpage(wstate, level);
     654             : 
     655             :     /* and assign it a page position */
     656       12030 :     state->btps_blkno = wstate->btws_pages_alloced++;
     657             : 
     658       12030 :     state->btps_lowkey = NULL;
     659             :     /* initialize lastoff so first item goes into P_FIRSTKEY */
     660       12030 :     state->btps_lastoff = P_HIKEY;
     661       12030 :     state->btps_lastextra = 0;
     662       12030 :     state->btps_level = level;
     663             :     /* set "full" threshold based on level.  See notes at head of file. */
     664       12030 :     if (level > 0)
     665        2528 :         state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
     666             :     else
     667        9502 :         state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
     668             : 
     669             :     /* no parent level, yet */
     670       12030 :     state->btps_next = NULL;
     671             : 
     672       12030 :     return state;
     673             : }
     674             : 
     675             : /*
     676             :  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
     677             :  * P_HIKEY, overwriting P_HIKEY).
     678             :  *
     679             :  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
     680             :  * rightmost page on its level is not supposed to get a high key.  Now that
     681             :  * it's clear that this page is a rightmost page, remove the unneeded empty
     682             :  * P_HIKEY line pointer space.
     683             :  */
     684             : static void
     685       12030 : _bt_slideleft(Page rightmostpage)
     686             : {
     687             :     OffsetNumber off;
     688             :     OffsetNumber maxoff;
     689             :     ItemId      previi;
     690             : 
     691       12030 :     maxoff = PageGetMaxOffsetNumber(rightmostpage);
     692             :     Assert(maxoff >= P_FIRSTKEY);
     693       12030 :     previi = PageGetItemId(rightmostpage, P_HIKEY);
     694      711012 :     for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
     695             :     {
     696      698982 :         ItemId      thisii = PageGetItemId(rightmostpage, off);
     697             : 
     698      698982 :         *previi = *thisii;
     699      698982 :         previi = thisii;
     700             :     }
     701       12030 :     ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
     702       12030 : }
     703             : 
     704             : /*
     705             :  * Add an item to a page being built.
     706             :  *
     707             :  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
     708             :  * raises an error directly.
     709             :  *
     710             :  * Note that our nbtsort.c caller does not know yet if the page will be
     711             :  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
     712             :  * caller.  Page that turns out to be the rightmost on its level is fixed by
     713             :  * calling _bt_slideleft().
     714             :  */
     715             : static void
     716    11223860 : _bt_sortaddtup(Page page,
     717             :                Size itemsize,
     718             :                IndexTuple itup,
     719             :                OffsetNumber itup_off,
     720             :                bool newfirstdataitem)
     721             : {
     722             :     IndexTupleData trunctuple;
     723             : 
     724    11223860 :     if (newfirstdataitem)
     725             :     {
     726        2652 :         trunctuple = *itup;
     727        2652 :         trunctuple.t_info = sizeof(IndexTupleData);
     728        2652 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
     729        2652 :         itup = &trunctuple;
     730        2652 :         itemsize = sizeof(IndexTupleData);
     731             :     }
     732             : 
     733    11223860 :     if (PageAddItem(page, (Item) itup, itemsize, itup_off,
     734             :                     false, false) == InvalidOffsetNumber)
     735           0 :         elog(ERROR, "failed to add item to the index page");
     736    11223860 : }
     737             : 
     738             : /*----------
     739             :  * Add an item to a disk page from the sort output (or add a posting list
     740             :  * item formed from the sort output).
     741             :  *
     742             :  * We must be careful to observe the page layout conventions of nbtsearch.c:
     743             :  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
     744             :  * - on non-leaf pages, the key portion of the first item need not be
     745             :  *   stored, we should store only the link.
     746             :  *
     747             :  * A leaf page being built looks like:
     748             :  *
     749             :  * +----------------+---------------------------------+
     750             :  * | PageHeaderData | linp0 linp1 linp2 ...           |
     751             :  * +-----------+----+---------------------------------+
     752             :  * | ... linpN |                                      |
     753             :  * +-----------+--------------------------------------+
     754             :  * |     ^ last                                       |
     755             :  * |                                                  |
     756             :  * +-------------+------------------------------------+
     757             :  * |             | itemN ...                          |
     758             :  * +-------------+------------------+-----------------+
     759             :  * |          ... item3 item2 item1 | "special space" |
     760             :  * +--------------------------------+-----------------+
     761             :  *
     762             :  * Contrast this with the diagram in bufpage.h; note the mismatch
     763             :  * between linps and items.  This is because we reserve linp0 as a
     764             :  * placeholder for the pointer to the "high key" item; when we have
     765             :  * filled up the page, we will set linp0 to point to itemN and clear
     766             :  * linpN.  On the other hand, if we find this is the last (rightmost)
     767             :  * page, we leave the items alone and slide the linp array over.  If
     768             :  * the high key is to be truncated, offset 1 is deleted, and we insert
     769             :  * the truncated high key at offset 1.
     770             :  *
     771             :  * 'last' pointer indicates the last offset added to the page.
     772             :  *
     773             :  * 'truncextra' is the size of the posting list in itup, if any.  This
     774             :  * information is stashed for the next call here, when we may benefit
     775             :  * from considering the impact of truncating away the posting list on
     776             :  * the page before deciding to finish the page off.  Posting lists are
     777             :  * often relatively large, so it is worth going to the trouble of
     778             :  * accounting for the saving from truncating away the posting list of
     779             :  * the tuple that becomes the high key (that may be the only way to
     780             :  * get close to target free space on the page).  Note that this is
     781             :  * only used for the soft fillfactor-wise limit, not the critical hard
     782             :  * limit.
     783             :  *----------
     784             :  */
     785             : static void
     786    11184716 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
     787             :              Size truncextra)
     788             : {
     789             :     BulkWriteBuffer nbuf;
     790             :     Page        npage;
     791             :     BlockNumber nblkno;
     792             :     OffsetNumber last_off;
     793             :     Size        last_truncextra;
     794             :     Size        pgspc;
     795             :     Size        itupsz;
     796             :     bool        isleaf;
     797             : 
     798             :     /*
     799             :      * This is a handy place to check for cancel interrupts during the btree
     800             :      * load phase of index creation.
     801             :      */
     802    11184716 :     CHECK_FOR_INTERRUPTS();
     803             : 
     804    11184716 :     nbuf = state->btps_buf;
     805    11184716 :     npage = (Page) nbuf;
     806    11184716 :     nblkno = state->btps_blkno;
     807    11184716 :     last_off = state->btps_lastoff;
     808    11184716 :     last_truncextra = state->btps_lastextra;
     809    11184716 :     state->btps_lastextra = truncextra;
     810             : 
     811    11184716 :     pgspc = PageGetFreeSpace(npage);
     812    11184716 :     itupsz = IndexTupleSize(itup);
     813    11184716 :     itupsz = MAXALIGN(itupsz);
     814             :     /* Leaf case has slightly different rules due to suffix truncation */
     815    11184716 :     isleaf = (state->btps_level == 0);
     816             : 
     817             :     /*
     818             :      * Check whether the new item can fit on a btree page on current level at
     819             :      * all.
     820             :      *
     821             :      * Every newly built index will treat heap TID as part of the keyspace,
     822             :      * which imposes the requirement that new high keys must occasionally have
     823             :      * a heap TID appended within _bt_truncate().  That may leave a new pivot
     824             :      * tuple one or two MAXALIGN() quantums larger than the original
     825             :      * firstright tuple it's derived from.  v4 deals with the problem by
     826             :      * decreasing the limit on the size of tuples inserted on the leaf level
     827             :      * by the same small amount.  Enforce the new v4+ limit on the leaf level,
     828             :      * and the old limit on internal levels, since pivot tuples may need to
     829             :      * make use of the reserved space.  This should never fail on internal
     830             :      * pages.
     831             :      */
     832    11184716 :     if (unlikely(itupsz > BTMaxItemSize(npage)))
     833         264 :         _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
     834             :                              itup);
     835             : 
     836             :     /*
     837             :      * Check to see if current page will fit new item, with space left over to
     838             :      * append a heap TID during suffix truncation when page is a leaf page.
     839             :      *
     840             :      * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
     841             :      * high key with heap TID when finishing off a leaf page, since we rely on
     842             :      * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
     843             :      * internal pages we can always fit 3 pivot tuples with larger internal
     844             :      * page tuple limit (includes page high key).
     845             :      *
     846             :      * Most of the time, a page is only "full" in the sense that the soft
     847             :      * fillfactor-wise limit has been exceeded.  However, we must always leave
     848             :      * at least two items plus a high key on each page before starting a new
     849             :      * page.  Disregard fillfactor and insert on "full" current page if we
     850             :      * don't have the minimum number of items yet.  (Note that we deliberately
     851             :      * assume that suffix truncation neither enlarges nor shrinks new high key
     852             :      * when applying soft limit, except when last tuple has a posting list.)
     853             :      */
     854             :     Assert(last_truncextra == 0 || isleaf);
     855    11184716 :     if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
     856    11183562 :         (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
     857             :     {
     858             :         /*
     859             :          * Finish off the page and write it out.
     860             :          */
     861       39144 :         BulkWriteBuffer obuf = nbuf;
     862       39144 :         Page        opage = npage;
     863       39144 :         BlockNumber oblkno = nblkno;
     864             :         ItemId      ii;
     865             :         ItemId      hii;
     866             :         IndexTuple  oitup;
     867             : 
     868             :         /* Create new page of same level */
     869       39144 :         nbuf = _bt_blnewpage(wstate, state->btps_level);
     870       39144 :         npage = (Page) nbuf;
     871             : 
     872             :         /* and assign it a page position */
     873       39144 :         nblkno = wstate->btws_pages_alloced++;
     874             : 
     875             :         /*
     876             :          * We copy the last item on the page into the new page, and then
     877             :          * rearrange the old page so that the 'last item' becomes its high key
     878             :          * rather than a true data item.  There had better be at least two
     879             :          * items on the page already, else the page would be empty of useful
     880             :          * data.
     881             :          */
     882             :         Assert(last_off > P_FIRSTKEY);
     883       39144 :         ii = PageGetItemId(opage, last_off);
     884       39144 :         oitup = (IndexTuple) PageGetItem(opage, ii);
     885       39144 :         _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
     886       39144 :                        !isleaf);
     887             : 
     888             :         /*
     889             :          * Move 'last' into the high key position on opage.  _bt_blnewpage()
     890             :          * allocated empty space for a line pointer when opage was first
     891             :          * created, so this is a matter of rearranging already-allocated space
     892             :          * on page, and initializing high key line pointer. (Actually, leaf
     893             :          * pages must also swap oitup with a truncated version of oitup, which
     894             :          * is sometimes larger than oitup, though never by more than the space
     895             :          * needed to append a heap TID.)
     896             :          */
     897       39144 :         hii = PageGetItemId(opage, P_HIKEY);
     898       39144 :         *hii = *ii;
     899       39144 :         ItemIdSetUnused(ii);    /* redundant */
     900       39144 :         ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
     901             : 
     902       39144 :         if (isleaf)
     903             :         {
     904             :             IndexTuple  lastleft;
     905             :             IndexTuple  truncated;
     906             : 
     907             :             /*
     908             :              * Truncate away any unneeded attributes from high key on leaf
     909             :              * level.  This is only done at the leaf level because downlinks
     910             :              * in internal pages are either negative infinity items, or get
     911             :              * their contents from copying from one level down.  See also:
     912             :              * _bt_split().
     913             :              *
     914             :              * We don't try to bias our choice of split point to make it more
     915             :              * likely that _bt_truncate() can truncate away more attributes,
     916             :              * whereas the split point used within _bt_split() is chosen much
     917             :              * more delicately.  Even still, the lastleft and firstright
     918             :              * tuples passed to _bt_truncate() here are at least not fully
     919             :              * equal to each other when deduplication is used, unless there is
     920             :              * a large group of duplicates (also, unique index builds usually
     921             :              * have few or no spool2 duplicates).  When the split point is
     922             :              * between two unequal tuples, _bt_truncate() will avoid including
     923             :              * a heap TID in the new high key, which is the most important
     924             :              * benefit of suffix truncation.
     925             :              *
     926             :              * Overwrite the old item with new truncated high key directly.
     927             :              * oitup is already located at the physical beginning of tuple
     928             :              * space, so this should directly reuse the existing tuple space.
     929             :              */
     930       39020 :             ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
     931       39020 :             lastleft = (IndexTuple) PageGetItem(opage, ii);
     932             : 
     933             :             Assert(IndexTupleSize(oitup) > last_truncextra);
     934       39020 :             truncated = _bt_truncate(wstate->index, lastleft, oitup,
     935             :                                      wstate->inskey);
     936       39020 :             if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
     937       39020 :                                          IndexTupleSize(truncated)))
     938           0 :                 elog(ERROR, "failed to add high key to the index page");
     939       39020 :             pfree(truncated);
     940             : 
     941             :             /* oitup should continue to point to the page's high key */
     942       39020 :             hii = PageGetItemId(opage, P_HIKEY);
     943       39020 :             oitup = (IndexTuple) PageGetItem(opage, hii);
     944             :         }
     945             : 
     946             :         /*
     947             :          * Link the old page into its parent, using its low key.  If we don't
     948             :          * have a parent, we have to create one; this adds a new btree level.
     949             :          */
     950       39144 :         if (state->btps_next == NULL)
     951        2528 :             state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
     952             : 
     953             :         Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
     954             :                 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
     955             :                 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
     956             :                P_LEFTMOST(BTPageGetOpaque(opage)));
     957             :         Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
     958             :                !P_LEFTMOST(BTPageGetOpaque(opage)));
     959       39144 :         BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
     960       39144 :         _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
     961       39144 :         pfree(state->btps_lowkey);
     962             : 
     963             :         /*
     964             :          * Save a copy of the high key from the old page.  It is also the low
     965             :          * key for the new page.
     966             :          */
     967       39144 :         state->btps_lowkey = CopyIndexTuple(oitup);
     968             : 
     969             :         /*
     970             :          * Set the sibling links for both pages.
     971             :          */
     972             :         {
     973       39144 :             BTPageOpaque oopaque = BTPageGetOpaque(opage);
     974       39144 :             BTPageOpaque nopaque = BTPageGetOpaque(npage);
     975             : 
     976       39144 :             oopaque->btpo_next = nblkno;
     977       39144 :             nopaque->btpo_prev = oblkno;
     978       39144 :             nopaque->btpo_next = P_NONE; /* redundant */
     979             :         }
     980             : 
     981             :         /*
     982             :          * Write out the old page. _bt_blwritepage takes ownership of the
     983             :          * 'opage' buffer.
     984             :          */
     985       39144 :         _bt_blwritepage(wstate, obuf, oblkno);
     986             : 
     987             :         /*
     988             :          * Reset last_off to point to new page
     989             :          */
     990       39144 :         last_off = P_FIRSTKEY;
     991             :     }
     992             : 
     993             :     /*
     994             :      * By here, either original page is still the current page, or a new page
     995             :      * was created that became the current page.  Either way, the current page
     996             :      * definitely has space for new item.
     997             :      *
     998             :      * If the new item is the first for its page, it must also be the first
     999             :      * item on its entire level.  On later same-level pages, a low key for a
    1000             :      * page will be copied from the prior page in the code above.  Generate a
    1001             :      * minus infinity low key here instead.
    1002             :      */
    1003    11184716 :     if (last_off == P_HIKEY)
    1004             :     {
    1005             :         Assert(state->btps_lowkey == NULL);
    1006       12030 :         state->btps_lowkey = palloc0(sizeof(IndexTupleData));
    1007       12030 :         state->btps_lowkey->t_info = sizeof(IndexTupleData);
    1008       12030 :         BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
    1009             :     }
    1010             : 
    1011             :     /*
    1012             :      * Add the new item into the current page.
    1013             :      */
    1014    11184716 :     last_off = OffsetNumberNext(last_off);
    1015    11184716 :     _bt_sortaddtup(npage, itupsz, itup, last_off,
    1016    11184716 :                    !isleaf && last_off == P_FIRSTKEY);
    1017             : 
    1018    11184716 :     state->btps_buf = nbuf;
    1019    11184716 :     state->btps_blkno = nblkno;
    1020    11184716 :     state->btps_lastoff = last_off;
    1021    11184716 : }
    1022             : 
    1023             : /*
    1024             :  * Finalize pending posting list tuple, and add it to the index.  Final tuple
    1025             :  * is based on saved base tuple, and saved list of heap TIDs.
    1026             :  *
    1027             :  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
    1028             :  * using _bt_buildadd().
    1029             :  */
    1030             : static void
    1031     4712644 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
    1032             :                               BTDedupState dstate)
    1033             : {
    1034             :     Assert(dstate->nitems > 0);
    1035             : 
    1036     4712644 :     if (dstate->nitems == 1)
    1037     4673742 :         _bt_buildadd(wstate, state, dstate->base, 0);
    1038             :     else
    1039             :     {
    1040             :         IndexTuple  postingtuple;
    1041             :         Size        truncextra;
    1042             : 
    1043             :         /* form a tuple with a posting list */
    1044       38902 :         postingtuple = _bt_form_posting(dstate->base,
    1045             :                                         dstate->htids,
    1046             :                                         dstate->nhtids);
    1047             :         /* Calculate posting list overhead */
    1048       77804 :         truncextra = IndexTupleSize(postingtuple) -
    1049       38902 :             BTreeTupleGetPostingOffset(postingtuple);
    1050             : 
    1051       38902 :         _bt_buildadd(wstate, state, postingtuple, truncextra);
    1052       38902 :         pfree(postingtuple);
    1053             :     }
    1054             : 
    1055     4712644 :     dstate->nmaxitems = 0;
    1056     4712644 :     dstate->nhtids = 0;
    1057     4712644 :     dstate->nitems = 0;
    1058     4712644 :     dstate->phystupsize = 0;
    1059     4712644 : }
    1060             : 
    1061             : /*
    1062             :  * Finish writing out the completed btree.
    1063             :  */
    1064             : static void
    1065       47668 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
    1066             : {
    1067             :     BTPageState *s;
    1068       47668 :     BlockNumber rootblkno = P_NONE;
    1069       47668 :     uint32      rootlevel = 0;
    1070             :     BulkWriteBuffer metabuf;
    1071             : 
    1072             :     /*
    1073             :      * Each iteration of this loop completes one more level of the tree.
    1074             :      */
    1075       59698 :     for (s = state; s != NULL; s = s->btps_next)
    1076             :     {
    1077             :         BlockNumber blkno;
    1078             :         BTPageOpaque opaque;
    1079             : 
    1080       12030 :         blkno = s->btps_blkno;
    1081       12030 :         opaque = BTPageGetOpaque((Page) s->btps_buf);
    1082             : 
    1083             :         /*
    1084             :          * We have to link the last page on this level to somewhere.
    1085             :          *
    1086             :          * If we're at the top, it's the root, so attach it to the metapage.
    1087             :          * Otherwise, add an entry for it to its parent using its low key.
    1088             :          * This may cause the last page of the parent level to split, but
    1089             :          * that's not a problem -- we haven't gotten to it yet.
    1090             :          */
    1091       12030 :         if (s->btps_next == NULL)
    1092             :         {
    1093        9502 :             opaque->btpo_flags |= BTP_ROOT;
    1094        9502 :             rootblkno = blkno;
    1095        9502 :             rootlevel = s->btps_level;
    1096             :         }
    1097             :         else
    1098             :         {
    1099             :             Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
    1100             :                     IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1101             :                     BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
    1102             :                    P_LEFTMOST(opaque));
    1103             :             Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
    1104             :                    !P_LEFTMOST(opaque));
    1105        2528 :             BTreeTupleSetDownLink(s->btps_lowkey, blkno);
    1106        2528 :             _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
    1107        2528 :             pfree(s->btps_lowkey);
    1108        2528 :             s->btps_lowkey = NULL;
    1109             :         }
    1110             : 
    1111             :         /*
    1112             :          * This is the rightmost page, so the ItemId array needs to be slid
    1113             :          * back one slot.  Then we can dump out the page.
    1114             :          */
    1115       12030 :         _bt_slideleft((Page) s->btps_buf);
    1116       12030 :         _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
    1117       12030 :         s->btps_buf = NULL;      /* writepage took ownership of the buffer */
    1118             :     }
    1119             : 
    1120             :     /*
    1121             :      * As the last step in the process, construct the metapage and make it
    1122             :      * point to the new root (unless we had no data at all, in which case it's
    1123             :      * set to point to "P_NONE").  This changes the index to the "valid" state
    1124             :      * by filling in a valid magic number in the metapage.
    1125             :      */
    1126       47668 :     metabuf = smgr_bulk_get_buf(wstate->bulkstate);
    1127       47668 :     _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
    1128       47668 :                      wstate->inskey->allequalimage);
    1129       47668 :     _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
    1130       47668 : }
    1131             : 
    1132             : /*
    1133             :  * Read tuples in correct sort order from tuplesort, and load them into
    1134             :  * btree leaves.
    1135             :  */
    1136             : static void
    1137       47668 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
    1138             : {
    1139       47668 :     BTPageState *state = NULL;
    1140       47668 :     bool        merge = (btspool2 != NULL);
    1141             :     IndexTuple  itup,
    1142       47668 :                 itup2 = NULL;
    1143             :     bool        load1;
    1144       47668 :     TupleDesc   tupdes = RelationGetDescr(wstate->index);
    1145             :     int         i,
    1146       47668 :                 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
    1147             :     SortSupport sortKeys;
    1148       47668 :     int64       tuples_done = 0;
    1149             :     bool        deduplicate;
    1150             : 
    1151       47668 :     wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
    1152             : 
    1153       56254 :     deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
    1154        8586 :         BTGetDeduplicateItems(wstate->index);
    1155             : 
    1156       47668 :     if (merge)
    1157             :     {
    1158             :         /*
    1159             :          * Another BTSpool for dead tuples exists. Now we have to merge
    1160             :          * btspool and btspool2.
    1161             :          */
    1162             : 
    1163             :         /* the preparation of merge */
    1164          20 :         itup = tuplesort_getindextuple(btspool->sortstate, true);
    1165          20 :         itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1166             : 
    1167             :         /* Prepare SortSupport data for each column */
    1168          20 :         sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
    1169             : 
    1170          42 :         for (i = 0; i < keysz; i++)
    1171             :         {
    1172          22 :             SortSupport sortKey = sortKeys + i;
    1173          22 :             ScanKey     scanKey = wstate->inskey->scankeys + i;
    1174             :             int16       strategy;
    1175             : 
    1176          22 :             sortKey->ssup_cxt = CurrentMemoryContext;
    1177          22 :             sortKey->ssup_collation = scanKey->sk_collation;
    1178          22 :             sortKey->ssup_nulls_first =
    1179          22 :                 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
    1180          22 :             sortKey->ssup_attno = scanKey->sk_attno;
    1181             :             /* Abbreviation is not supported here */
    1182          22 :             sortKey->abbreviate = false;
    1183             : 
    1184             :             Assert(sortKey->ssup_attno != 0);
    1185             : 
    1186          22 :             strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
    1187             :                 BTGreaterStrategyNumber : BTLessStrategyNumber;
    1188             : 
    1189          22 :             PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
    1190             :         }
    1191             : 
    1192             :         for (;;)
    1193             :         {
    1194        3298 :             load1 = true;       /* load BTSpool next ? */
    1195        3298 :             if (itup2 == NULL)
    1196             :             {
    1197         154 :                 if (itup == NULL)
    1198          20 :                     break;
    1199             :             }
    1200        3144 :             else if (itup != NULL)
    1201             :             {
    1202        2952 :                 int32       compare = 0;
    1203             : 
    1204        3188 :                 for (i = 1; i <= keysz; i++)
    1205             :                 {
    1206             :                     SortSupport entry;
    1207             :                     Datum       attrDatum1,
    1208             :                                 attrDatum2;
    1209             :                     bool        isNull1,
    1210             :                                 isNull2;
    1211             : 
    1212        3004 :                     entry = sortKeys + i - 1;
    1213        3004 :                     attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
    1214        3004 :                     attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
    1215             : 
    1216        3004 :                     compare = ApplySortComparator(attrDatum1, isNull1,
    1217             :                                                   attrDatum2, isNull2,
    1218             :                                                   entry);
    1219        3004 :                     if (compare > 0)
    1220             :                     {
    1221         276 :                         load1 = false;
    1222        2768 :                         break;
    1223             :                     }
    1224        2728 :                     else if (compare < 0)
    1225        2492 :                         break;
    1226             :                 }
    1227             : 
    1228             :                 /*
    1229             :                  * If key values are equal, we sort on ItemPointer.  This is
    1230             :                  * required for btree indexes, since heap TID is treated as an
    1231             :                  * implicit last key attribute in order to ensure that all
    1232             :                  * keys in the index are physically unique.
    1233             :                  */
    1234        2952 :                 if (compare == 0)
    1235             :                 {
    1236         184 :                     compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
    1237             :                     Assert(compare != 0);
    1238         184 :                     if (compare > 0)
    1239          22 :                         load1 = false;
    1240             :                 }
    1241             :             }
    1242             :             else
    1243         192 :                 load1 = false;
    1244             : 
    1245             :             /* When we see first tuple, create first index page */
    1246        3278 :             if (state == NULL)
    1247          20 :                 state = _bt_pagestate(wstate, 0);
    1248             : 
    1249        3278 :             if (load1)
    1250             :             {
    1251        2788 :                 _bt_buildadd(wstate, state, itup, 0);
    1252        2788 :                 itup = tuplesort_getindextuple(btspool->sortstate, true);
    1253             :             }
    1254             :             else
    1255             :             {
    1256         490 :                 _bt_buildadd(wstate, state, itup2, 0);
    1257         490 :                 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1258             :             }
    1259             : 
    1260             :             /* Report progress */
    1261        3278 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1262             :                                          ++tuples_done);
    1263             :         }
    1264          20 :         pfree(sortKeys);
    1265             :     }
    1266       47648 :     else if (deduplicate)
    1267             :     {
    1268             :         /* merge is unnecessary, deduplicate into posting lists */
    1269             :         BTDedupState dstate;
    1270             : 
    1271        8586 :         dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
    1272        8586 :         dstate->deduplicate = true; /* unused */
    1273        8586 :         dstate->nmaxitems = 0;   /* unused */
    1274        8586 :         dstate->maxpostingsize = 0; /* set later */
    1275             :         /* Metadata about base tuple of current pending posting list */
    1276        8586 :         dstate->base = NULL;
    1277        8586 :         dstate->baseoff = InvalidOffsetNumber;   /* unused */
    1278        8586 :         dstate->basetupsize = 0;
    1279             :         /* Metadata about current pending posting list TIDs */
    1280        8586 :         dstate->htids = NULL;
    1281        8586 :         dstate->nhtids = 0;
    1282        8586 :         dstate->nitems = 0;
    1283        8586 :         dstate->phystupsize = 0; /* unused */
    1284        8586 :         dstate->nintervals = 0; /* unused */
    1285             : 
    1286     6073552 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1287             :                                                true)) != NULL)
    1288             :         {
    1289             :             /* When we see first tuple, create first index page */
    1290     6064966 :             if (state == NULL)
    1291             :             {
    1292        2332 :                 state = _bt_pagestate(wstate, 0);
    1293             : 
    1294             :                 /*
    1295             :                  * Limit size of posting list tuples to 1/10 space we want to
    1296             :                  * leave behind on the page, plus space for final item's line
    1297             :                  * pointer.  This is equal to the space that we'd like to
    1298             :                  * leave behind on each leaf page when fillfactor is 90,
    1299             :                  * allowing us to get close to fillfactor% space utilization
    1300             :                  * when there happen to be a great many duplicates.  (This
    1301             :                  * makes higher leaf fillfactor settings ineffective when
    1302             :                  * building indexes that have many duplicates, but packing
    1303             :                  * leaf pages full with few very large tuples doesn't seem
    1304             :                  * like a useful goal.)
    1305             :                  */
    1306        2332 :                 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
    1307             :                     sizeof(ItemIdData);
    1308             :                 Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) &&
    1309             :                        dstate->maxpostingsize <= INDEX_SIZE_MASK);
    1310        2332 :                 dstate->htids = palloc(dstate->maxpostingsize);
    1311             : 
    1312             :                 /* start new pending posting list with itup copy */
    1313        2332 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1314             :                                         InvalidOffsetNumber);
    1315             :             }
    1316     6062634 :             else if (_bt_keep_natts_fast(wstate->index, dstate->base,
    1317     1360238 :                                          itup) > keysz &&
    1318     1360238 :                      _bt_dedup_save_htid(dstate, itup))
    1319             :             {
    1320             :                 /*
    1321             :                  * Tuple is equal to base tuple of pending posting list.  Heap
    1322             :                  * TID from itup has been saved in state.
    1323             :                  */
    1324             :             }
    1325             :             else
    1326             :             {
    1327             :                 /*
    1328             :                  * Tuple is not equal to pending posting list tuple, or
    1329             :                  * _bt_dedup_save_htid() opted to not merge current item into
    1330             :                  * pending posting list.
    1331             :                  */
    1332     4710312 :                 _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1333     4710312 :                 pfree(dstate->base);
    1334             : 
    1335             :                 /* start new pending posting list with itup copy */
    1336     4710312 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1337             :                                         InvalidOffsetNumber);
    1338             :             }
    1339             : 
    1340             :             /* Report progress */
    1341     6064966 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1342             :                                          ++tuples_done);
    1343             :         }
    1344             : 
    1345        8586 :         if (state)
    1346             :         {
    1347             :             /*
    1348             :              * Handle the last item (there must be a last item when the
    1349             :              * tuplesort returned one or more tuples)
    1350             :              */
    1351        2332 :             _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1352        2332 :             pfree(dstate->base);
    1353        2332 :             pfree(dstate->htids);
    1354             :         }
    1355             : 
    1356        8586 :         pfree(dstate);
    1357             :     }
    1358             :     else
    1359             :     {
    1360             :         /* merging and deduplication are both unnecessary */
    1361     6466184 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1362             :                                                true)) != NULL)
    1363             :         {
    1364             :             /* When we see first tuple, create first index page */
    1365     6427122 :             if (state == NULL)
    1366        7150 :                 state = _bt_pagestate(wstate, 0);
    1367             : 
    1368     6427122 :             _bt_buildadd(wstate, state, itup, 0);
    1369             : 
    1370             :             /* Report progress */
    1371     6427122 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1372             :                                          ++tuples_done);
    1373             :         }
    1374             :     }
    1375             : 
    1376             :     /* Close down final pages and write the metapage */
    1377       47668 :     _bt_uppershutdown(wstate, state);
    1378       47668 :     smgr_bulk_finish(wstate->bulkstate);
    1379       47668 : }
    1380             : 
    1381             : /*
    1382             :  * Create parallel context, and launch workers for leader.
    1383             :  *
    1384             :  * buildstate argument should be initialized (with the exception of the
    1385             :  * tuplesort state in spools, which may later be created based on shared
    1386             :  * state initially set up here).
    1387             :  *
    1388             :  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
    1389             :  *
    1390             :  * request is the target number of parallel worker processes to launch.
    1391             :  *
    1392             :  * Sets buildstate's BTLeader, which caller must use to shut down parallel
    1393             :  * mode by passing it to _bt_end_parallel() at the very end of its index
    1394             :  * build.  If not even a single worker process can be launched, this is
    1395             :  * never set, and caller should proceed with a serial index build.
    1396             :  */
    1397             : static void
    1398         148 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
    1399             : {
    1400             :     ParallelContext *pcxt;
    1401             :     int         scantuplesortstates;
    1402             :     Snapshot    snapshot;
    1403             :     Size        estbtshared;
    1404             :     Size        estsort;
    1405             :     BTShared   *btshared;
    1406             :     Sharedsort *sharedsort;
    1407             :     Sharedsort *sharedsort2;
    1408         148 :     BTSpool    *btspool = buildstate->spool;
    1409         148 :     BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
    1410             :     WalUsage   *walusage;
    1411             :     BufferUsage *bufferusage;
    1412         148 :     bool        leaderparticipates = true;
    1413             :     int         querylen;
    1414             : 
    1415             : #ifdef DISABLE_LEADER_PARTICIPATION
    1416             :     leaderparticipates = false;
    1417             : #endif
    1418             : 
    1419             :     /*
    1420             :      * Enter parallel mode, and create context for parallel build of btree
    1421             :      * index
    1422             :      */
    1423         148 :     EnterParallelMode();
    1424             :     Assert(request > 0);
    1425         148 :     pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
    1426             :                                  request);
    1427             : 
    1428         148 :     scantuplesortstates = leaderparticipates ? request + 1 : request;
    1429             : 
    1430             :     /*
    1431             :      * Prepare for scan of the base relation.  In a normal index build, we use
    1432             :      * SnapshotAny because we must retrieve all tuples and do our own time
    1433             :      * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
    1434             :      * concurrent build, we take a regular MVCC snapshot and index whatever's
    1435             :      * live according to that.
    1436             :      */
    1437         148 :     if (!isconcurrent)
    1438         148 :         snapshot = SnapshotAny;
    1439             :     else
    1440           0 :         snapshot = RegisterSnapshot(GetTransactionSnapshot());
    1441             : 
    1442             :     /*
    1443             :      * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
    1444             :      * PARALLEL_KEY_TUPLESORT tuplesort workspace
    1445             :      */
    1446         148 :     estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
    1447         148 :     shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    1448         148 :     estsort = tuplesort_estimate_shared(scantuplesortstates);
    1449         148 :     shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1450             : 
    1451             :     /*
    1452             :      * Unique case requires a second spool, and so we may have to account for
    1453             :      * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
    1454             :      */
    1455         148 :     if (!btspool->isunique)
    1456          84 :         shm_toc_estimate_keys(&pcxt->estimator, 2);
    1457             :     else
    1458             :     {
    1459          64 :         shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1460          64 :         shm_toc_estimate_keys(&pcxt->estimator, 3);
    1461             :     }
    1462             : 
    1463             :     /*
    1464             :      * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
    1465             :      * and PARALLEL_KEY_BUFFER_USAGE.
    1466             :      *
    1467             :      * If there are no extensions loaded that care, we could skip this.  We
    1468             :      * have no way of knowing whether anyone's looking at pgWalUsage or
    1469             :      * pgBufferUsage, so do it unconditionally.
    1470             :      */
    1471         148 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1472             :                            mul_size(sizeof(WalUsage), pcxt->nworkers));
    1473         148 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1474         148 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1475             :                            mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1476         148 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1477             : 
    1478             :     /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    1479         148 :     if (debug_query_string)
    1480             :     {
    1481         148 :         querylen = strlen(debug_query_string);
    1482         148 :         shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
    1483         148 :         shm_toc_estimate_keys(&pcxt->estimator, 1);
    1484             :     }
    1485             :     else
    1486           0 :         querylen = 0;           /* keep compiler quiet */
    1487             : 
    1488             :     /* Everyone's had a chance to ask for space, so now create the DSM */
    1489         148 :     InitializeParallelDSM(pcxt);
    1490             : 
    1491             :     /* If no DSM segment was available, back out (do serial build) */
    1492         148 :     if (pcxt->seg == NULL)
    1493             :     {
    1494           0 :         if (IsMVCCSnapshot(snapshot))
    1495           0 :             UnregisterSnapshot(snapshot);
    1496           0 :         DestroyParallelContext(pcxt);
    1497           0 :         ExitParallelMode();
    1498           0 :         return;
    1499             :     }
    1500             : 
    1501             :     /* Store shared build state, for which we reserved space */
    1502         148 :     btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    1503             :     /* Initialize immutable state */
    1504         148 :     btshared->heaprelid = RelationGetRelid(btspool->heap);
    1505         148 :     btshared->indexrelid = RelationGetRelid(btspool->index);
    1506         148 :     btshared->isunique = btspool->isunique;
    1507         148 :     btshared->nulls_not_distinct = btspool->nulls_not_distinct;
    1508         148 :     btshared->isconcurrent = isconcurrent;
    1509         148 :     btshared->scantuplesortstates = scantuplesortstates;
    1510         148 :     btshared->queryid = pgstat_get_my_query_id();
    1511         148 :     ConditionVariableInit(&btshared->workersdonecv);
    1512         148 :     SpinLockInit(&btshared->mutex);
    1513             :     /* Initialize mutable state */
    1514         148 :     btshared->nparticipantsdone = 0;
    1515         148 :     btshared->reltuples = 0.0;
    1516         148 :     btshared->havedead = false;
    1517         148 :     btshared->indtuples = 0.0;
    1518         148 :     btshared->brokenhotchain = false;
    1519         148 :     table_parallelscan_initialize(btspool->heap,
    1520             :                                   ParallelTableScanFromBTShared(btshared),
    1521             :                                   snapshot);
    1522             : 
    1523             :     /*
    1524             :      * Store shared tuplesort-private state, for which we reserved space.
    1525             :      * Then, initialize opaque state using tuplesort routine.
    1526             :      */
    1527         148 :     sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1528         148 :     tuplesort_initialize_shared(sharedsort, scantuplesortstates,
    1529             :                                 pcxt->seg);
    1530             : 
    1531         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    1532         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
    1533             : 
    1534             :     /* Unique case requires a second spool, and associated shared state */
    1535         148 :     if (!btspool->isunique)
    1536          84 :         sharedsort2 = NULL;
    1537             :     else
    1538             :     {
    1539             :         /*
    1540             :          * Store additional shared tuplesort-private state, for which we
    1541             :          * reserved space.  Then, initialize opaque state using tuplesort
    1542             :          * routine.
    1543             :          */
    1544          64 :         sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1545          64 :         tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
    1546             :                                     pcxt->seg);
    1547             : 
    1548          64 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    1549             :     }
    1550             : 
    1551             :     /* Store query string for workers */
    1552         148 :     if (debug_query_string)
    1553             :     {
    1554             :         char       *sharedquery;
    1555             : 
    1556         148 :         sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
    1557         148 :         memcpy(sharedquery, debug_query_string, querylen + 1);
    1558         148 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    1559             :     }
    1560             : 
    1561             :     /*
    1562             :      * Allocate space for each worker's WalUsage and BufferUsage; no need to
    1563             :      * initialize.
    1564             :      */
    1565         148 :     walusage = shm_toc_allocate(pcxt->toc,
    1566         148 :                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    1567         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    1568         148 :     bufferusage = shm_toc_allocate(pcxt->toc,
    1569         148 :                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1570         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
    1571             : 
    1572             :     /* Launch workers, saving status for leader/caller */
    1573         148 :     LaunchParallelWorkers(pcxt);
    1574         148 :     btleader->pcxt = pcxt;
    1575         148 :     btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    1576         148 :     if (leaderparticipates)
    1577         148 :         btleader->nparticipanttuplesorts++;
    1578         148 :     btleader->btshared = btshared;
    1579         148 :     btleader->sharedsort = sharedsort;
    1580         148 :     btleader->sharedsort2 = sharedsort2;
    1581         148 :     btleader->snapshot = snapshot;
    1582         148 :     btleader->walusage = walusage;
    1583         148 :     btleader->bufferusage = bufferusage;
    1584             : 
    1585             :     /* If no workers were successfully launched, back out (do serial build) */
    1586         148 :     if (pcxt->nworkers_launched == 0)
    1587             :     {
    1588           0 :         _bt_end_parallel(btleader);
    1589           0 :         return;
    1590             :     }
    1591             : 
    1592             :     /* Save leader state now that it's clear build will be parallel */
    1593         148 :     buildstate->btleader = btleader;
    1594             : 
    1595             :     /* Join heap scan ourselves */
    1596         148 :     if (leaderparticipates)
    1597         148 :         _bt_leader_participate_as_worker(buildstate);
    1598             : 
    1599             :     /*
    1600             :      * Caller needs to wait for all launched workers when we return.  Make
    1601             :      * sure that the failure-to-start case will not hang forever.
    1602             :      */
    1603         148 :     WaitForParallelWorkersToAttach(pcxt);
    1604             : }
    1605             : 
    1606             : /*
    1607             :  * Shut down workers, destroy parallel context, and end parallel mode.
    1608             :  */
    1609             : static void
    1610         148 : _bt_end_parallel(BTLeader *btleader)
    1611             : {
    1612             :     int         i;
    1613             : 
    1614             :     /* Shutdown worker processes */
    1615         148 :     WaitForParallelWorkersToFinish(btleader->pcxt);
    1616             : 
    1617             :     /*
    1618             :      * Next, accumulate WAL usage.  (This must wait for the workers to finish,
    1619             :      * or we might get incomplete data.)
    1620             :      */
    1621         296 :     for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
    1622         148 :         InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
    1623             : 
    1624             :     /* Free last reference to MVCC snapshot, if one was used */
    1625         148 :     if (IsMVCCSnapshot(btleader->snapshot))
    1626           0 :         UnregisterSnapshot(btleader->snapshot);
    1627         148 :     DestroyParallelContext(btleader->pcxt);
    1628         148 :     ExitParallelMode();
    1629         148 : }
    1630             : 
    1631             : /*
    1632             :  * Returns size of shared memory required to store state for a parallel
    1633             :  * btree index build based on the snapshot its parallel scan will use.
    1634             :  */
    1635             : static Size
    1636         148 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    1637             : {
    1638             :     /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    1639         148 :     return add_size(BUFFERALIGN(sizeof(BTShared)),
    1640             :                     table_parallelscan_estimate(heap, snapshot));
    1641             : }
    1642             : 
    1643             : /*
    1644             :  * Within leader, wait for end of heap scan.
    1645             :  *
    1646             :  * When called, parallel heap scan started by _bt_begin_parallel() will
    1647             :  * already be underway within worker processes (when leader participates
    1648             :  * as a worker, we should end up here just as workers are finishing).
    1649             :  *
    1650             :  * Fills in fields needed for ambuild statistics, and lets caller set
    1651             :  * field indicating that some worker encountered a broken HOT chain.
    1652             :  *
    1653             :  * Returns the total number of heap tuples scanned.
    1654             :  */
    1655             : static double
    1656         148 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
    1657             : {
    1658         148 :     BTShared   *btshared = buildstate->btleader->btshared;
    1659             :     int         nparticipanttuplesorts;
    1660             :     double      reltuples;
    1661             : 
    1662         148 :     nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    1663             :     for (;;)
    1664             :     {
    1665         396 :         SpinLockAcquire(&btshared->mutex);
    1666         396 :         if (btshared->nparticipantsdone == nparticipanttuplesorts)
    1667             :         {
    1668         148 :             buildstate->havedead = btshared->havedead;
    1669         148 :             buildstate->indtuples = btshared->indtuples;
    1670         148 :             *brokenhotchain = btshared->brokenhotchain;
    1671         148 :             reltuples = btshared->reltuples;
    1672         148 :             SpinLockRelease(&btshared->mutex);
    1673         148 :             break;
    1674             :         }
    1675         248 :         SpinLockRelease(&btshared->mutex);
    1676             : 
    1677         248 :         ConditionVariableSleep(&btshared->workersdonecv,
    1678             :                                WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    1679             :     }
    1680             : 
    1681         148 :     ConditionVariableCancelSleep();
    1682             : 
    1683         148 :     return reltuples;
    1684             : }
    1685             : 
    1686             : /*
    1687             :  * Within leader, participate as a parallel worker.
    1688             :  */
    1689             : static void
    1690         148 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
    1691             : {
    1692         148 :     BTLeader   *btleader = buildstate->btleader;
    1693             :     BTSpool    *leaderworker;
    1694             :     BTSpool    *leaderworker2;
    1695             :     int         sortmem;
    1696             : 
    1697             :     /* Allocate memory and initialize private spool */
    1698         148 :     leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
    1699         148 :     leaderworker->heap = buildstate->spool->heap;
    1700         148 :     leaderworker->index = buildstate->spool->index;
    1701         148 :     leaderworker->isunique = buildstate->spool->isunique;
    1702         148 :     leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
    1703             : 
    1704             :     /* Initialize second spool, if required */
    1705         148 :     if (!btleader->btshared->isunique)
    1706          84 :         leaderworker2 = NULL;
    1707             :     else
    1708             :     {
    1709             :         /* Allocate memory for worker's own private secondary spool */
    1710          64 :         leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1711             : 
    1712             :         /* Initialize worker's own secondary spool */
    1713          64 :         leaderworker2->heap = leaderworker->heap;
    1714          64 :         leaderworker2->index = leaderworker->index;
    1715          64 :         leaderworker2->isunique = false;
    1716             :     }
    1717             : 
    1718             :     /*
    1719             :      * Might as well use reliable figure when doling out maintenance_work_mem
    1720             :      * (when requested number of workers were not launched, this will be
    1721             :      * somewhat higher than it is for other workers).
    1722             :      */
    1723         148 :     sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
    1724             : 
    1725             :     /* Perform work common to all participants */
    1726         148 :     _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
    1727             :                                btleader->sharedsort, btleader->sharedsort2,
    1728             :                                sortmem, true);
    1729             : 
    1730             : #ifdef BTREE_BUILD_STATS
    1731             :     if (log_btree_build_stats)
    1732             :     {
    1733             :         ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
    1734             :         ResetUsage();
    1735             :     }
    1736             : #endif                          /* BTREE_BUILD_STATS */
    1737         148 : }
    1738             : 
    1739             : /*
    1740             :  * Perform work within a launched parallel process.
    1741             :  */
    1742             : void
    1743         148 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
    1744             : {
    1745             :     char       *sharedquery;
    1746             :     BTSpool    *btspool;
    1747             :     BTSpool    *btspool2;
    1748             :     BTShared   *btshared;
    1749             :     Sharedsort *sharedsort;
    1750             :     Sharedsort *sharedsort2;
    1751             :     Relation    heapRel;
    1752             :     Relation    indexRel;
    1753             :     LOCKMODE    heapLockmode;
    1754             :     LOCKMODE    indexLockmode;
    1755             :     WalUsage   *walusage;
    1756             :     BufferUsage *bufferusage;
    1757             :     int         sortmem;
    1758             : 
    1759             : #ifdef BTREE_BUILD_STATS
    1760             :     if (log_btree_build_stats)
    1761             :         ResetUsage();
    1762             : #endif                          /* BTREE_BUILD_STATS */
    1763             : 
    1764             :     /*
    1765             :      * The only possible status flag that can be set to the parallel worker is
    1766             :      * PROC_IN_SAFE_IC.
    1767             :      */
    1768             :     Assert((MyProc->statusFlags == 0) ||
    1769             :            (MyProc->statusFlags == PROC_IN_SAFE_IC));
    1770             : 
    1771             :     /* Set debug_query_string for individual workers first */
    1772         148 :     sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    1773         148 :     debug_query_string = sharedquery;
    1774             : 
    1775             :     /* Report the query string from leader */
    1776         148 :     pgstat_report_activity(STATE_RUNNING, debug_query_string);
    1777             : 
    1778             :     /* Look up nbtree shared state */
    1779         148 :     btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    1780             : 
    1781             :     /* Open relations using lock modes known to be obtained by index.c */
    1782         148 :     if (!btshared->isconcurrent)
    1783             :     {
    1784         148 :         heapLockmode = ShareLock;
    1785         148 :         indexLockmode = AccessExclusiveLock;
    1786             :     }
    1787             :     else
    1788             :     {
    1789           0 :         heapLockmode = ShareUpdateExclusiveLock;
    1790           0 :         indexLockmode = RowExclusiveLock;
    1791             :     }
    1792             : 
    1793             :     /* Track query ID */
    1794         148 :     pgstat_report_query_id(btshared->queryid, false);
    1795             : 
    1796             :     /* Open relations within worker */
    1797         148 :     heapRel = table_open(btshared->heaprelid, heapLockmode);
    1798         148 :     indexRel = index_open(btshared->indexrelid, indexLockmode);
    1799             : 
    1800             :     /* Initialize worker's own spool */
    1801         148 :     btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    1802         148 :     btspool->heap = heapRel;
    1803         148 :     btspool->index = indexRel;
    1804         148 :     btspool->isunique = btshared->isunique;
    1805         148 :     btspool->nulls_not_distinct = btshared->nulls_not_distinct;
    1806             : 
    1807             :     /* Look up shared state private to tuplesort.c */
    1808         148 :     sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    1809         148 :     tuplesort_attach_shared(sharedsort, seg);
    1810         148 :     if (!btshared->isunique)
    1811             :     {
    1812          84 :         btspool2 = NULL;
    1813          84 :         sharedsort2 = NULL;
    1814             :     }
    1815             :     else
    1816             :     {
    1817             :         /* Allocate memory for worker's own private secondary spool */
    1818          64 :         btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1819             : 
    1820             :         /* Initialize worker's own secondary spool */
    1821          64 :         btspool2->heap = btspool->heap;
    1822          64 :         btspool2->index = btspool->index;
    1823          64 :         btspool2->isunique = false;
    1824             :         /* Look up shared state private to tuplesort.c */
    1825          64 :         sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
    1826          64 :         tuplesort_attach_shared(sharedsort2, seg);
    1827             :     }
    1828             : 
    1829             :     /* Prepare to track buffer usage during parallel execution */
    1830         148 :     InstrStartParallelQuery();
    1831             : 
    1832             :     /* Perform sorting of spool, and possibly a spool2 */
    1833         148 :     sortmem = maintenance_work_mem / btshared->scantuplesortstates;
    1834         148 :     _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
    1835             :                                sharedsort2, sortmem, false);
    1836             : 
    1837             :     /* Report WAL/buffer usage during parallel execution */
    1838         148 :     bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    1839         148 :     walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    1840         148 :     InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
    1841         148 :                           &walusage[ParallelWorkerNumber]);
    1842             : 
    1843             : #ifdef BTREE_BUILD_STATS
    1844             :     if (log_btree_build_stats)
    1845             :     {
    1846             :         ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
    1847             :         ResetUsage();
    1848             :     }
    1849             : #endif                          /* BTREE_BUILD_STATS */
    1850             : 
    1851         148 :     index_close(indexRel, indexLockmode);
    1852         148 :     table_close(heapRel, heapLockmode);
    1853         148 : }
    1854             : 
    1855             : /*
    1856             :  * Perform a worker's portion of a parallel sort.
    1857             :  *
    1858             :  * This generates a tuplesort for passed btspool, and a second tuplesort
    1859             :  * state if a second btspool is need (i.e. for unique index builds).  All
    1860             :  * other spool fields should already be set when this is called.
    1861             :  *
    1862             :  * sortmem is the amount of working memory to use within each worker,
    1863             :  * expressed in KBs.
    1864             :  *
    1865             :  * When this returns, workers are done, and need only release resources.
    1866             :  */
    1867             : static void
    1868         296 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
    1869             :                            BTShared *btshared, Sharedsort *sharedsort,
    1870             :                            Sharedsort *sharedsort2, int sortmem, bool progress)
    1871             : {
    1872             :     SortCoordinate coordinate;
    1873             :     BTBuildState buildstate;
    1874             :     TableScanDesc scan;
    1875             :     double      reltuples;
    1876             :     IndexInfo  *indexInfo;
    1877             : 
    1878             :     /* Initialize local tuplesort coordination state */
    1879         296 :     coordinate = palloc0(sizeof(SortCoordinateData));
    1880         296 :     coordinate->isWorker = true;
    1881         296 :     coordinate->nParticipants = -1;
    1882         296 :     coordinate->sharedsort = sharedsort;
    1883             : 
    1884             :     /* Begin "partial" tuplesort */
    1885         592 :     btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
    1886             :                                                      btspool->index,
    1887         296 :                                                      btspool->isunique,
    1888         296 :                                                      btspool->nulls_not_distinct,
    1889             :                                                      sortmem, coordinate,
    1890             :                                                      TUPLESORT_NONE);
    1891             : 
    1892             :     /*
    1893             :      * Just as with serial case, there may be a second spool.  If so, a
    1894             :      * second, dedicated spool2 partial tuplesort is required.
    1895             :      */
    1896         296 :     if (btspool2)
    1897             :     {
    1898             :         SortCoordinate coordinate2;
    1899             : 
    1900             :         /*
    1901             :          * We expect that the second one (for dead tuples) won't get very
    1902             :          * full, so we give it only work_mem (unless sortmem is less for
    1903             :          * worker).  Worker processes are generally permitted to allocate
    1904             :          * work_mem independently.
    1905             :          */
    1906         128 :         coordinate2 = palloc0(sizeof(SortCoordinateData));
    1907         128 :         coordinate2->isWorker = true;
    1908         128 :         coordinate2->nParticipants = -1;
    1909         128 :         coordinate2->sharedsort = sharedsort2;
    1910         128 :         btspool2->sortstate =
    1911         128 :             tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
    1912             :                                         Min(sortmem, work_mem), coordinate2,
    1913             :                                         false);
    1914             :     }
    1915             : 
    1916             :     /* Fill in buildstate for _bt_build_callback() */
    1917         296 :     buildstate.isunique = btshared->isunique;
    1918         296 :     buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
    1919         296 :     buildstate.havedead = false;
    1920         296 :     buildstate.heap = btspool->heap;
    1921         296 :     buildstate.spool = btspool;
    1922         296 :     buildstate.spool2 = btspool2;
    1923         296 :     buildstate.indtuples = 0;
    1924         296 :     buildstate.btleader = NULL;
    1925             : 
    1926             :     /* Join parallel scan */
    1927         296 :     indexInfo = BuildIndexInfo(btspool->index);
    1928         296 :     indexInfo->ii_Concurrent = btshared->isconcurrent;
    1929         296 :     scan = table_beginscan_parallel(btspool->heap,
    1930             :                                     ParallelTableScanFromBTShared(btshared));
    1931         296 :     reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
    1932             :                                        true, progress, _bt_build_callback,
    1933             :                                        &buildstate, scan);
    1934             : 
    1935             :     /* Execute this worker's part of the sort */
    1936         296 :     if (progress)
    1937         148 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1938             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_1);
    1939         296 :     tuplesort_performsort(btspool->sortstate);
    1940         296 :     if (btspool2)
    1941             :     {
    1942         128 :         if (progress)
    1943          64 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1944             :                                          PROGRESS_BTREE_PHASE_PERFORMSORT_2);
    1945         128 :         tuplesort_performsort(btspool2->sortstate);
    1946             :     }
    1947             : 
    1948             :     /*
    1949             :      * Done.  Record ambuild statistics, and whether we encountered a broken
    1950             :      * HOT chain.
    1951             :      */
    1952         296 :     SpinLockAcquire(&btshared->mutex);
    1953         296 :     btshared->nparticipantsdone++;
    1954         296 :     btshared->reltuples += reltuples;
    1955         296 :     if (buildstate.havedead)
    1956           0 :         btshared->havedead = true;
    1957         296 :     btshared->indtuples += buildstate.indtuples;
    1958         296 :     if (indexInfo->ii_BrokenHotChain)
    1959           0 :         btshared->brokenhotchain = true;
    1960         296 :     SpinLockRelease(&btshared->mutex);
    1961             : 
    1962             :     /* Notify leader */
    1963         296 :     ConditionVariableSignal(&btshared->workersdonecv);
    1964             : 
    1965             :     /* We can end tuplesorts immediately */
    1966         296 :     tuplesort_end(btspool->sortstate);
    1967         296 :     if (btspool2)
    1968         128 :         tuplesort_end(btspool2->sortstate);
    1969         296 : }

Generated by: LCOV version 1.14