LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsort.c (source / functions)
Test: PostgreSQL 17devel        Date: 2024-04-19 22:11:39
Coverage:  Lines: 534 of 551 hit (96.9 %)    Functions: 22 of 22 hit (100.0 %)

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsort.c
       4             :  *      Build a btree from sorted input by loading leaf pages sequentially.
       5             :  *
       6             :  * NOTES
       7             :  *
       8             :  * We use tuplesort.c to sort the given index tuples into order.
       9             :  * Then we scan the index tuples in order and build the btree pages
      10             :  * for each level.  We load source tuples into leaf-level pages.
      11             :  * Whenever we fill a page at one level, we add a link to it to its
      12             :  * parent level (starting a new parent level if necessary).  When
      13             :  * done, we write out each final page on each level, adding it to
      14             :  * its parent level.  When we have only one page on a level, it must be
      15             :  * the root -- it can be attached to the btree metapage and we are done.
      16             :  *
      17             :  * It is not wise to pack the pages entirely full, since then *any*
      18             :  * insertion would cause a split (and not only of the leaf page; the need
      19             :  * for a split would cascade right up the tree).  The steady-state load
      20             :  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
      21             :  * pages to the user-controllable fill factor (default 90%) while upper pages
      22             :  * are always packed to 70%.  This gives us reasonable density (there aren't
      23             :  * many upper pages if the keys are reasonable-size) without risking a lot of
      24             :  * cascading splits during early insertions.
      25             :  *
      26             :  * We use the bulk smgr loading facility to bypass the buffer cache and
      27             :  * WAL-log the pages efficiently.
      28             :  *
      29             :  * This code isn't concerned about the FSM at all. The caller is responsible
      30             :  * for initializing that.
      31             :  *
      32             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
      33             :  * Portions Copyright (c) 1994, Regents of the University of California
      34             :  *
      35             :  * IDENTIFICATION
      36             :  *    src/backend/access/nbtree/nbtsort.c
      37             :  *
      38             :  *-------------------------------------------------------------------------
      39             :  */
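Editor's note: the loading strategy described in the NOTES above can be illustrated with a small, self-contained toy, sketched below under stated assumptions. The program, its names, and the fixed page capacity are all hypothetical and not part of nbtsort.c; it only models "fill a page, push its low key into the parent level, start a new parent level on demand".

/* toy_btree_load.c -- editorial sketch only, NOT part of nbtsort.c */
#include <stdio.h>

#define CAPACITY  4             /* hypothetical page capacity */
#define MAXLEVELS 8

typedef struct ToyLevel
{
    int         keys[CAPACITY]; /* keys on the page currently being built */
    int         nkeys;
    int         pages_written;  /* finished pages emitted at this level */
} ToyLevel;

static ToyLevel levels[MAXLEVELS];
static int  nlevels = 1;

static void
toy_add(int level, int key)
{
    ToyLevel   *lvl = &levels[level];

    if (lvl->nkeys == CAPACITY)
    {
        /* page is full: emit it, then link it to its parent via its low key */
        printf("level %d: wrote page %d (low key %d)\n",
               level, lvl->pages_written++, lvl->keys[0]);
        if (level + 1 == nlevels)
            nlevels++;          /* start a new parent level */
        toy_add(level + 1, lvl->keys[0]);
        lvl->nkeys = 0;
    }
    lvl->keys[lvl->nkeys++] = key;
}

int
main(void)
{
    for (int k = 1; k <= 20; k++)
        toy_add(0, k);          /* sorted input is loaded into the leaf level */

    /*
     * Each level still holds one final, partially filled page here; the real
     * code writes those out at the end, and the single page on the topmost
     * level becomes the root.
     */
    printf("levels built: %d\n", nlevels);
    return 0;
}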
      40             : 
      41             : #include "postgres.h"
      42             : 
      43             : #include "access/nbtree.h"
      44             : #include "access/parallel.h"
      45             : #include "access/relscan.h"
      46             : #include "access/table.h"
      47             : #include "access/xact.h"
      48             : #include "access/xloginsert.h"
      49             : #include "catalog/index.h"
      50             : #include "commands/progress.h"
      51             : #include "executor/instrument.h"
      52             : #include "miscadmin.h"
      53             : #include "pgstat.h"
      54             : #include "storage/bulk_write.h"
      55             : #include "tcop/tcopprot.h"        /* pgrminclude ignore */
      56             : #include "utils/rel.h"
      57             : #include "utils/sortsupport.h"
      58             : #include "utils/tuplesort.h"
      59             : 
      60             : 
      61             : /* Magic numbers for parallel state sharing */
      62             : #define PARALLEL_KEY_BTREE_SHARED       UINT64CONST(0xA000000000000001)
      63             : #define PARALLEL_KEY_TUPLESORT          UINT64CONST(0xA000000000000002)
      64             : #define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
      65             : #define PARALLEL_KEY_QUERY_TEXT         UINT64CONST(0xA000000000000004)
      66             : #define PARALLEL_KEY_WAL_USAGE          UINT64CONST(0xA000000000000005)
      67             : #define PARALLEL_KEY_BUFFER_USAGE       UINT64CONST(0xA000000000000006)
      68             : 
      69             : /*
      70             :  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
      71             :  * parallel index builds.  This may be useful as a debugging aid.
      72             : #undef DISABLE_LEADER_PARTICIPATION
      73             :  */
      74             : 
      75             : /*
      76             :  * Status record for spooling/sorting phase.  (Note we may have two of
      77             :  * these due to the special requirements for uniqueness-checking with
      78             :  * dead tuples.)
      79             :  */
      80             : typedef struct BTSpool
      81             : {
      82             :     Tuplesortstate *sortstate;  /* state data for tuplesort.c */
      83             :     Relation    heap;
      84             :     Relation    index;
      85             :     bool        isunique;
      86             :     bool        nulls_not_distinct;
      87             : } BTSpool;
      88             : 
      89             : /*
      90             :  * Status for index builds performed in parallel.  This is allocated in a
      91             :  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
      92             :  * entry, private to tuplesort.c but allocated by this module on its behalf.
      93             :  */
      94             : typedef struct BTShared
      95             : {
      96             :     /*
      97             :      * These fields are not modified during the sort.  They primarily exist
      98             :      * for the benefit of worker processes that need to create BTSpool state
      99             :      * corresponding to that used by the leader.
     100             :      */
     101             :     Oid         heaprelid;
     102             :     Oid         indexrelid;
     103             :     bool        isunique;
     104             :     bool        nulls_not_distinct;
     105             :     bool        isconcurrent;
     106             :     int         scantuplesortstates;
     107             : 
     108             :     /*
     109             :      * workersdonecv is used to monitor the progress of workers.  All parallel
     110             :      * participants must indicate that they are done before leader can use
     111             :      * mutable state that workers maintain during scan (and before leader can
     112             :      * proceed to tuplesort_performsort()).
     113             :      */
     114             :     ConditionVariable workersdonecv;
     115             : 
     116             :     /*
      117             :      * mutex protects all fields before the appended parallel table scan data.
     118             :      *
     119             :      * These fields contain status information of interest to B-Tree index
     120             :      * builds that must work just the same when an index is built in parallel.
     121             :      */
     122             :     slock_t     mutex;
     123             : 
     124             :     /*
     125             :      * Mutable state that is maintained by workers, and reported back to
     126             :      * leader at end of parallel scan.
     127             :      *
     128             :      * nparticipantsdone is number of worker processes finished.
     129             :      *
     130             :      * reltuples is the total number of input heap tuples.
     131             :      *
     132             :      * havedead indicates if RECENTLY_DEAD tuples were encountered during
     133             :      * build.
     134             :      *
     135             :      * indtuples is the total number of tuples that made it into the index.
     136             :      *
     137             :      * brokenhotchain indicates if any worker detected a broken HOT chain
     138             :      * during build.
     139             :      */
     140             :     int         nparticipantsdone;
     141             :     double      reltuples;
     142             :     bool        havedead;
     143             :     double      indtuples;
     144             :     bool        brokenhotchain;
     145             : 
     146             :     /*
     147             :      * ParallelTableScanDescData data follows. Can't directly embed here, as
     148             :      * implementations of the parallel table scan desc interface might need
     149             :      * stronger alignment.
     150             :      */
     151             : } BTShared;
     152             : 
     153             : /*
     154             :  * Return pointer to a BTShared's parallel table scan.
     155             :  *
      156             :  * cf. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
     157             :  * MAXALIGN.
     158             :  */
     159             : #define ParallelTableScanFromBTShared(shared) \
     160             :     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
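As a hedged illustration of the layout this macro assumes (an editorial sketch, not part of nbtsort.c; the helper name is hypothetical, and the tableam/shmem declarations are assumed to be reachable through the existing includes), the shared chunk that holds a BTShared must be sized so that a parallel table scan descriptor can follow it at a BUFFERALIGN'ed offset:

static Size
toy_estimate_btshared_space(Relation heap, Snapshot snapshot)
{
    /* padded-out BTShared, plus room for the appended scan descriptor */
    return add_size(BUFFERALIGN(sizeof(BTShared)),
                    table_parallelscan_estimate(heap, snapshot));
}

A participant that holds a pointer to such a chunk then locates the scan descriptor with ParallelTableScanFromBTShared(btshared).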
     161             : 
     162             : /*
     163             :  * Status for leader in parallel index build.
     164             :  */
     165             : typedef struct BTLeader
     166             : {
     167             :     /* parallel context itself */
     168             :     ParallelContext *pcxt;
     169             : 
     170             :     /*
     171             :      * nparticipanttuplesorts is the exact number of worker processes
     172             :      * successfully launched, plus one leader process if it participates as a
     173             :      * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     174             :      * participating as a worker).
     175             :      */
     176             :     int         nparticipanttuplesorts;
     177             : 
     178             :     /*
     179             :      * Leader process convenience pointers to shared state (leader avoids TOC
     180             :      * lookups).
     181             :      *
     182             :      * btshared is the shared state for entire build.  sharedsort is the
     183             :      * shared, tuplesort-managed state passed to each process tuplesort.
     184             :      * sharedsort2 is the corresponding btspool2 shared state, used only when
     185             :      * building unique indexes.  snapshot is the snapshot used by the scan iff
     186             :      * an MVCC snapshot is required.
     187             :      */
     188             :     BTShared   *btshared;
     189             :     Sharedsort *sharedsort;
     190             :     Sharedsort *sharedsort2;
     191             :     Snapshot    snapshot;
     192             :     WalUsage   *walusage;
     193             :     BufferUsage *bufferusage;
     194             : } BTLeader;
     195             : 
     196             : /*
     197             :  * Working state for btbuild and its callback.
     198             :  *
     199             :  * When parallel CREATE INDEX is used, there is a BTBuildState for each
     200             :  * participant.
     201             :  */
     202             : typedef struct BTBuildState
     203             : {
     204             :     bool        isunique;
     205             :     bool        nulls_not_distinct;
     206             :     bool        havedead;
     207             :     Relation    heap;
     208             :     BTSpool    *spool;
     209             : 
     210             :     /*
     211             :      * spool2 is needed only when the index is a unique index. Dead tuples are
      212             :      * put into spool2 instead of spool in order to avoid the uniqueness check.
     213             :      */
     214             :     BTSpool    *spool2;
     215             :     double      indtuples;
     216             : 
     217             :     /*
     218             :      * btleader is only present when a parallel index build is performed, and
     219             :      * only in the leader process. (Actually, only the leader has a
     220             :      * BTBuildState.  Workers have their own spool and spool2, though.)
     221             :      */
     222             :     BTLeader   *btleader;
     223             : } BTBuildState;
     224             : 
     225             : /*
     226             :  * Status record for a btree page being built.  We have one of these
     227             :  * for each active tree level.
     228             :  */
     229             : typedef struct BTPageState
     230             : {
     231             :     BulkWriteBuffer btps_buf;   /* workspace for page building */
     232             :     BlockNumber btps_blkno;     /* block # to write this page at */
     233             :     IndexTuple  btps_lowkey;    /* page's strict lower bound pivot tuple */
     234             :     OffsetNumber btps_lastoff;  /* last item offset loaded */
     235             :     Size        btps_lastextra; /* last item's extra posting list space */
     236             :     uint32      btps_level;     /* tree level (0 = leaf) */
     237             :     Size        btps_full;      /* "full" if less than this much free space */
     238             :     struct BTPageState *btps_next;  /* link to parent level, if any */
     239             : } BTPageState;
     240             : 
     241             : /*
     242             :  * Overall status record for index writing phase.
     243             :  */
     244             : typedef struct BTWriteState
     245             : {
     246             :     Relation    heap;
     247             :     Relation    index;
     248             :     BulkWriteState *bulkstate;
     249             :     BTScanInsert inskey;        /* generic insertion scankey */
     250             :     BlockNumber btws_pages_alloced; /* # pages allocated */
     251             : } BTWriteState;
     252             : 
     253             : 
     254             : static double _bt_spools_heapscan(Relation heap, Relation index,
     255             :                                   BTBuildState *buildstate, IndexInfo *indexInfo);
     256             : static void _bt_spooldestroy(BTSpool *btspool);
     257             : static void _bt_spool(BTSpool *btspool, ItemPointer self,
     258             :                       Datum *values, bool *isnull);
     259             : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
     260             : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
     261             :                                bool *isnull, bool tupleIsAlive, void *state);
     262             : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
     263             : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
     264             : static void _bt_slideleft(Page rightmostpage);
     265             : static void _bt_sortaddtup(Page page, Size itemsize,
     266             :                            IndexTuple itup, OffsetNumber itup_off,
     267             :                            bool newfirstdataitem);
     268             : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
     269             :                          IndexTuple itup, Size truncextra);
     270             : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
     271             :                                           BTPageState *state,
     272             :                                           BTDedupState dstate);
     273             : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
     274             : static void _bt_load(BTWriteState *wstate,
     275             :                      BTSpool *btspool, BTSpool *btspool2);
     276             : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
     277             :                                int request);
     278             : static void _bt_end_parallel(BTLeader *btleader);
     279             : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
     280             : static double _bt_parallel_heapscan(BTBuildState *buildstate,
     281             :                                     bool *brokenhotchain);
     282             : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
     283             : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
     284             :                                        BTShared *btshared, Sharedsort *sharedsort,
     285             :                                        Sharedsort *sharedsort2, int sortmem,
     286             :                                        bool progress);
     287             : 
     288             : 
     289             : /*
     290             :  *  btbuild() -- build a new btree index.
     291             :  */
     292             : IndexBuildResult *
     293       45682 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     294             : {
     295             :     IndexBuildResult *result;
     296             :     BTBuildState buildstate;
     297             :     double      reltuples;
     298             : 
     299             : #ifdef BTREE_BUILD_STATS
     300             :     if (log_btree_build_stats)
     301             :         ResetUsage();
     302             : #endif                          /* BTREE_BUILD_STATS */
     303             : 
     304       45682 :     buildstate.isunique = indexInfo->ii_Unique;
     305       45682 :     buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     306       45682 :     buildstate.havedead = false;
     307       45682 :     buildstate.heap = heap;
     308       45682 :     buildstate.spool = NULL;
     309       45682 :     buildstate.spool2 = NULL;
     310       45682 :     buildstate.indtuples = 0;
     311       45682 :     buildstate.btleader = NULL;
     312             : 
     313             :     /*
     314             :      * We expect to be called exactly once for any index relation. If that's
     315             :      * not the case, big trouble's what we have.
     316             :      */
     317       45682 :     if (RelationGetNumberOfBlocks(index) != 0)
     318           0 :         elog(ERROR, "index \"%s\" already contains data",
     319             :              RelationGetRelationName(index));
     320             : 
     321       45682 :     reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
     322             : 
     323             :     /*
     324             :      * Finish the build by (1) completing the sort of the spool file, (2)
     325             :      * inserting the sorted tuples into btree pages and (3) building the upper
     326             :      * levels.  Finally, it may also be necessary to end use of parallelism.
     327             :      */
     328       45670 :     _bt_leafbuild(buildstate.spool, buildstate.spool2);
     329       45586 :     _bt_spooldestroy(buildstate.spool);
     330       45586 :     if (buildstate.spool2)
     331          18 :         _bt_spooldestroy(buildstate.spool2);
     332       45586 :     if (buildstate.btleader)
     333         142 :         _bt_end_parallel(buildstate.btleader);
     334             : 
     335       45586 :     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     336             : 
     337       45586 :     result->heap_tuples = reltuples;
     338       45586 :     result->index_tuples = buildstate.indtuples;
     339             : 
     340             : #ifdef BTREE_BUILD_STATS
     341             :     if (log_btree_build_stats)
     342             :     {
     343             :         ShowUsage("BTREE BUILD STATS");
     344             :         ResetUsage();
     345             :     }
     346             : #endif                          /* BTREE_BUILD_STATS */
     347             : 
     348       45586 :     return result;
     349             : }
     350             : 
     351             : /*
     352             :  * Create and initialize one or two spool structures, and save them in caller's
      353             :  * buildstate argument.  May also fill in fields within indexInfo used by index
     354             :  * builds.
     355             :  *
     356             :  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
     357             :  * routine encapsulates all aspects of managing parallelism.  Caller need only
     358             :  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
     359             :  *
     360             :  * Returns the total number of heap tuples scanned.
     361             :  */
     362             : static double
     363       45682 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
     364             :                     IndexInfo *indexInfo)
     365             : {
     366       45682 :     BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
     367       45682 :     SortCoordinate coordinate = NULL;
     368       45682 :     double      reltuples = 0;
     369             : 
     370             :     /*
     371             :      * We size the sort area as maintenance_work_mem rather than work_mem to
     372             :      * speed index creation.  This should be OK since a single backend can't
     373             :      * run multiple index creations in parallel (see also: notes on
     374             :      * parallelism and maintenance_work_mem below).
     375             :      */
     376       45682 :     btspool->heap = heap;
     377       45682 :     btspool->index = index;
     378       45682 :     btspool->isunique = indexInfo->ii_Unique;
     379       45682 :     btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     380             : 
     381             :     /* Save as primary spool */
     382       45682 :     buildstate->spool = btspool;
     383             : 
     384             :     /* Report table scan phase started */
     385       45682 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     386             :                                  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
     387             : 
     388             :     /* Attempt to launch parallel worker scan when required */
     389       45682 :     if (indexInfo->ii_ParallelWorkers > 0)
     390         142 :         _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
     391             :                            indexInfo->ii_ParallelWorkers);
     392             : 
     393             :     /*
     394             :      * If parallel build requested and at least one worker process was
     395             :      * successfully launched, set up coordination state
     396             :      */
     397       45682 :     if (buildstate->btleader)
     398             :     {
     399         142 :         coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     400         142 :         coordinate->isWorker = false;
     401         142 :         coordinate->nParticipants =
     402         142 :             buildstate->btleader->nparticipanttuplesorts;
     403         142 :         coordinate->sharedsort = buildstate->btleader->sharedsort;
     404             :     }
     405             : 
     406             :     /*
     407             :      * Begin serial/leader tuplesort.
     408             :      *
     409             :      * In cases where parallelism is involved, the leader receives the same
     410             :      * share of maintenance_work_mem as a serial sort (it is generally treated
     411             :      * in the same way as a serial sort once we return).  Parallel worker
     412             :      * Tuplesortstates will have received only a fraction of
     413             :      * maintenance_work_mem, though.
     414             :      *
     415             :      * We rely on the lifetime of the Leader Tuplesortstate almost not
     416             :      * overlapping with any worker Tuplesortstate's lifetime.  There may be
     417             :      * some small overlap, but that's okay because we rely on leader
     418             :      * Tuplesortstate only allocating a small, fixed amount of memory here.
     419             :      * When its tuplesort_performsort() is called (by our caller), and
     420             :      * significant amounts of memory are likely to be used, all workers must
     421             :      * have already freed almost all memory held by their Tuplesortstates
     422             :      * (they are about to go away completely, too).  The overall effect is
     423             :      * that maintenance_work_mem always represents an absolute high watermark
     424             :      * on the amount of memory used by a CREATE INDEX operation, regardless of
     425             :      * the use of parallelism or any other factor.
     426             :      */
     427       91364 :     buildstate->spool->sortstate =
     428       45682 :         tuplesort_begin_index_btree(heap, index, buildstate->isunique,
     429       45682 :                                     buildstate->nulls_not_distinct,
     430             :                                     maintenance_work_mem, coordinate,
     431             :                                     TUPLESORT_NONE);
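    /*
     * Editorial note, not part of nbtsort.c: as an illustration of the
     * budget described above, with maintenance_work_mem = 64MB and two
     * launched workers plus a participating leader (three participant
     * tuplesorts), each participant's sort runs within roughly 64MB / 3,
     * while the leader tuplesort begun just above keeps the full 64MB
     * budget but allocates very little of it until the workers are done.
     */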
     432             : 
     433             :     /*
     434             :      * If building a unique index, put dead tuples in a second spool to keep
     435             :      * them out of the uniqueness check.  We expect that the second spool (for
     436             :      * dead tuples) won't get very full, so we give it only work_mem.
     437             :      */
     438       45682 :     if (indexInfo->ii_Unique)
     439             :     {
     440       37066 :         BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
     441       37066 :         SortCoordinate coordinate2 = NULL;
     442             : 
     443             :         /* Initialize secondary spool */
     444       37066 :         btspool2->heap = heap;
     445       37066 :         btspool2->index = index;
     446       37066 :         btspool2->isunique = false;
     447             :         /* Save as secondary spool */
     448       37066 :         buildstate->spool2 = btspool2;
     449             : 
     450       37066 :         if (buildstate->btleader)
     451             :         {
     452             :             /*
     453             :              * Set up non-private state that is passed to
     454             :              * tuplesort_begin_index_btree() about the basic high level
     455             :              * coordination of a parallel sort.
     456             :              */
     457          64 :             coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     458          64 :             coordinate2->isWorker = false;
     459          64 :             coordinate2->nParticipants =
     460          64 :                 buildstate->btleader->nparticipanttuplesorts;
     461          64 :             coordinate2->sharedsort = buildstate->btleader->sharedsort2;
     462             :         }
     463             : 
     464             :         /*
     465             :          * We expect that the second one (for dead tuples) won't get very
     466             :          * full, so we give it only work_mem
     467             :          */
     468       37066 :         buildstate->spool2->sortstate =
     469       37066 :             tuplesort_begin_index_btree(heap, index, false, false, work_mem,
     470             :                                         coordinate2, TUPLESORT_NONE);
     471             :     }
     472             : 
     473             :     /* Fill spool using either serial or parallel heap scan */
     474       45682 :     if (!buildstate->btleader)
     475       45540 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     476             :                                            _bt_build_callback, (void *) buildstate,
     477             :                                            NULL);
     478             :     else
     479         142 :         reltuples = _bt_parallel_heapscan(buildstate,
     480             :                                           &indexInfo->ii_BrokenHotChain);
     481             : 
     482             :     /*
     483             :      * Set the progress target for the next phase.  Reset the block number
     484             :      * values set by table_index_build_scan
     485             :      */
     486             :     {
     487       45670 :         const int   progress_index[] = {
     488             :             PROGRESS_CREATEIDX_TUPLES_TOTAL,
     489             :             PROGRESS_SCAN_BLOCKS_TOTAL,
     490             :             PROGRESS_SCAN_BLOCKS_DONE
     491             :         };
     492       45670 :         const int64 progress_vals[] = {
     493       45670 :             buildstate->indtuples,
     494             :             0, 0
     495             :         };
     496             : 
     497       45670 :         pgstat_progress_update_multi_param(3, progress_index, progress_vals);
     498             :     }
     499             : 
     500             :     /* okay, all heap tuples are spooled */
     501       45670 :     if (buildstate->spool2 && !buildstate->havedead)
     502             :     {
     503             :         /* spool2 turns out to be unnecessary */
     504       37048 :         _bt_spooldestroy(buildstate->spool2);
     505       37048 :         buildstate->spool2 = NULL;
     506             :     }
     507             : 
     508       45670 :     return reltuples;
     509             : }
     510             : 
     511             : /*
     512             :  * clean up a spool structure and its substructures.
     513             :  */
     514             : static void
     515       82652 : _bt_spooldestroy(BTSpool *btspool)
     516             : {
     517       82652 :     tuplesort_end(btspool->sortstate);
     518       82652 :     pfree(btspool);
     519       82652 : }
     520             : 
     521             : /*
     522             :  * spool an index entry into the sort file.
     523             :  */
     524             : static void
     525    11947808 : _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
     526             : {
     527    11947808 :     tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
     528             :                                   self, values, isnull);
     529    11947808 : }
     530             : 
     531             : /*
     532             :  * given a spool loaded by successive calls to _bt_spool,
     533             :  * create an entire btree.
     534             :  */
     535             : static void
     536       45670 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     537             : {
     538             :     BTWriteState wstate;
     539             : 
     540             : #ifdef BTREE_BUILD_STATS
     541             :     if (log_btree_build_stats)
     542             :     {
     543             :         ShowUsage("BTREE BUILD (Spool) STATISTICS");
     544             :         ResetUsage();
     545             :     }
     546             : #endif                          /* BTREE_BUILD_STATS */
     547             : 
     548             :     /* Execute the sort */
     549       45670 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     550             :                                  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
     551       45670 :     tuplesort_performsort(btspool->sortstate);
     552       45586 :     if (btspool2)
     553             :     {
     554          18 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     555             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_2);
     556          18 :         tuplesort_performsort(btspool2->sortstate);
     557             :     }
     558             : 
     559       45586 :     wstate.heap = btspool->heap;
     560       45586 :     wstate.index = btspool->index;
     561       45586 :     wstate.inskey = _bt_mkscankey(wstate.index, NULL);
     562             :     /* _bt_mkscankey() won't set allequalimage without metapage */
     563       45586 :     wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
     564             : 
     565             :     /* reserve the metapage */
     566       45586 :     wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
     567             : 
     568       45586 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     569             :                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
     570       45586 :     _bt_load(&wstate, btspool, btspool2);
     571       45586 : }
     572             : 
     573             : /*
     574             :  * Per-tuple callback for table_index_build_scan
     575             :  */
     576             : static void
     577    11947808 : _bt_build_callback(Relation index,
     578             :                    ItemPointer tid,
     579             :                    Datum *values,
     580             :                    bool *isnull,
     581             :                    bool tupleIsAlive,
     582             :                    void *state)
     583             : {
     584    11947808 :     BTBuildState *buildstate = (BTBuildState *) state;
     585             : 
     586             :     /*
     587             :      * insert the index tuple into the appropriate spool file for subsequent
     588             :      * processing
     589             :      */
     590    11947808 :     if (tupleIsAlive || buildstate->spool2 == NULL)
     591    11947352 :         _bt_spool(buildstate->spool, tid, values, isnull);
     592             :     else
     593             :     {
     594             :         /* dead tuples are put into spool2 */
     595         456 :         buildstate->havedead = true;
     596         456 :         _bt_spool(buildstate->spool2, tid, values, isnull);
     597             :     }
     598             : 
     599    11947808 :     buildstate->indtuples += 1;
     600    11947808 : }
     601             : 
     602             : /*
     603             :  * allocate workspace for a new, clean btree page, not linked to any siblings.
     604             :  */
     605             : static BulkWriteBuffer
     606       48712 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
     607             : {
     608             :     BulkWriteBuffer buf;
     609             :     Page        page;
     610             :     BTPageOpaque opaque;
     611             : 
     612       48712 :     buf = smgr_bulk_get_buf(wstate->bulkstate);
     613       48712 :     page = (Page) buf;
     614             : 
     615             :     /* Zero the page and set up standard page header info */
     616       48712 :     _bt_pageinit(page, BLCKSZ);
     617             : 
     618             :     /* Initialize BT opaque state */
     619       48712 :     opaque = BTPageGetOpaque(page);
     620       48712 :     opaque->btpo_prev = opaque->btpo_next = P_NONE;
     621       48712 :     opaque->btpo_level = level;
     622       48712 :     opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     623       48712 :     opaque->btpo_cycleid = 0;
     624             : 
     625             :     /* Make the P_HIKEY line pointer appear allocated */
     626       48712 :     ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
     627             : 
     628       48712 :     return buf;
     629             : }
     630             : 
     631             : /*
     632             :  * emit a completed btree page, and release the working storage.
     633             :  */
     634             : static void
     635       94298 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
     636             : {
     637       94298 :     smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
     638             :     /* smgr_bulk_write took ownership of 'buf' */
     639       94298 : }
     640             : 
     641             : /*
     642             :  * allocate and initialize a new BTPageState.  the returned structure
     643             :  * is suitable for immediate use by _bt_buildadd.
     644             :  */
     645             : static BTPageState *
     646       11222 : _bt_pagestate(BTWriteState *wstate, uint32 level)
     647             : {
     648       11222 :     BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
     649             : 
     650             :     /* create initial page for level */
     651       11222 :     state->btps_buf = _bt_blnewpage(wstate, level);
     652             : 
     653             :     /* and assign it a page position */
     654       11222 :     state->btps_blkno = wstate->btws_pages_alloced++;
     655             : 
     656       11222 :     state->btps_lowkey = NULL;
     657             :     /* initialize lastoff so first item goes into P_FIRSTKEY */
     658       11222 :     state->btps_lastoff = P_HIKEY;
     659       11222 :     state->btps_lastextra = 0;
     660       11222 :     state->btps_level = level;
     661             :     /* set "full" threshold based on level.  See notes at head of file. */
     662       11222 :     if (level > 0)
     663        2266 :         state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
     664             :     else
     665        8956 :         state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
     666             : 
     667             :     /* no parent level, yet */
     668       11222 :     state->btps_next = NULL;
     669             : 
     670       11222 :     return state;
     671             : }
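A worked example of the "full" thresholds set just above (editorial note, assuming the default 8 kB BLCKSZ): an internal page gets BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100 = 8192 * 30 / 100 = 2457 bytes, so it is treated as full once less than about 2.4 kB of free space remains, while a leaf page at the default fillfactor of 90 gets BTGetTargetPageFreeSpace(index) = 8192 * 10 / 100 = 819 bytes.  Leaf pages are therefore packed considerably tighter, as the NOTES at the top of the file explain.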
     672             : 
     673             : /*
     674             :  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
     675             :  * P_HIKEY, overwriting P_HIKEY).
     676             :  *
     677             :  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
     678             :  * rightmost page on its level is not supposed to get a high key.  Now that
     679             :  * it's clear that this page is a rightmost page, remove the unneeded empty
     680             :  * P_HIKEY line pointer space.
     681             :  */
     682             : static void
     683       11222 : _bt_slideleft(Page rightmostpage)
     684             : {
     685             :     OffsetNumber off;
     686             :     OffsetNumber maxoff;
     687             :     ItemId      previi;
     688             : 
     689       11222 :     maxoff = PageGetMaxOffsetNumber(rightmostpage);
     690             :     Assert(maxoff >= P_FIRSTKEY);
     691       11222 :     previi = PageGetItemId(rightmostpage, P_HIKEY);
     692      650326 :     for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
     693             :     {
     694      639104 :         ItemId      thisii = PageGetItemId(rightmostpage, off);
     695             : 
     696      639104 :         *previi = *thisii;
     697      639104 :         previi = thisii;
     698             :     }
     699       11222 :     ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
     700       11222 : }
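Concretely (editorial note): if the rightmost page ends up with data items at linp1..linpN plus the unused linp0 that was reserved for a high key, the loop above copies linp1 into linp0, linp2 into linp1, and so on, and pd_lower is then pulled back by one ItemIdData; the rightmost page simply ends up without a high key, as required.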
     701             : 
     702             : /*
     703             :  * Add an item to a page being built.
     704             :  *
     705             :  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
     706             :  * raises an error directly.
     707             :  *
     708             :  * Note that our nbtsort.c caller does not know yet if the page will be
     709             :  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
     710             :  * caller.  Page that turns out to be the rightmost on its level is fixed by
     711             :  * calling _bt_slideleft().
     712             :  */
     713             : static void
     714    10721530 : _bt_sortaddtup(Page page,
     715             :                Size itemsize,
     716             :                IndexTuple itup,
     717             :                OffsetNumber itup_off,
     718             :                bool newfirstdataitem)
     719             : {
     720             :     IndexTupleData trunctuple;
     721             : 
     722    10721530 :     if (newfirstdataitem)
     723             :     {
     724        2390 :         trunctuple = *itup;
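        /*
         * Editorial note: truncating the copied tuple to zero attributes
         * turns it into the "minus infinity" item that begins every
         * internal page; only its downlink is ever used, never its key.
         */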
     725        2390 :         trunctuple.t_info = sizeof(IndexTupleData);
     726        2390 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
     727        2390 :         itup = &trunctuple;
     728        2390 :         itemsize = sizeof(IndexTupleData);
     729             :     }
     730             : 
     731    10721530 :     if (PageAddItem(page, (Item) itup, itemsize, itup_off,
     732             :                     false, false) == InvalidOffsetNumber)
     733           0 :         elog(ERROR, "failed to add item to the index page");
     734    10721530 : }
     735             : 
     736             : /*----------
     737             :  * Add an item to a disk page from the sort output (or add a posting list
     738             :  * item formed from the sort output).
     739             :  *
     740             :  * We must be careful to observe the page layout conventions of nbtsearch.c:
     741             :  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
     742             :  * - on non-leaf pages, the key portion of the first item need not be
     743             :  *   stored, we should store only the link.
     744             :  *
     745             :  * A leaf page being built looks like:
     746             :  *
     747             :  * +----------------+---------------------------------+
     748             :  * | PageHeaderData | linp0 linp1 linp2 ...           |
     749             :  * +-----------+----+---------------------------------+
     750             :  * | ... linpN |                                      |
     751             :  * +-----------+--------------------------------------+
     752             :  * |     ^ last                                       |
     753             :  * |                                                  |
     754             :  * +-------------+------------------------------------+
     755             :  * |             | itemN ...                          |
     756             :  * +-------------+------------------+-----------------+
     757             :  * |          ... item3 item2 item1 | "special space" |
     758             :  * +--------------------------------+-----------------+
     759             :  *
     760             :  * Contrast this with the diagram in bufpage.h; note the mismatch
     761             :  * between linps and items.  This is because we reserve linp0 as a
     762             :  * placeholder for the pointer to the "high key" item; when we have
     763             :  * filled up the page, we will set linp0 to point to itemN and clear
     764             :  * linpN.  On the other hand, if we find this is the last (rightmost)
     765             :  * page, we leave the items alone and slide the linp array over.  If
     766             :  * the high key is to be truncated, offset 1 is deleted, and we insert
     767             :  * the truncated high key at offset 1.
     768             :  *
     769             :  * 'last' pointer indicates the last offset added to the page.
     770             :  *
     771             :  * 'truncextra' is the size of the posting list in itup, if any.  This
     772             :  * information is stashed for the next call here, when we may benefit
     773             :  * from considering the impact of truncating away the posting list on
     774             :  * the page before deciding to finish the page off.  Posting lists are
     775             :  * often relatively large, so it is worth going to the trouble of
     776             :  * accounting for the saving from truncating away the posting list of
     777             :  * the tuple that becomes the high key (that may be the only way to
     778             :  * get close to target free space on the page).  Note that this is
     779             :  * only used for the soft fillfactor-wise limit, not the critical hard
     780             :  * limit.
     781             :  *----------
     782             :  */
     783             : static void
     784    10684040 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
     785             :              Size truncextra)
     786             : {
     787             :     BulkWriteBuffer nbuf;
     788             :     Page        npage;
     789             :     BlockNumber nblkno;
     790             :     OffsetNumber last_off;
     791             :     Size        last_truncextra;
     792             :     Size        pgspc;
     793             :     Size        itupsz;
     794             :     bool        isleaf;
     795             : 
     796             :     /*
     797             :      * This is a handy place to check for cancel interrupts during the btree
     798             :      * load phase of index creation.
     799             :      */
     800    10684040 :     CHECK_FOR_INTERRUPTS();
     801             : 
     802    10684040 :     nbuf = state->btps_buf;
     803    10684040 :     npage = (Page) nbuf;
     804    10684040 :     nblkno = state->btps_blkno;
     805    10684040 :     last_off = state->btps_lastoff;
     806    10684040 :     last_truncextra = state->btps_lastextra;
     807    10684040 :     state->btps_lastextra = truncextra;
     808             : 
     809    10684040 :     pgspc = PageGetFreeSpace(npage);
     810    10684040 :     itupsz = IndexTupleSize(itup);
     811    10684040 :     itupsz = MAXALIGN(itupsz);
     812             :     /* Leaf case has slightly different rules due to suffix truncation */
     813    10684040 :     isleaf = (state->btps_level == 0);
     814             : 
     815             :     /*
     816             :      * Check whether the new item can fit on a btree page on current level at
     817             :      * all.
     818             :      *
     819             :      * Every newly built index will treat heap TID as part of the keyspace,
     820             :      * which imposes the requirement that new high keys must occasionally have
     821             :      * a heap TID appended within _bt_truncate().  That may leave a new pivot
     822             :      * tuple one or two MAXALIGN() quantums larger than the original
     823             :      * firstright tuple it's derived from.  v4 deals with the problem by
     824             :      * decreasing the limit on the size of tuples inserted on the leaf level
     825             :      * by the same small amount.  Enforce the new v4+ limit on the leaf level,
     826             :      * and the old limit on internal levels, since pivot tuples may need to
     827             :      * make use of the reserved space.  This should never fail on internal
     828             :      * pages.
     829             :      */
     830    10684040 :     if (unlikely(itupsz > BTMaxItemSize(npage)))
     831         264 :         _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
     832             :                              itup);
     833             : 
     834             :     /*
     835             :      * Check to see if current page will fit new item, with space left over to
     836             :      * append a heap TID during suffix truncation when page is a leaf page.
     837             :      *
     838             :      * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
     839             :      * high key with heap TID when finishing off a leaf page, since we rely on
     840             :      * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
     841             :      * internal pages we can always fit 3 pivot tuples with larger internal
     842             :      * page tuple limit (includes page high key).
     843             :      *
     844             :      * Most of the time, a page is only "full" in the sense that the soft
     845             :      * fillfactor-wise limit has been exceeded.  However, we must always leave
     846             :      * at least two items plus a high key on each page before starting a new
     847             :      * page.  Disregard fillfactor and insert on "full" current page if we
     848             :      * don't have the minimum number of items yet.  (Note that we deliberately
     849             :      * assume that suffix truncation neither enlarges nor shrinks new high key
     850             :      * when applying soft limit, except when last tuple has a posting list.)
     851             :      */
     852             :     Assert(last_truncextra == 0 || isleaf);
     853    10684040 :     if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
     854    10682892 :         (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
     855             :     {
     856             :         /*
     857             :          * Finish off the page and write it out.
     858             :          */
     859       37490 :         BulkWriteBuffer obuf = nbuf;
     860       37490 :         Page        opage = npage;
     861       37490 :         BlockNumber oblkno = nblkno;
     862             :         ItemId      ii;
     863             :         ItemId      hii;
     864             :         IndexTuple  oitup;
     865             : 
     866             :         /* Create new page of same level */
     867       37490 :         nbuf = _bt_blnewpage(wstate, state->btps_level);
     868       37490 :         npage = (Page) nbuf;
     869             : 
     870             :         /* and assign it a page position */
     871       37490 :         nblkno = wstate->btws_pages_alloced++;
     872             : 
     873             :         /*
     874             :          * We copy the last item on the page into the new page, and then
     875             :          * rearrange the old page so that the 'last item' becomes its high key
     876             :          * rather than a true data item.  There had better be at least two
     877             :          * items on the page already, else the page would be empty of useful
     878             :          * data.
     879             :          */
     880             :         Assert(last_off > P_FIRSTKEY);
     881       37490 :         ii = PageGetItemId(opage, last_off);
     882       37490 :         oitup = (IndexTuple) PageGetItem(opage, ii);
     883       37490 :         _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
     884       37490 :                        !isleaf);
     885             : 
     886             :         /*
     887             :          * Move 'last' into the high key position on opage.  _bt_blnewpage()
     888             :          * allocated empty space for a line pointer when opage was first
     889             :          * created, so this is a matter of rearranging already-allocated space
     890             :          * on page, and initializing high key line pointer. (Actually, leaf
     891             :          * pages must also swap oitup with a truncated version of oitup, which
     892             :          * is sometimes larger than oitup, though never by more than the space
     893             :          * needed to append a heap TID.)
     894             :          */
     895       37490 :         hii = PageGetItemId(opage, P_HIKEY);
     896       37490 :         *hii = *ii;
     897       37490 :         ItemIdSetUnused(ii);    /* redundant */
     898       37490 :         ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
     899             : 
     900       37490 :         if (isleaf)
     901             :         {
     902             :             IndexTuple  lastleft;
     903             :             IndexTuple  truncated;
     904             : 
     905             :             /*
     906             :              * Truncate away any unneeded attributes from high key on leaf
     907             :              * level.  This is only done at the leaf level because downlinks
     908             :              * in internal pages are either negative infinity items, or get
     909             :              * their contents from copying from one level down.  See also:
     910             :              * _bt_split().
     911             :              *
     912             :              * We don't try to bias our choice of split point to make it more
     913             :              * likely that _bt_truncate() can truncate away more attributes,
     914             :              * whereas the split point used within _bt_split() is chosen much
     915             :              * more delicately.  Even still, the lastleft and firstright
     916             :              * tuples passed to _bt_truncate() here are at least not fully
     917             :              * equal to each other when deduplication is used, unless there is
     918             :              * a large group of duplicates (also, unique index builds usually
     919             :              * have few or no spool2 duplicates).  When the split point is
     920             :              * between two unequal tuples, _bt_truncate() will avoid including
     921             :              * a heap TID in the new high key, which is the most important
     922             :              * benefit of suffix truncation.
     923             :              *
     924             :              * Overwrite the old item with new truncated high key directly.
     925             :              * oitup is already located at the physical beginning of tuple
     926             :              * space, so this should directly reuse the existing tuple space.
     927             :              */
     928       37366 :             ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
     929       37366 :             lastleft = (IndexTuple) PageGetItem(opage, ii);
     930             : 
     931             :             Assert(IndexTupleSize(oitup) > last_truncextra);
     932       37366 :             truncated = _bt_truncate(wstate->index, lastleft, oitup,
     933             :                                      wstate->inskey);
     934       37366 :             if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
     935       37366 :                                          IndexTupleSize(truncated)))
     936           0 :                 elog(ERROR, "failed to add high key to the index page");
     937       37366 :             pfree(truncated);
     938             : 
     939             :             /* oitup should continue to point to the page's high key */
     940       37366 :             hii = PageGetItemId(opage, P_HIKEY);
     941       37366 :             oitup = (IndexTuple) PageGetItem(opage, hii);
     942             :         }
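        /*
         * Editorial illustration (not part of nbtsort.c): the essence of the
         * suffix truncation performed above is to keep only as many leading
         * key attributes as are needed to tell the last tuple on the finished
         * page apart from the first tuple on the new page.  A minimal sketch,
         * assuming hypothetical tuples represented as plain int arrays (the
         * real code compares datums via _bt_keep_natts_fast()):
         *
         *     // Number of leading attributes the new high key must keep;
         *     // returns nkeyatts + 1 when the tuples are fully equal.
         *     static int
         *     keep_natts(const int *lastleft, const int *firstright, int nkeyatts)
         *     {
         *         int     natts = 1;
         *
         *         for (int i = 0; i < nkeyatts; i++, natts++)
         *         {
         *             if (lastleft[i] != firstright[i])
         *                 break;
         *         }
         *         return natts;
         *     }
         *
         * Only when the result exceeds nkeyatts (all key attributes equal)
         * does _bt_truncate() have to add a heap TID to keep the new high
         * key distinct.
         */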
     943             : 
     944             :         /*
     945             :          * Link the old page into its parent, using its low key.  If we don't
     946             :          * have a parent, we have to create one; this adds a new btree level.
     947             :          */
     948       37490 :         if (state->btps_next == NULL)
     949        2266 :             state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
     950             : 
     951             :         Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
     952             :                 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
     953             :                 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
     954             :                P_LEFTMOST(BTPageGetOpaque(opage)));
     955             :         Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
     956             :                !P_LEFTMOST(BTPageGetOpaque(opage)));
     957       37490 :         BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
     958       37490 :         _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
     959       37490 :         pfree(state->btps_lowkey);
     960             : 
     961             :         /*
     962             :          * Save a copy of the high key from the old page.  It is also the low
     963             :          * key for the new page.
     964             :          */
     965       37490 :         state->btps_lowkey = CopyIndexTuple(oitup);
     966             : 
     967             :         /*
     968             :          * Set the sibling links for both pages.
     969             :          */
     970             :         {
     971       37490 :             BTPageOpaque oopaque = BTPageGetOpaque(opage);
     972       37490 :             BTPageOpaque nopaque = BTPageGetOpaque(npage);
     973             : 
     974       37490 :             oopaque->btpo_next = nblkno;
     975       37490 :             nopaque->btpo_prev = oblkno;
     976       37490 :             nopaque->btpo_next = P_NONE; /* redundant */
     977             :         }
     978             : 
     979             :         /*
     980             :          * Write out the old page. _bt_blwritepage takes ownership of the
     981             :          * 'opage' buffer.
     982             :          */
     983       37490 :         _bt_blwritepage(wstate, obuf, oblkno);
     984             : 
     985             :         /*
     986             :          * Reset last_off to point to new page
     987             :          */
     988       37490 :         last_off = P_FIRSTKEY;
     989             :     }
     990             : 
     991             :     /*
     992             :      * By here, either original page is still the current page, or a new page
     993             :      * was created that became the current page.  Either way, the current page
     994             :      * definitely has space for the new item.
     995             :      *
     996             :      * If the new item is the first for its page, it must also be the first
     997             :      * item on its entire level.  On later same-level pages, a low key for a
     998             :      * page will be copied from the prior page in the code above.  Generate a
     999             :      * minus infinity low key here instead.
    1000             :      */
    1001    10684040 :     if (last_off == P_HIKEY)
    1002             :     {
    1003             :         Assert(state->btps_lowkey == NULL);
    1004       11222 :         state->btps_lowkey = palloc0(sizeof(IndexTupleData));
    1005       11222 :         state->btps_lowkey->t_info = sizeof(IndexTupleData);
    1006       11222 :         BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
    1007             :     }
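    /*
     * Editorial note (not part of nbtsort.c): the "minus infinity" low key
     * built above is nothing but an IndexTupleData header -- t_info's size
     * bits claim sizeof(IndexTupleData) bytes (8 bytes: a 6-byte heap TID
     * plus the 2-byte t_info word) and BTreeTupleSetNAtts() marks it as
     * having zero key attributes.  When this tuple later becomes the first
     * data item of a parent page, nbtree's search code treats it as smaller
     * than every possible key without ever examining its contents, which is
     * exactly what the leftmost page of a level needs for its low key.
     */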
    1008             : 
    1009             :     /*
    1010             :      * Add the new item into the current page.
    1011             :      */
    1012    10684040 :     last_off = OffsetNumberNext(last_off);
    1013    10684040 :     _bt_sortaddtup(npage, itupsz, itup, last_off,
    1014    10684040 :                    !isleaf && last_off == P_FIRSTKEY);
    1015             : 
    1016    10684040 :     state->btps_buf = nbuf;
    1017    10684040 :     state->btps_blkno = nblkno;
    1018    10684040 :     state->btps_lastoff = last_off;
    1019    10684040 : }
    1020             : 
    1021             : /*
    1022             :  * Finalize pending posting list tuple, and add it to the index.  Final tuple
    1023             :  * is based on saved base tuple, and saved list of heap TIDs.
    1024             :  *
    1025             :  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
    1026             :  * using _bt_buildadd().
    1027             :  */
    1028             : static void
    1029     4676382 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
    1030             :                               BTDedupState dstate)
    1031             : {
    1032             :     Assert(dstate->nitems > 0);
    1033             : 
    1034     4676382 :     if (dstate->nitems == 1)
    1035     4638660 :         _bt_buildadd(wstate, state, dstate->base, 0);
    1036             :     else
    1037             :     {
    1038             :         IndexTuple  postingtuple;
    1039             :         Size        truncextra;
    1040             : 
    1041             :         /* form a tuple with a posting list */
    1042       37722 :         postingtuple = _bt_form_posting(dstate->base,
    1043             :                                         dstate->htids,
    1044             :                                         dstate->nhtids);
    1045             :         /* Calculate posting list overhead */
    1046       75444 :         truncextra = IndexTupleSize(postingtuple) -
    1047       37722 :             BTreeTupleGetPostingOffset(postingtuple);
    1048             : 
    1049       37722 :         _bt_buildadd(wstate, state, postingtuple, truncextra);
    1050       37722 :         pfree(postingtuple);
    1051             :     }
    1052             : 
    1053     4676382 :     dstate->nmaxitems = 0;
    1054     4676382 :     dstate->nhtids = 0;
    1055     4676382 :     dstate->nitems = 0;
    1056     4676382 :     dstate->phystupsize = 0;
    1057     4676382 : }
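/*
 * Editorial illustration (not part of nbtsort.c): the truncextra value passed
 * to _bt_buildadd() above is just the size of the posting-list portion of the
 * tuple, roughly MAXALIGN(nhtids * sizeof(ItemPointerData)).  For example,
 * with 8-byte MAXALIGN and 100 pending heap TIDs:
 *
 *     truncextra = MAXALIGN(100 * 6) = 600 bytes
 *
 * Roughly speaking, _bt_buildadd() discounts this overhead when checking the
 * page against its fill-factor target, because the high key formed from the
 * last tuple is suffix-truncated and never carries the posting list.
 */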
    1058             : 
    1059             : /*
    1060             :  * Finish writing out the completed btree.
    1061             :  */
    1062             : static void
    1063       45586 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
    1064             : {
    1065             :     BTPageState *s;
    1066       45586 :     BlockNumber rootblkno = P_NONE;
    1067       45586 :     uint32      rootlevel = 0;
    1068             :     BulkWriteBuffer metabuf;
    1069             : 
    1070             :     /*
    1071             :      * Each iteration of this loop completes one more level of the tree.
    1072             :      */
    1073       56808 :     for (s = state; s != NULL; s = s->btps_next)
    1074             :     {
    1075             :         BlockNumber blkno;
    1076             :         BTPageOpaque opaque;
    1077             : 
    1078       11222 :         blkno = s->btps_blkno;
    1079       11222 :         opaque = BTPageGetOpaque((Page) s->btps_buf);
    1080             : 
    1081             :         /*
    1082             :          * We have to link the last page on this level to somewhere.
    1083             :          *
    1084             :          * If we're at the top, it's the root, so attach it to the metapage.
    1085             :          * Otherwise, add an entry for it to its parent using its low key.
    1086             :          * This may cause the last page of the parent level to split, but
    1087             :          * that's not a problem -- we haven't gotten to it yet.
    1088             :          */
    1089       11222 :         if (s->btps_next == NULL)
    1090             :         {
    1091        8956 :             opaque->btpo_flags |= BTP_ROOT;
    1092        8956 :             rootblkno = blkno;
    1093        8956 :             rootlevel = s->btps_level;
    1094             :         }
    1095             :         else
    1096             :         {
    1097             :             Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
    1098             :                     IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1099             :                     BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
    1100             :                    P_LEFTMOST(opaque));
    1101             :             Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
    1102             :                    !P_LEFTMOST(opaque));
    1103        2266 :             BTreeTupleSetDownLink(s->btps_lowkey, blkno);
    1104        2266 :             _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
    1105        2266 :             pfree(s->btps_lowkey);
    1106        2266 :             s->btps_lowkey = NULL;
    1107             :         }
    1108             : 
    1109             :         /*
    1110             :          * This is the rightmost page, so the ItemId array needs to be slid
    1111             :          * back one slot.  Then we can dump out the page.
    1112             :          */
    1113       11222 :         _bt_slideleft((Page) s->btps_buf);
    1114       11222 :         _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
    1115       11222 :         s->btps_buf = NULL;      /* writepage took ownership of the buffer */
    1116             :     }
    1117             : 
    1118             :     /*
    1119             :      * As the last step in the process, construct the metapage and make it
    1120             :      * point to the new root (unless we had no data at all, in which case it's
    1121             :      * set to point to "P_NONE").  This changes the index to the "valid" state
    1122             :      * by filling in a valid magic number in the metapage.
    1123             :      */
    1124       45586 :     metabuf = smgr_bulk_get_buf(wstate->bulkstate);
    1125       45586 :     _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
    1126       45586 :                      wstate->inskey->allequalimage);
    1127       45586 :     _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
    1128       45586 : }
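/*
 * Editorial note (not part of nbtsort.c): because each non-top level in the
 * loop above links its final page into the level immediately above it, the
 * tree is finished strictly bottom-up; the level whose btps_next is NULL is
 * by construction the highest one, and its last page becomes the root.  If
 * the sort produced no tuples at all, state is NULL, the loop never runs,
 * and the metapage is written with rootblkno = P_NONE and rootlevel = 0 --
 * a valid but empty index whose root page is only created later, when the
 * first tuple is inserted.
 */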
    1129             : 
    1130             : /*
    1131             :  * Read tuples in correct sort order from tuplesort, and load them into
    1132             :  * btree leaves.
    1133             :  */
    1134             : static void
    1135       45586 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
    1136             : {
    1137       45586 :     BTPageState *state = NULL;
    1138       45586 :     bool        merge = (btspool2 != NULL);
    1139             :     IndexTuple  itup,
    1140       45586 :                 itup2 = NULL;
    1141             :     bool        load1;
    1142       45586 :     TupleDesc   tupdes = RelationGetDescr(wstate->index);
    1143             :     int         i,
    1144       45586 :                 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
    1145             :     SortSupport sortKeys;
    1146       45586 :     int64       tuples_done = 0;
    1147             :     bool        deduplicate;
    1148             : 
    1149       45586 :     wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
    1150             : 
    1151       53956 :     deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
    1152        8370 :         BTGetDeduplicateItems(wstate->index);
    1153             : 
    1154       45586 :     if (merge)
    1155             :     {
    1156             :         /*
    1157             :          * Another BTSpool for dead tuples exists. Now we have to merge
    1158             :          * btspool and btspool2.
    1159             :          */
    1160             : 
    1161             :         /* the preparation of merge */
    1162          18 :         itup = tuplesort_getindextuple(btspool->sortstate, true);
    1163          18 :         itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1164             : 
    1165             :         /* Prepare SortSupport data for each column */
    1166          18 :         sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
    1167             : 
    1168          38 :         for (i = 0; i < keysz; i++)
    1169             :         {
    1170          20 :             SortSupport sortKey = sortKeys + i;
    1171          20 :             ScanKey     scanKey = wstate->inskey->scankeys + i;
    1172             :             int16       strategy;
    1173             : 
    1174          20 :             sortKey->ssup_cxt = CurrentMemoryContext;
    1175          20 :             sortKey->ssup_collation = scanKey->sk_collation;
    1176          20 :             sortKey->ssup_nulls_first =
    1177          20 :                 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
    1178          20 :             sortKey->ssup_attno = scanKey->sk_attno;
    1179             :             /* Abbreviation is not supported here */
    1180          20 :             sortKey->abbreviate = false;
    1181             : 
    1182             :             Assert(sortKey->ssup_attno != 0);
    1183             : 
    1184          20 :             strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
    1185             :                 BTGreaterStrategyNumber : BTLessStrategyNumber;
    1186             : 
    1187          20 :             PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
    1188             :         }
    1189             : 
    1190             :         for (;;)
    1191             :         {
    1192        3270 :             load1 = true;       /* load BTSpool next ? */
    1193        3270 :             if (itup2 == NULL)
    1194             :             {
    1195         150 :                 if (itup == NULL)
    1196          18 :                     break;
    1197             :             }
    1198        3120 :             else if (itup != NULL)
    1199             :             {
    1200        2950 :                 int32       compare = 0;
    1201             : 
    1202        3178 :                 for (i = 1; i <= keysz; i++)
    1203             :                 {
    1204             :                     SortSupport entry;
    1205             :                     Datum       attrDatum1,
    1206             :                                 attrDatum2;
    1207             :                     bool        isNull1,
    1208             :                                 isNull2;
    1209             : 
    1210        2998 :                     entry = sortKeys + i - 1;
    1211        2998 :                     attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
    1212        2998 :                     attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
    1213             : 
    1214        2998 :                     compare = ApplySortComparator(attrDatum1, isNull1,
    1215             :                                                   attrDatum2, isNull2,
    1216             :                                                   entry);
    1217        2998 :                     if (compare > 0)
    1218             :                     {
    1219         256 :                         load1 = false;
    1220        2770 :                         break;
    1221             :                     }
    1222        2742 :                     else if (compare < 0)
    1223        2514 :                         break;
    1224             :                 }
    1225             : 
    1226             :                 /*
    1227             :                  * If key values are equal, we sort on ItemPointer.  This is
    1228             :                  * required for btree indexes, since heap TID is treated as an
    1229             :                  * implicit last key attribute in order to ensure that all
    1230             :                  * keys in the index are physically unique.
    1231             :                  */
    1232        2950 :                 if (compare == 0)
    1233             :                 {
    1234         180 :                     compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
    1235             :                     Assert(compare != 0);
    1236         180 :                     if (compare > 0)
    1237          30 :                         load1 = false;
    1238             :                 }
    1239             :             }
    1240             :             else
    1241         170 :                 load1 = false;
    1242             : 
    1243             :             /* When we see first tuple, create first index page */
    1244        3252 :             if (state == NULL)
    1245          18 :                 state = _bt_pagestate(wstate, 0);
    1246             : 
    1247        3252 :             if (load1)
    1248             :             {
    1249        2796 :                 _bt_buildadd(wstate, state, itup, 0);
    1250        2796 :                 itup = tuplesort_getindextuple(btspool->sortstate, true);
    1251             :             }
    1252             :             else
    1253             :             {
    1254         456 :                 _bt_buildadd(wstate, state, itup2, 0);
    1255         456 :                 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1256             :             }
    1257             : 
    1258             :             /* Report progress */
    1259        3252 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1260             :                                          ++tuples_done);
    1261             :         }
    1262          18 :         pfree(sortKeys);
    1263             :     }
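    /*
     * Editorial illustration (not part of nbtsort.c): the branch above is a
     * classic two-way merge of two sorted streams, with the heap TID as a
     * tie-breaker so the output stays in full (key, heap TID) order.  A
     * minimal sketch of the per-iteration decision, using hypothetical
     * helpers cmp_keys() and cmp_tids() in place of ApplySortComparator()
     * and ItemPointerCompare():
     *
     *     // true => consume the next tuple from spool 1, false => spool 2;
     *     // the caller stops before calling this once both are NULL.
     *     static bool
     *     take_from_spool1(IndexTuple a, IndexTuple b)
     *     {
     *         int     c;
     *
     *         if (b == NULL)
     *             return true;        // spool 2 exhausted
     *         if (a == NULL)
     *             return false;       // spool 1 exhausted
     *         c = cmp_keys(a, b);     // compare key attributes, column by column
     *         if (c == 0)
     *             c = cmp_tids(a, b); // never zero: heap TIDs are unique
     *         return c < 0;
     *     }
     */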
    1264       45568 :     else if (deduplicate)
    1265             :     {
    1266             :         /* merge is unnecessary, deduplicate into posting lists */
    1267             :         BTDedupState dstate;
    1268             : 
    1269        8370 :         dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
    1270        8370 :         dstate->deduplicate = true; /* unused */
    1271        8370 :         dstate->nmaxitems = 0;   /* unused */
    1272        8370 :         dstate->maxpostingsize = 0; /* set later */
    1273             :         /* Metadata about base tuple of current pending posting list */
    1274        8370 :         dstate->base = NULL;
    1275        8370 :         dstate->baseoff = InvalidOffsetNumber;   /* unused */
    1276        8370 :         dstate->basetupsize = 0;
    1277             :         /* Metadata about current pending posting list TIDs */
    1278        8370 :         dstate->htids = NULL;
    1279        8370 :         dstate->nhtids = 0;
    1280        8370 :         dstate->nitems = 0;
    1281        8370 :         dstate->phystupsize = 0; /* unused */
    1282        8370 :         dstate->nintervals = 0; /* unused */
    1283             : 
    1284     5987898 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1285             :                                                true)) != NULL)
    1286             :         {
    1287             :             /* When we see first tuple, create first index page */
    1288     5979528 :             if (state == NULL)
    1289             :             {
    1290        2296 :                 state = _bt_pagestate(wstate, 0);
    1291             : 
    1292             :                 /*
    1293             :                  * Limit size of posting list tuples to 1/10 of the space we want to
    1294             :                  * leave behind on the page, plus space for the final item's line
    1295             :                  * pointer.  This is equal to the space that we'd like to
    1296             :                  * leave behind on each leaf page when fillfactor is 90,
    1297             :                  * allowing us to get close to fillfactor% space utilization
    1298             :                  * when there happen to be a great many duplicates.  (This
    1299             :                  * makes higher leaf fillfactor settings ineffective when
    1300             :                  * building indexes that have many duplicates, but packing
    1301             :                  * leaf pages full with few very large tuples doesn't seem
    1302             :                  * like a useful goal.)
    1303             :                  */
    1304        2296 :                 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
    1305             :                     sizeof(ItemIdData);
    1306             :                 Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) &&
    1307             :                        dstate->maxpostingsize <= INDEX_SIZE_MASK);
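                /*
                 * Editorial illustration (not part of nbtsort.c): with the
                 * default BLCKSZ of 8192 and 8-byte MAXALIGN, the cap set
                 * above works out to
                 *
                 *     MAXALIGN_DOWN(8192 * 10 / 100) - sizeof(ItemIdData)
                 *         = MAXALIGN_DOWN(819) - 4
                 *         = 816 - 4
                 *         = 812 bytes
                 *
                 * so a single posting-list tuple may occupy at most roughly
                 * 10% of a page -- the same amount of free space a leaf page
                 * is left with under the default 90% fillfactor.
                 */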
    1308        2296 :                 dstate->htids = palloc(dstate->maxpostingsize);
    1309             : 
    1310             :                 /* start new pending posting list with itup copy */
    1311        2296 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1312             :                                         InvalidOffsetNumber);
    1313             :             }
    1314     5977232 :             else if (_bt_keep_natts_fast(wstate->index, dstate->base,
    1315     1310846 :                                          itup) > keysz &&
    1316     1310846 :                      _bt_dedup_save_htid(dstate, itup))
    1317             :             {
    1318             :                 /*
    1319             :                  * Tuple is equal to base tuple of pending posting list.  Heap
    1320             :                  * TID from itup has been saved in state.
    1321             :                  */
    1322             :             }
    1323             :             else
    1324             :             {
    1325             :                 /*
    1326             :                  * Tuple is not equal to pending posting list tuple, or
    1327             :                  * _bt_dedup_save_htid() opted to not merge current item into
    1328             :                  * pending posting list.
    1329             :                  */
    1330     4674086 :                 _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1331     4674086 :                 pfree(dstate->base);
    1332             : 
    1333             :                 /* start new pending posting list with itup copy */
    1334     4674086 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1335             :                                         InvalidOffsetNumber);
    1336             :             }
    1337             : 
    1338             :             /* Report progress */
    1339     5979528 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1340             :                                          ++tuples_done);
    1341             :         }
    1342             : 
    1343        8370 :         if (state)
    1344             :         {
    1345             :             /*
    1346             :              * Handle the last item (there must be a last item when the
    1347             :              * tuplesort returned one or more tuples)
    1348             :              */
    1349        2296 :             _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1350        2296 :             pfree(dstate->base);
    1351        2296 :             pfree(dstate->htids);
    1352             :         }
    1353             : 
    1354        8370 :         pfree(dstate);
    1355             :     }
    1356             :     else
    1357             :     {
    1358             :         /* merging and deduplication are both unnecessary */
    1359     6001848 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1360             :                                                true)) != NULL)
    1361             :         {
    1362             :             /* When we see first tuple, create first index page */
    1363     5964650 :             if (state == NULL)
    1364        6642 :                 state = _bt_pagestate(wstate, 0);
    1365             : 
    1366     5964650 :             _bt_buildadd(wstate, state, itup, 0);
    1367             : 
    1368             :             /* Report progress */
    1369     5964650 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1370             :                                          ++tuples_done);
    1371             :         }
    1372             :     }
    1373             : 
    1374             :     /* Close down final pages and write the metapage */
    1375       45586 :     _bt_uppershutdown(wstate, state);
    1376       45586 :     smgr_bulk_finish(wstate->bulkstate);
    1377       45586 : }
    1378             : 
    1379             : /*
    1380             :  * Create parallel context, and launch workers for leader.
    1381             :  *
    1382             :  * buildstate argument should be initialized (with the exception of the
    1383             :  * tuplesort state in spools, which may later be created based on shared
    1384             :  * state initially set up here).
    1385             :  *
    1386             :  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
    1387             :  *
    1388             :  * request is the target number of parallel worker processes to launch.
    1389             :  *
    1390             :  * Sets buildstate's BTLeader, which caller must use to shut down parallel
    1391             :  * mode by passing it to _bt_end_parallel() at the very end of its index
    1392             :  * build.  If not even a single worker process can be launched, this is
    1393             :  * never set, and caller should proceed with a serial index build.
    1394             :  */
    1395             : static void
    1396         142 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
    1397             : {
    1398             :     ParallelContext *pcxt;
    1399             :     int         scantuplesortstates;
    1400             :     Snapshot    snapshot;
    1401             :     Size        estbtshared;
    1402             :     Size        estsort;
    1403             :     BTShared   *btshared;
    1404             :     Sharedsort *sharedsort;
    1405             :     Sharedsort *sharedsort2;
    1406         142 :     BTSpool    *btspool = buildstate->spool;
    1407         142 :     BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
    1408             :     WalUsage   *walusage;
    1409             :     BufferUsage *bufferusage;
    1410         142 :     bool        leaderparticipates = true;
    1411             :     int         querylen;
    1412             : 
    1413             : #ifdef DISABLE_LEADER_PARTICIPATION
    1414             :     leaderparticipates = false;
    1415             : #endif
    1416             : 
    1417             :     /*
    1418             :      * Enter parallel mode, and create context for parallel build of btree
    1419             :      * index
    1420             :      */
    1421         142 :     EnterParallelMode();
    1422             :     Assert(request > 0);
    1423         142 :     pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
    1424             :                                  request);
    1425             : 
    1426         142 :     scantuplesortstates = leaderparticipates ? request + 1 : request;
    1427             : 
    1428             :     /*
    1429             :      * Prepare for scan of the base relation.  In a normal index build, we use
    1430             :      * SnapshotAny because we must retrieve all tuples and do our own time
    1431             :      * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
    1432             :      * concurrent build, we take a regular MVCC snapshot and index whatever's
    1433             :      * live according to that.
    1434             :      */
    1435         142 :     if (!isconcurrent)
    1436         142 :         snapshot = SnapshotAny;
    1437             :     else
    1438           0 :         snapshot = RegisterSnapshot(GetTransactionSnapshot());
    1439             : 
    1440             :     /*
    1441             :      * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
    1442             :      * PARALLEL_KEY_TUPLESORT tuplesort workspace
    1443             :      */
    1444         142 :     estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
    1445         142 :     shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    1446         142 :     estsort = tuplesort_estimate_shared(scantuplesortstates);
    1447         142 :     shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1448             : 
    1449             :     /*
    1450             :      * Unique case requires a second spool, and so we may have to account for
    1451             :      * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
    1452             :      */
    1453         142 :     if (!btspool->isunique)
    1454          78 :         shm_toc_estimate_keys(&pcxt->estimator, 2);
    1455             :     else
    1456             :     {
    1457          64 :         shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1458          64 :         shm_toc_estimate_keys(&pcxt->estimator, 3);
    1459             :     }
    1460             : 
    1461             :     /*
    1462             :      * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
    1463             :      * and PARALLEL_KEY_BUFFER_USAGE.
    1464             :      *
    1465             :      * If there are no extensions loaded that care, we could skip this.  We
    1466             :      * have no way of knowing whether anyone's looking at pgWalUsage or
    1467             :      * pgBufferUsage, so do it unconditionally.
    1468             :      */
    1469         142 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1470             :                            mul_size(sizeof(WalUsage), pcxt->nworkers));
    1471         142 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1472         142 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1473             :                            mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1474         142 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1475             : 
    1476             :     /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    1477         142 :     if (debug_query_string)
    1478             :     {
    1479         142 :         querylen = strlen(debug_query_string);
    1480         142 :         shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
    1481         142 :         shm_toc_estimate_keys(&pcxt->estimator, 1);
    1482             :     }
    1483             :     else
    1484           0 :         querylen = 0;           /* keep compiler quiet */
    1485             : 
    1486             :     /* Everyone's had a chance to ask for space, so now create the DSM */
    1487         142 :     InitializeParallelDSM(pcxt);
    1488             : 
    1489             :     /* If no DSM segment was available, back out (do serial build) */
    1490         142 :     if (pcxt->seg == NULL)
    1491             :     {
    1492           0 :         if (IsMVCCSnapshot(snapshot))
    1493           0 :             UnregisterSnapshot(snapshot);
    1494           0 :         DestroyParallelContext(pcxt);
    1495           0 :         ExitParallelMode();
    1496           0 :         return;
    1497             :     }
    1498             : 
    1499             :     /* Store shared build state, for which we reserved space */
    1500         142 :     btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    1501             :     /* Initialize immutable state */
    1502         142 :     btshared->heaprelid = RelationGetRelid(btspool->heap);
    1503         142 :     btshared->indexrelid = RelationGetRelid(btspool->index);
    1504         142 :     btshared->isunique = btspool->isunique;
    1505         142 :     btshared->nulls_not_distinct = btspool->nulls_not_distinct;
    1506         142 :     btshared->isconcurrent = isconcurrent;
    1507         142 :     btshared->scantuplesortstates = scantuplesortstates;
    1508         142 :     ConditionVariableInit(&btshared->workersdonecv);
    1509         142 :     SpinLockInit(&btshared->mutex);
    1510             :     /* Initialize mutable state */
    1511         142 :     btshared->nparticipantsdone = 0;
    1512         142 :     btshared->reltuples = 0.0;
    1513         142 :     btshared->havedead = false;
    1514         142 :     btshared->indtuples = 0.0;
    1515         142 :     btshared->brokenhotchain = false;
    1516         142 :     table_parallelscan_initialize(btspool->heap,
    1517             :                                   ParallelTableScanFromBTShared(btshared),
    1518             :                                   snapshot);
    1519             : 
    1520             :     /*
    1521             :      * Store shared tuplesort-private state, for which we reserved space.
    1522             :      * Then, initialize opaque state using tuplesort routine.
    1523             :      */
    1524         142 :     sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1525         142 :     tuplesort_initialize_shared(sharedsort, scantuplesortstates,
    1526             :                                 pcxt->seg);
    1527             : 
    1528         142 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    1529         142 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
    1530             : 
    1531             :     /* Unique case requires a second spool, and associated shared state */
    1532         142 :     if (!btspool->isunique)
    1533          78 :         sharedsort2 = NULL;
    1534             :     else
    1535             :     {
    1536             :         /*
    1537             :          * Store additional shared tuplesort-private state, for which we
    1538             :          * reserved space.  Then, initialize opaque state using tuplesort
    1539             :          * routine.
    1540             :          */
    1541          64 :         sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1542          64 :         tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
    1543             :                                     pcxt->seg);
    1544             : 
    1545          64 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    1546             :     }
    1547             : 
    1548             :     /* Store query string for workers */
    1549         142 :     if (debug_query_string)
    1550             :     {
    1551             :         char       *sharedquery;
    1552             : 
    1553         142 :         sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
    1554         142 :         memcpy(sharedquery, debug_query_string, querylen + 1);
    1555         142 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    1556             :     }
    1557             : 
    1558             :     /*
    1559             :      * Allocate space for each worker's WalUsage and BufferUsage; no need to
    1560             :      * initialize.
    1561             :      */
    1562         142 :     walusage = shm_toc_allocate(pcxt->toc,
    1563         142 :                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    1564         142 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    1565         142 :     bufferusage = shm_toc_allocate(pcxt->toc,
    1566         142 :                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1567         142 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
    1568             : 
    1569             :     /* Launch workers, saving status for leader/caller */
    1570         142 :     LaunchParallelWorkers(pcxt);
    1571         142 :     btleader->pcxt = pcxt;
    1572         142 :     btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    1573         142 :     if (leaderparticipates)
    1574         142 :         btleader->nparticipanttuplesorts++;
    1575         142 :     btleader->btshared = btshared;
    1576         142 :     btleader->sharedsort = sharedsort;
    1577         142 :     btleader->sharedsort2 = sharedsort2;
    1578         142 :     btleader->snapshot = snapshot;
    1579         142 :     btleader->walusage = walusage;
    1580         142 :     btleader->bufferusage = bufferusage;
    1581             : 
    1582             :     /* If no workers were successfully launched, back out (do serial build) */
    1583         142 :     if (pcxt->nworkers_launched == 0)
    1584             :     {
    1585           0 :         _bt_end_parallel(btleader);
    1586           0 :         return;
    1587             :     }
    1588             : 
    1589             :     /* Save leader state now that it's clear build will be parallel */
    1590         142 :     buildstate->btleader = btleader;
    1591             : 
    1592             :     /* Join heap scan ourselves */
    1593         142 :     if (leaderparticipates)
    1594         142 :         _bt_leader_participate_as_worker(buildstate);
    1595             : 
    1596             :     /*
    1597             :      * Caller needs to wait for all launched workers when we return.  Make
    1598             :      * sure that the failure-to-start case will not hang forever.
    1599             :      */
    1600         142 :     WaitForParallelWorkersToAttach(pcxt);
    1601             : }
    1602             : 
    1603             : /*
    1604             :  * Shut down workers, destroy parallel context, and end parallel mode.
    1605             :  */
    1606             : static void
    1607         142 : _bt_end_parallel(BTLeader *btleader)
    1608             : {
    1609             :     int         i;
    1610             : 
    1611             :     /* Shutdown worker processes */
    1612         142 :     WaitForParallelWorkersToFinish(btleader->pcxt);
    1613             : 
    1614             :     /*
    1615             :      * Next, accumulate WAL usage.  (This must wait for the workers to finish,
    1616             :      * or we might get incomplete data.)
    1617             :      */
    1618         284 :     for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
    1619         142 :         InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
    1620             : 
    1621             :     /* Free last reference to MVCC snapshot, if one was used */
    1622         142 :     if (IsMVCCSnapshot(btleader->snapshot))
    1623           0 :         UnregisterSnapshot(btleader->snapshot);
    1624         142 :     DestroyParallelContext(btleader->pcxt);
    1625         142 :     ExitParallelMode();
    1626         142 : }
    1627             : 
    1628             : /*
    1629             :  * Returns size of shared memory required to store state for a parallel
    1630             :  * btree index build based on the snapshot its parallel scan will use.
    1631             :  */
    1632             : static Size
    1633         142 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    1634             : {
    1635             :     /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    1636         142 :     return add_size(BUFFERALIGN(sizeof(BTShared)),
    1637             :                     table_parallelscan_estimate(heap, snapshot));
    1638             : }
    1639             : 
    1640             : /*
    1641             :  * Within leader, wait for end of heap scan.
    1642             :  *
    1643             :  * When called, parallel heap scan started by _bt_begin_parallel() will
    1644             :  * already be underway within worker processes (when leader participates
    1645             :  * as a worker, we should end up here just as workers are finishing).
    1646             :  *
    1647             :  * Fills in fields needed for ambuild statistics, and lets caller set
    1648             :  * field indicating that some worker encountered a broken HOT chain.
    1649             :  *
    1650             :  * Returns the total number of heap tuples scanned.
    1651             :  */
    1652             : static double
    1653         142 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
    1654             : {
    1655         142 :     BTShared   *btshared = buildstate->btleader->btshared;
    1656             :     int         nparticipanttuplesorts;
    1657             :     double      reltuples;
    1658             : 
    1659         142 :     nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    1660             :     for (;;)
    1661             :     {
    1662         366 :         SpinLockAcquire(&btshared->mutex);
    1663         366 :         if (btshared->nparticipantsdone == nparticipanttuplesorts)
    1664             :         {
    1665         142 :             buildstate->havedead = btshared->havedead;
    1666         142 :             buildstate->indtuples = btshared->indtuples;
    1667         142 :             *brokenhotchain = btshared->brokenhotchain;
    1668         142 :             reltuples = btshared->reltuples;
    1669         142 :             SpinLockRelease(&btshared->mutex);
    1670         142 :             break;
    1671             :         }
    1672         224 :         SpinLockRelease(&btshared->mutex);
    1673             : 
    1674         224 :         ConditionVariableSleep(&btshared->workersdonecv,
    1675             :                                WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    1676             :     }
    1677             : 
    1678         142 :     ConditionVariableCancelSleep();
    1679             : 
    1680         142 :     return reltuples;
    1681             : }
    1682             : 
    1683             : /*
    1684             :  * Within leader, participate as a parallel worker.
    1685             :  */
    1686             : static void
    1687         142 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
    1688             : {
    1689         142 :     BTLeader   *btleader = buildstate->btleader;
    1690             :     BTSpool    *leaderworker;
    1691             :     BTSpool    *leaderworker2;
    1692             :     int         sortmem;
    1693             : 
    1694             :     /* Allocate memory and initialize private spool */
    1695         142 :     leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
    1696         142 :     leaderworker->heap = buildstate->spool->heap;
    1697         142 :     leaderworker->index = buildstate->spool->index;
    1698         142 :     leaderworker->isunique = buildstate->spool->isunique;
    1699         142 :     leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
    1700             : 
    1701             :     /* Initialize second spool, if required */
    1702         142 :     if (!btleader->btshared->isunique)
    1703          78 :         leaderworker2 = NULL;
    1704             :     else
    1705             :     {
    1706             :         /* Allocate memory for worker's own private secondary spool */
    1707          64 :         leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1708             : 
    1709             :         /* Initialize worker's own secondary spool */
    1710          64 :         leaderworker2->heap = leaderworker->heap;
    1711          64 :         leaderworker2->index = leaderworker->index;
    1712          64 :         leaderworker2->isunique = false;
    1713             :     }
    1714             : 
    1715             :     /*
    1716             :      * Might as well use a reliable figure when doling out maintenance_work_mem
    1717             :      * (when fewer workers were launched than requested, this will be
    1718             :      * somewhat higher than it is for other workers).
    1719             :      */
    1720         142 :     sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
    1721             : 
    1722             :     /* Perform work common to all participants */
    1723         142 :     _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
    1724             :                                btleader->sharedsort, btleader->sharedsort2,
    1725             :                                sortmem, true);
    1726             : 
    1727             : #ifdef BTREE_BUILD_STATS
    1728             :     if (log_btree_build_stats)
    1729             :     {
    1730             :         ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
    1731             :         ResetUsage();
    1732             :     }
    1733             : #endif                          /* BTREE_BUILD_STATS */
    1734         142 : }
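/*
 * Editorial illustration (not part of nbtsort.c): maintenance_work_mem is
 * divided evenly among the sort participants, so with the default of 64MB
 * (65536 kB) and two launched workers plus a participating leader:
 *
 *     sortmem = 65536 / 3 = 21845 kB per participant
 *
 * Workers divide by btshared->scantuplesortstates (the planned participant
 * count), while the leader divides by the number of participants actually
 * obtained; that is why the leader's slice comes out somewhat larger when
 * fewer workers launch than were requested.
 */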
    1735             : 
    1736             : /*
    1737             :  * Perform work within a launched parallel process.
    1738             :  */
    1739             : void
    1740         142 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
    1741             : {
    1742             :     char       *sharedquery;
    1743             :     BTSpool    *btspool;
    1744             :     BTSpool    *btspool2;
    1745             :     BTShared   *btshared;
    1746             :     Sharedsort *sharedsort;
    1747             :     Sharedsort *sharedsort2;
    1748             :     Relation    heapRel;
    1749             :     Relation    indexRel;
    1750             :     LOCKMODE    heapLockmode;
    1751             :     LOCKMODE    indexLockmode;
    1752             :     WalUsage   *walusage;
    1753             :     BufferUsage *bufferusage;
    1754             :     int         sortmem;
    1755             : 
    1756             : #ifdef BTREE_BUILD_STATS
    1757             :     if (log_btree_build_stats)
    1758             :         ResetUsage();
    1759             : #endif                          /* BTREE_BUILD_STATS */
    1760             : 
    1761             :     /*
    1762             :      * The only possible status flag that can be set to the parallel worker is
    1763             :      * PROC_IN_SAFE_IC.
    1764             :      */
    1765             :     Assert((MyProc->statusFlags == 0) ||
    1766             :            (MyProc->statusFlags == PROC_IN_SAFE_IC));
    1767             : 
    1768             :     /* Set debug_query_string for individual workers first */
    1769         142 :     sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    1770         142 :     debug_query_string = sharedquery;
    1771             : 
    1772             :     /* Report the query string from leader */
    1773         142 :     pgstat_report_activity(STATE_RUNNING, debug_query_string);
    1774             : 
    1775             :     /* Look up nbtree shared state */
    1776         142 :     btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    1777             : 
    1778             :     /* Open relations using lock modes known to be obtained by index.c */
    1779         142 :     if (!btshared->isconcurrent)
    1780             :     {
    1781         142 :         heapLockmode = ShareLock;
    1782         142 :         indexLockmode = AccessExclusiveLock;
    1783             :     }
    1784             :     else
    1785             :     {
    1786           0 :         heapLockmode = ShareUpdateExclusiveLock;
    1787           0 :         indexLockmode = RowExclusiveLock;
    1788             :     }
    1789             : 
    1790             :     /* Open relations within worker */
    1791         142 :     heapRel = table_open(btshared->heaprelid, heapLockmode);
    1792         142 :     indexRel = index_open(btshared->indexrelid, indexLockmode);
    1793             : 
    1794             :     /* Initialize worker's own spool */
    1795         142 :     btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    1796         142 :     btspool->heap = heapRel;
    1797         142 :     btspool->index = indexRel;
    1798         142 :     btspool->isunique = btshared->isunique;
    1799         142 :     btspool->nulls_not_distinct = btshared->nulls_not_distinct;
    1800             : 
    1801             :     /* Look up shared state private to tuplesort.c */
    1802         142 :     sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    1803         142 :     tuplesort_attach_shared(sharedsort, seg);
    1804         142 :     if (!btshared->isunique)
    1805             :     {
    1806          78 :         btspool2 = NULL;
    1807          78 :         sharedsort2 = NULL;
    1808             :     }
    1809             :     else
    1810             :     {
    1811             :         /* Allocate memory for worker's own private secondary spool */
    1812          64 :         btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1813             : 
    1814             :         /* Initialize worker's own secondary spool */
    1815          64 :         btspool2->heap = btspool->heap;
    1816          64 :         btspool2->index = btspool->index;
    1817          64 :         btspool2->isunique = false;
    1818             :         /* Look up shared state private to tuplesort.c */
    1819          64 :         sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
    1820          64 :         tuplesort_attach_shared(sharedsort2, seg);
    1821             :     }
    1822             : 
    1823             :     /* Prepare to track buffer usage during parallel execution */
    1824         142 :     InstrStartParallelQuery();
    1825             : 
    1826             :     /* Perform sorting of spool, and possibly a spool2 */
    1827         142 :     sortmem = maintenance_work_mem / btshared->scantuplesortstates;
    1828         142 :     _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
    1829             :                                sharedsort2, sortmem, false);
    1830             : 
    1831             :     /* Report WAL/buffer usage during parallel execution */
    1832         142 :     bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    1833         142 :     walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    1834         142 :     InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
    1835         142 :                           &walusage[ParallelWorkerNumber]);
    1836             : 
    1837             : #ifdef BTREE_BUILD_STATS
    1838             :     if (log_btree_build_stats)
    1839             :     {
    1840             :         ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
    1841             :         ResetUsage();
    1842             :     }
    1843             : #endif                          /* BTREE_BUILD_STATS */
    1844             : 
    1845         142 :     index_close(indexRel, indexLockmode);
    1846         142 :     table_close(heapRel, heapLockmode);
    1847         142 : }
    1848             : 
    1849             : /*
    1850             :  * Perform a worker's portion of a parallel sort.
    1851             :  *
    1852             :  * This generates a tuplesort for the passed btspool, and a second tuplesort
    1853             :  * state if a second btspool is needed (i.e., for unique index builds).  All
    1854             :  * other spool fields should already be set when this is called.
    1855             :  *
    1856             :  * sortmem is the amount of working memory to use within each worker,
    1857             :  * expressed in KBs.
    1858             :  *
    1859             :  * When this returns, workers are done, and need only release resources.
    1860             :  */
    1861             : static void
    1862         284 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
    1863             :                            BTShared *btshared, Sharedsort *sharedsort,
    1864             :                            Sharedsort *sharedsort2, int sortmem, bool progress)
    1865             : {
    1866             :     SortCoordinate coordinate;
    1867             :     BTBuildState buildstate;
    1868             :     TableScanDesc scan;
    1869             :     double      reltuples;
    1870             :     IndexInfo  *indexInfo;
    1871             : 
    1872             :     /* Initialize local tuplesort coordination state */
    1873         284 :     coordinate = palloc0(sizeof(SortCoordinateData));
    1874         284 :     coordinate->isWorker = true;
    1875         284 :     coordinate->nParticipants = -1;
    1876         284 :     coordinate->sharedsort = sharedsort;
    1877             : 
    1878             :     /* Begin "partial" tuplesort */
    1879         568 :     btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
    1880             :                                                      btspool->index,
    1881         284 :                                                      btspool->isunique,
    1882         284 :                                                      btspool->nulls_not_distinct,
    1883             :                                                      sortmem, coordinate,
    1884             :                                                      TUPLESORT_NONE);
    1885             : 
    1886             :     /*
    1887             :      * Just as with serial case, there may be a second spool.  If so, a
    1888             :      * second, dedicated spool2 partial tuplesort is required.
    1889             :      */
    1890         284 :     if (btspool2)
    1891             :     {
    1892             :         SortCoordinate coordinate2;
    1893             : 
    1894             :         /*
    1895             :          * We expect that the second one (for dead tuples) won't get very
    1896             :          * full, so we give it only work_mem (unless sortmem is less for
    1897             :          * worker).  Worker processes are generally permitted to allocate
    1898             :          * work_mem independently.
    1899             :          */
    1900         128 :         coordinate2 = palloc0(sizeof(SortCoordinateData));
    1901         128 :         coordinate2->isWorker = true;
    1902         128 :         coordinate2->nParticipants = -1;
    1903         128 :         coordinate2->sharedsort = sharedsort2;
    1904         128 :         btspool2->sortstate =
    1905         128 :             tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
    1906             :                                         Min(sortmem, work_mem), coordinate2,
    1907             :                                         false);
    1908             :     }
    1909             : 
    1910             :     /* Fill in buildstate for _bt_build_callback() */
    1911         284 :     buildstate.isunique = btshared->isunique;
    1912         284 :     buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
    1913         284 :     buildstate.havedead = false;
    1914         284 :     buildstate.heap = btspool->heap;
    1915         284 :     buildstate.spool = btspool;
    1916         284 :     buildstate.spool2 = btspool2;
    1917         284 :     buildstate.indtuples = 0;
    1918         284 :     buildstate.btleader = NULL;
    1919             : 
    1920             :     /* Join parallel scan */
    1921         284 :     indexInfo = BuildIndexInfo(btspool->index);
    1922         284 :     indexInfo->ii_Concurrent = btshared->isconcurrent;
    1923         284 :     scan = table_beginscan_parallel(btspool->heap,
    1924             :                                     ParallelTableScanFromBTShared(btshared));
    1925         284 :     reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
    1926             :                                        true, progress, _bt_build_callback,
    1927             :                                        (void *) &buildstate, scan);
    1928             : 
    1929             :     /* Execute this worker's part of the sort */
    1930         284 :     if (progress)
    1931         142 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1932             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_1);
    1933         284 :     tuplesort_performsort(btspool->sortstate);
    1934         284 :     if (btspool2)
    1935             :     {
    1936         128 :         if (progress)
    1937          64 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1938             :                                          PROGRESS_BTREE_PHASE_PERFORMSORT_2);
    1939         128 :         tuplesort_performsort(btspool2->sortstate);
    1940             :     }
    1941             : 
    1942             :     /*
    1943             :      * Done.  Record ambuild statistics, and whether we encountered a broken
    1944             :      * HOT chain.
    1945             :      */
    1946         284 :     SpinLockAcquire(&btshared->mutex);
    1947         284 :     btshared->nparticipantsdone++;
    1948         284 :     btshared->reltuples += reltuples;
    1949         284 :     if (buildstate.havedead)
    1950           0 :         btshared->havedead = true;
    1951         284 :     btshared->indtuples += buildstate.indtuples;
    1952         284 :     if (indexInfo->ii_BrokenHotChain)
    1953           0 :         btshared->brokenhotchain = true;
    1954         284 :     SpinLockRelease(&btshared->mutex);
    1955             : 
    1956             :     /* Notify leader */
    1957         284 :     ConditionVariableSignal(&btshared->workersdonecv);
    1958             : 
    1959             :     /* We can end tuplesorts immediately */
    1960         284 :     tuplesort_end(btspool->sortstate);
    1961         284 :     if (btspool2)
    1962         128 :         tuplesort_end(btspool2->sortstate);
    1963         284 : }

Generated by: LCOV version 1.14