LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsort.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 96.9 % 555 538
Test Date: 2026-03-04 02:14:45 Functions: 100.0 % 22 22
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * nbtsort.c
       4              :  *      Build a btree from sorted input by loading leaf pages sequentially.
       5              :  *
       6              :  * NOTES
       7              :  *
       8              :  * We use tuplesort.c to sort the given index tuples into order.
       9              :  * Then we scan the index tuples in order and build the btree pages
      10              :  * for each level.  We load source tuples into leaf-level pages.
      11              :  * Whenever we fill a page at one level, we add a link to it to its
      12              :  * parent level (starting a new parent level if necessary).  When
      13              :  * done, we write out each final page on each level, adding it to
      14              :  * its parent level.  When we have only one page on a level, it must be
      15              :  * the root -- it can be attached to the btree metapage and we are done.
      16              :  *
      17              :  * It is not wise to pack the pages entirely full, since then *any*
      18              :  * insertion would cause a split (and not only of the leaf page; the need
      19              :  * for a split would cascade right up the tree).  The steady-state load
      20              :  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
      21              :  * pages to the user-controllable fill factor (default 90%) while upper pages
      22              :  * are always packed to 70%.  This gives us reasonable density (there aren't
      23              :  * many upper pages if the keys are reasonable-size) without risking a lot of
      24              :  * cascading splits during early insertions.
      25              :  *
      26              :  * We use the bulk smgr loading facility to bypass the buffer cache and
      27              :  * WAL-log the pages efficiently.
      28              :  *
      29              :  * This code isn't concerned about the FSM at all. The caller is responsible
      30              :  * for initializing that.
      31              :  *
      32              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      33              :  * Portions Copyright (c) 1994, Regents of the University of California
      34              :  *
      35              :  * IDENTIFICATION
      36              :  *    src/backend/access/nbtree/nbtsort.c
      37              :  *
      38              :  *-------------------------------------------------------------------------
      39              :  */
      40              : 
      41              : #include "postgres.h"
      42              : 
      43              : #include "access/nbtree.h"
      44              : #include "access/parallel.h"
      45              : #include "access/relscan.h"
      46              : #include "access/table.h"
      47              : #include "access/tableam.h"
      48              : #include "access/xact.h"
      49              : #include "catalog/index.h"
      50              : #include "commands/progress.h"
      51              : #include "executor/instrument.h"
      52              : #include "miscadmin.h"
      53              : #include "pgstat.h"
      54              : #include "storage/bulk_write.h"
      55              : #include "storage/proc.h"
      56              : #include "tcop/tcopprot.h"
      57              : #include "utils/rel.h"
      58              : #include "utils/sortsupport.h"
      59              : #include "utils/tuplesort.h"
      60              : 
      61              : 
/*
 * Magic numbers for parallel state sharing.
 *
 * These keys identify this module's entries in the shm_toc of the parallel
 * DSM segment used for a parallel index build.  The tuplesort module's own
 * shared state is stored under the *_TUPLESORT keys on its behalf (see the
 * BTShared comments below).
 */
#define PARALLEL_KEY_BTREE_SHARED       UINT64CONST(0xA000000000000001)
#define PARALLEL_KEY_TUPLESORT          UINT64CONST(0xA000000000000002)
#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
#define PARALLEL_KEY_QUERY_TEXT         UINT64CONST(0xA000000000000004)
#define PARALLEL_KEY_WAL_USAGE          UINT64CONST(0xA000000000000005)
#define PARALLEL_KEY_BUFFER_USAGE       UINT64CONST(0xA000000000000006)
      69              : 
      70              : /*
      71              :  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
      72              :  * parallel index builds.  This may be useful as a debugging aid.
      73              :  */
      74              : /* #define DISABLE_LEADER_PARTICIPATION */
      75              : 
/*
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 */
typedef struct BTSpool
{
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    heap;           /* heap relation being indexed */
    Relation    index;          /* index relation being built */
    bool        isunique;       /* building a unique index? */
    bool        nulls_not_distinct;  /* unique index with NULLS NOT DISTINCT? */
} BTSpool;
      89              : 
/*
 * Status for index builds performed in parallel.  This is allocated in a
 * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
 * entry, private to tuplesort.c but allocated by this module on its behalf.
 */
typedef struct BTShared
{
    /*
     * These fields are not modified during the sort.  They primarily exist
     * for the benefit of worker processes that need to create BTSpool state
     * corresponding to that used by the leader.
     */
    Oid         heaprelid;      /* OID of heap relation being indexed */
    Oid         indexrelid;     /* OID of index relation being built */
    bool        isunique;       /* building a unique index? */
    bool        nulls_not_distinct;  /* unique index with NULLS NOT DISTINCT? */
    bool        isconcurrent;   /* concurrent index build? */
    int         scantuplesortstates;    /* number of participant tuplesorts */

    /* Query ID, for report in worker processes */
    int64       queryid;

    /*
     * workersdonecv is used to monitor the progress of workers.  All parallel
     * participants must indicate that they are done before leader can use
     * mutable state that workers maintain during scan (and before leader can
     * proceed to tuplesort_performsort()).
     */
    ConditionVariable workersdonecv;

    /*
     * mutex protects all fields before heapdesc.
     *
     * These fields contain status information of interest to B-Tree index
     * builds that must work just the same when an index is built in parallel.
     */
    slock_t     mutex;

    /*
     * Mutable state that is maintained by workers, and reported back to
     * leader at end of parallel scan.
     *
     * nparticipantsdone is number of worker processes finished.
     *
     * reltuples is the total number of input heap tuples.
     *
     * havedead indicates if RECENTLY_DEAD tuples were encountered during
     * build.
     *
     * indtuples is the total number of tuples that made it into the index.
     *
     * brokenhotchain indicates if any worker detected a broken HOT chain
     * during build.
     */
    int         nparticipantsdone;
    double      reltuples;
    bool        havedead;
    double      indtuples;
    bool        brokenhotchain;

    /*
     * ParallelTableScanDescData data follows. Can't directly embed here, as
     * implementations of the parallel table scan desc interface might need
     * stronger alignment.
     */
} BTShared;
     156              : 
/*
 * Return pointer to a BTShared's parallel table scan.
 *
 * The ParallelTableScanDescData is stored in the same DSM chunk, immediately
 * after the BTShared struct itself (see trailing comment in BTShared).
 *
 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
 * MAXALIGN.
 */
#define ParallelTableScanFromBTShared(shared) \
    (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
     165              : 
/*
 * Status for leader in parallel index build.
 */
typedef struct BTLeader
{
    /* parallel context itself */
    ParallelContext *pcxt;

    /*
     * nparticipanttuplesorts is the exact number of worker processes
     * successfully launched, plus one leader process if it participates as a
     * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     * participating as a worker).
     */
    int         nparticipanttuplesorts;

    /*
     * Leader process convenience pointers to shared state (leader avoids TOC
     * lookups).
     *
     * btshared is the shared state for entire build.  sharedsort is the
     * shared, tuplesort-managed state passed to each process tuplesort.
     * sharedsort2 is the corresponding btspool2 shared state, used only when
     * building unique indexes.  snapshot is the snapshot used by the scan iff
     * an MVCC snapshot is required.
     *
     * walusage and bufferusage point at the per-worker instrumentation
     * arrays shared under PARALLEL_KEY_WAL_USAGE and
     * PARALLEL_KEY_BUFFER_USAGE respectively.
     */
    BTShared   *btshared;
    Sharedsort *sharedsort;
    Sharedsort *sharedsort2;
    Snapshot    snapshot;
    WalUsage   *walusage;
    BufferUsage *bufferusage;
} BTLeader;
     199              : 
/*
 * Working state for btbuild and its callback.
 *
 * When parallel CREATE INDEX is used, there is a BTBuildState for each
 * participant.
 */
typedef struct BTBuildState
{
    bool        isunique;       /* building a unique index? */
    bool        nulls_not_distinct;  /* unique index with NULLS NOT DISTINCT? */
    bool        havedead;       /* have we seen any dead tuples? (decides
                                 * whether spool2 is actually needed) */
    Relation    heap;           /* heap relation being scanned */
    BTSpool    *spool;          /* primary spool (live tuples) */

    /*
     * spool2 is needed only when the index is a unique index. Dead tuples are
     * put into spool2 instead of spool in order to avoid uniqueness check.
     */
    BTSpool    *spool2;
    double      indtuples;      /* total # of tuples placed in the index */

    /*
     * btleader is only present when a parallel index build is performed, and
     * only in the leader process. (Actually, only the leader has a
     * BTBuildState.  Workers have their own spool and spool2, though.)
     */
    BTLeader   *btleader;
} BTBuildState;
     228              : 
/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.  The btps_next links form a chain from
 * the leaf level up through each in-progress parent level.
 */
typedef struct BTPageState
{
    BulkWriteBuffer btps_buf;   /* workspace for page building */
    BlockNumber btps_blkno;     /* block # to write this page at */
    IndexTuple  btps_lowkey;    /* page's strict lower bound pivot tuple */
    OffsetNumber btps_lastoff;  /* last item offset loaded */
    Size        btps_lastextra; /* last item's extra posting list space */
    uint32      btps_level;     /* tree level (0 = leaf) */
    Size        btps_full;      /* "full" if less than this much free space */
    struct BTPageState *btps_next;  /* link to parent level, if any */
} BTPageState;
     244              : 
/*
 * Overall status record for index writing phase.
 */
typedef struct BTWriteState
{
    Relation    heap;           /* heap relation that was scanned */
    Relation    index;          /* index relation being written */
    BulkWriteState *bulkstate;  /* smgr bulk-write state (bypasses buffers) */
    BTScanInsert inskey;        /* generic insertion scankey */
    BlockNumber btws_pages_alloced; /* # pages allocated */
} BTWriteState;
     256              : 
     257              : 
/* prototypes for internal (file-local) routines */
static double _bt_spools_heapscan(Relation heap, Relation index,
                                  BTBuildState *buildstate, IndexInfo *indexInfo);
static void _bt_spooldestroy(BTSpool *btspool);
static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
                      const Datum *values, const bool *isnull);
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
                               bool *isnull, bool tupleIsAlive, void *state);
static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page rightmostpage);
static void _bt_sortaddtup(Page page, Size itemsize,
                           const IndexTupleData *itup, OffsetNumber itup_off,
                           bool newfirstdataitem);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
                         IndexTuple itup, Size truncextra);
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
                                          BTPageState *state,
                                          BTDedupState dstate);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
                     BTSpool *btspool, BTSpool *btspool2);
static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
                               int request);
static void _bt_end_parallel(BTLeader *btleader);
static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
static double _bt_parallel_heapscan(BTBuildState *buildstate,
                                    bool *brokenhotchain);
static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
                                       BTShared *btshared, Sharedsort *sharedsort,
                                       Sharedsort *sharedsort2, int sortmem,
                                       bool progress);
     291              : 
     292              : 
     293              : /*
     294              :  *  btbuild() -- build a new btree index.
     295              :  */
     296              : IndexBuildResult *
     297        26176 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     298              : {
     299              :     IndexBuildResult *result;
     300              :     BTBuildState buildstate;
     301              :     double      reltuples;
     302              : 
     303              : #ifdef BTREE_BUILD_STATS
     304              :     if (log_btree_build_stats)
     305              :         ResetUsage();
     306              : #endif                          /* BTREE_BUILD_STATS */
     307              : 
     308        26176 :     buildstate.isunique = indexInfo->ii_Unique;
     309        26176 :     buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     310        26176 :     buildstate.havedead = false;
     311        26176 :     buildstate.heap = heap;
     312        26176 :     buildstate.spool = NULL;
     313        26176 :     buildstate.spool2 = NULL;
     314        26176 :     buildstate.indtuples = 0;
     315        26176 :     buildstate.btleader = NULL;
     316              : 
     317              :     /*
     318              :      * We expect to be called exactly once for any index relation. If that's
     319              :      * not the case, big trouble's what we have.
     320              :      */
     321        26176 :     if (RelationGetNumberOfBlocks(index) != 0)
     322            0 :         elog(ERROR, "index \"%s\" already contains data",
     323              :              RelationGetRelationName(index));
     324              : 
     325        26176 :     reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
     326              : 
     327              :     /*
     328              :      * Finish the build by (1) completing the sort of the spool file, (2)
     329              :      * inserting the sorted tuples into btree pages and (3) building the upper
     330              :      * levels.  Finally, it may also be necessary to end use of parallelism.
     331              :      */
     332        26170 :     _bt_leafbuild(buildstate.spool, buildstate.spool2);
     333        26128 :     _bt_spooldestroy(buildstate.spool);
     334        26128 :     if (buildstate.spool2)
     335            7 :         _bt_spooldestroy(buildstate.spool2);
     336        26128 :     if (buildstate.btleader)
     337           80 :         _bt_end_parallel(buildstate.btleader);
     338              : 
     339        26128 :     result = palloc_object(IndexBuildResult);
     340              : 
     341        26128 :     result->heap_tuples = reltuples;
     342        26128 :     result->index_tuples = buildstate.indtuples;
     343              : 
     344              : #ifdef BTREE_BUILD_STATS
     345              :     if (log_btree_build_stats)
     346              :     {
     347              :         ShowUsage("BTREE BUILD STATS");
     348              :         ResetUsage();
     349              :     }
     350              : #endif                          /* BTREE_BUILD_STATS */
     351              : 
     352        26128 :     return result;
     353              : }
     354              : 
/*
 * Create and initialize one or two spool structures, and save them in caller's
 * buildstate argument.  May also fill-in fields within indexInfo used by index
 * builds.
 *
 * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
 * routine encapsulates all aspects of managing parallelism.  Caller need only
 * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
 *
 * Returns the total number of heap tuples scanned.
 */
static double
_bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
                    IndexInfo *indexInfo)
{
    BTSpool    *btspool = palloc0_object(BTSpool);
    SortCoordinate coordinate = NULL;
    double      reltuples = 0;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation.  This should be OK since a single backend can't
     * run multiple index creations in parallel (see also: notes on
     * parallelism and maintenance_work_mem below).
     */
    btspool->heap = heap;
    btspool->index = index;
    btspool->isunique = indexInfo->ii_Unique;
    btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;

    /* Save as primary spool */
    buildstate->spool = btspool;

    /* Report table scan phase started */
    pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
                                 PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);

    /*
     * Attempt to launch parallel worker scan when required.  If no workers
     * end up being launched, buildstate->btleader remains NULL and we fall
     * back to a serial scan below.
     */
    if (indexInfo->ii_ParallelWorkers > 0)
        _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
                           indexInfo->ii_ParallelWorkers);

    /*
     * If parallel build requested and at least one worker process was
     * successfully launched, set up coordination state
     */
    if (buildstate->btleader)
    {
        coordinate = palloc0_object(SortCoordinateData);
        coordinate->isWorker = false;
        coordinate->nParticipants =
            buildstate->btleader->nparticipanttuplesorts;
        coordinate->sharedsort = buildstate->btleader->sharedsort;
    }

    /*
     * Begin serial/leader tuplesort.
     *
     * In cases where parallelism is involved, the leader receives the same
     * share of maintenance_work_mem as a serial sort (it is generally treated
     * in the same way as a serial sort once we return).  Parallel worker
     * Tuplesortstates will have received only a fraction of
     * maintenance_work_mem, though.
     *
     * We rely on the lifetime of the Leader Tuplesortstate almost not
     * overlapping with any worker Tuplesortstate's lifetime.  There may be
     * some small overlap, but that's okay because we rely on leader
     * Tuplesortstate only allocating a small, fixed amount of memory here.
     * When its tuplesort_performsort() is called (by our caller), and
     * significant amounts of memory are likely to be used, all workers must
     * have already freed almost all memory held by their Tuplesortstates
     * (they are about to go away completely, too).  The overall effect is
     * that maintenance_work_mem always represents an absolute high watermark
     * on the amount of memory used by a CREATE INDEX operation, regardless of
     * the use of parallelism or any other factor.
     */
    buildstate->spool->sortstate =
        tuplesort_begin_index_btree(heap, index, buildstate->isunique,
                                    buildstate->nulls_not_distinct,
                                    maintenance_work_mem, coordinate,
                                    TUPLESORT_NONE);

    /*
     * If building a unique index, put dead tuples in a second spool to keep
     * them out of the uniqueness check.  We expect that the second spool (for
     * dead tuples) won't get very full, so we give it only work_mem.
     */
    if (indexInfo->ii_Unique)
    {
        BTSpool    *btspool2 = palloc0_object(BTSpool);
        SortCoordinate coordinate2 = NULL;

        /* Initialize secondary spool (never unique; dead tuples only) */
        btspool2->heap = heap;
        btspool2->index = index;
        btspool2->isunique = false;
        /* Save as secondary spool */
        buildstate->spool2 = btspool2;

        if (buildstate->btleader)
        {
            /*
             * Set up non-private state that is passed to
             * tuplesort_begin_index_btree() about the basic high level
             * coordination of a parallel sort.
             */
            coordinate2 = palloc0_object(SortCoordinateData);
            coordinate2->isWorker = false;
            coordinate2->nParticipants =
                buildstate->btleader->nparticipanttuplesorts;
            coordinate2->sharedsort = buildstate->btleader->sharedsort2;
        }

        /*
         * We expect that the second one (for dead tuples) won't get very
         * full, so we give it only work_mem
         */
        buildstate->spool2->sortstate =
            tuplesort_begin_index_btree(heap, index, false, false, work_mem,
                                        coordinate2, TUPLESORT_NONE);
    }

    /* Fill spool using either serial or parallel heap scan */
    if (!buildstate->btleader)
        reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                           _bt_build_callback, buildstate,
                                           NULL);
    else
        reltuples = _bt_parallel_heapscan(buildstate,
                                          &indexInfo->ii_BrokenHotChain);

    /*
     * Set the progress target for the next phase.  Reset the block number
     * values set by table_index_build_scan
     */
    {
        const int   progress_index[] = {
            PROGRESS_CREATEIDX_TUPLES_TOTAL,
            PROGRESS_SCAN_BLOCKS_TOTAL,
            PROGRESS_SCAN_BLOCKS_DONE
        };
        const int64 progress_vals[] = {
            buildstate->indtuples,
            0, 0
        };

        pgstat_progress_update_multi_param(3, progress_index, progress_vals);
    }

    /* okay, all heap tuples are spooled */
    if (buildstate->spool2 && !buildstate->havedead)
    {
        /* spool2 turns out to be unnecessary: no dead tuples were seen */
        _bt_spooldestroy(buildstate->spool2);
        buildstate->spool2 = NULL;
    }

    return reltuples;
}
     514              : 
     515              : /*
     516              :  * clean up a spool structure and its substructures.
     517              :  */
     518              : static void
     519        47369 : _bt_spooldestroy(BTSpool *btspool)
     520              : {
     521        47369 :     tuplesort_end(btspool->sortstate);
     522        47369 :     pfree(btspool);
     523        47369 : }
     524              : 
     525              : /*
     526              :  * spool an index entry into the sort file.
     527              :  */
     528              : static void
     529      6439087 : _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
     530              : {
     531      6439087 :     tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
     532              :                                   self, values, isnull);
     533      6439087 : }
     534              : 
     535              : /*
     536              :  * given a spool loaded by successive calls to _bt_spool,
     537              :  * create an entire btree.
     538              :  */
     539              : static void
     540        26170 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     541              : {
     542              :     BTWriteState wstate;
     543              : 
     544              : #ifdef BTREE_BUILD_STATS
     545              :     if (log_btree_build_stats)
     546              :     {
     547              :         ShowUsage("BTREE BUILD (Spool) STATISTICS");
     548              :         ResetUsage();
     549              :     }
     550              : #endif                          /* BTREE_BUILD_STATS */
     551              : 
     552              :     /* Execute the sort */
     553        26170 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     554              :                                  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
     555        26170 :     tuplesort_performsort(btspool->sortstate);
     556        26128 :     if (btspool2)
     557              :     {
     558            7 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     559              :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_2);
     560            7 :         tuplesort_performsort(btspool2->sortstate);
     561              :     }
     562              : 
     563        26128 :     wstate.heap = btspool->heap;
     564        26128 :     wstate.index = btspool->index;
     565        26128 :     wstate.inskey = _bt_mkscankey(wstate.index, NULL);
     566              :     /* _bt_mkscankey() won't set allequalimage without metapage */
     567        26128 :     wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
     568              : 
     569              :     /* reserve the metapage */
     570        26128 :     wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
     571              : 
     572        26128 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     573              :                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
     574        26128 :     _bt_load(&wstate, btspool, btspool2);
     575        26128 : }
     576              : 
     577              : /*
     578              :  * Per-tuple callback for table_index_build_scan
     579              :  */
     580              : static void
     581      6439087 : _bt_build_callback(Relation index,
     582              :                    ItemPointer tid,
     583              :                    Datum *values,
     584              :                    bool *isnull,
     585              :                    bool tupleIsAlive,
     586              :                    void *state)
     587              : {
     588      6439087 :     BTBuildState *buildstate = (BTBuildState *) state;
     589              : 
     590              :     /*
     591              :      * insert the index tuple into the appropriate spool file for subsequent
     592              :      * processing
     593              :      */
     594      6439087 :     if (tupleIsAlive || buildstate->spool2 == NULL)
     595      6438850 :         _bt_spool(buildstate->spool, tid, values, isnull);
     596              :     else
     597              :     {
     598              :         /* dead tuples are put into spool2 */
     599          237 :         buildstate->havedead = true;
     600          237 :         _bt_spool(buildstate->spool2, tid, values, isnull);
     601              :     }
     602              : 
     603      6439087 :     buildstate->indtuples += 1;
     604      6439087 : }
     605              : 
     606              : /*
     607              :  * allocate workspace for a new, clean btree page, not linked to any siblings.
     608              :  */
     609              : static BulkWriteBuffer
     610        26799 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
     611              : {
     612              :     BulkWriteBuffer buf;
     613              :     Page        page;
     614              :     BTPageOpaque opaque;
     615              : 
     616        26799 :     buf = smgr_bulk_get_buf(wstate->bulkstate);
     617        26799 :     page = (Page) buf;
     618              : 
     619              :     /* Zero the page and set up standard page header info */
     620        26799 :     _bt_pageinit(page, BLCKSZ);
     621              : 
     622              :     /* Initialize BT opaque state */
     623        26799 :     opaque = BTPageGetOpaque(page);
     624        26799 :     opaque->btpo_prev = opaque->btpo_next = P_NONE;
     625        26799 :     opaque->btpo_level = level;
     626        26799 :     opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     627        26799 :     opaque->btpo_cycleid = 0;
     628              : 
     629              :     /* Make the P_HIKEY line pointer appear allocated */
     630        26799 :     ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
     631              : 
     632        26799 :     return buf;
     633              : }
     634              : 
     635              : /*
     636              :  * emit a completed btree page, and release the working storage.
     637              :  */
     638              : static void
     639        52927 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
     640              : {
     641        52927 :     smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
     642              :     /* smgr_bulk_write took ownership of 'buf' */
     643        52927 : }
     644              : 
     645              : /*
     646              :  * allocate and initialize a new BTPageState.  the returned structure
     647              :  * is suitable for immediate use by _bt_buildadd.
     648              :  */
     649              : static BTPageState *
     650         6649 : _bt_pagestate(BTWriteState *wstate, uint32 level)
     651              : {
     652         6649 :     BTPageState *state = palloc0_object(BTPageState);
     653              : 
     654              :     /* create initial page for level */
     655         6649 :     state->btps_buf = _bt_blnewpage(wstate, level);
     656              : 
     657              :     /* and assign it a page position */
     658         6649 :     state->btps_blkno = wstate->btws_pages_alloced++;
     659              : 
     660         6649 :     state->btps_lowkey = NULL;
     661              :     /* initialize lastoff so first item goes into P_FIRSTKEY */
     662         6649 :     state->btps_lastoff = P_HIKEY;
     663         6649 :     state->btps_lastextra = 0;
     664         6649 :     state->btps_level = level;
     665              :     /* set "full" threshold based on level.  See notes at head of file. */
     666         6649 :     if (level > 0)
     667         1367 :         state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
     668              :     else
     669         5282 :         state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
     670              : 
     671              :     /* no parent level, yet */
     672         6649 :     state->btps_next = NULL;
     673              : 
     674         6649 :     return state;
     675              : }
     676              : 
     677              : /*
     678              :  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
     679              :  * P_HIKEY, overwriting P_HIKEY).
     680              :  *
     681              :  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
     682              :  * rightmost page on its level is not supposed to get a high key.  Now that
     683              :  * it's clear that this page is a rightmost page, remove the unneeded empty
     684              :  * P_HIKEY line pointer space.
     685              :  */
     686              : static void
     687         6649 : _bt_slideleft(Page rightmostpage)
     688              : {
     689              :     OffsetNumber off;
     690              :     OffsetNumber maxoff;
     691              :     ItemId      previi;
     692              : 
     693         6649 :     maxoff = PageGetMaxOffsetNumber(rightmostpage);
     694              :     Assert(maxoff >= P_FIRSTKEY);
     695         6649 :     previi = PageGetItemId(rightmostpage, P_HIKEY);
     696       399535 :     for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
     697              :     {
     698       392886 :         ItemId      thisii = PageGetItemId(rightmostpage, off);
     699              : 
     700       392886 :         *previi = *thisii;
     701       392886 :         previi = thisii;
     702              :     }
     703         6649 :     ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
     704         6649 : }
     705              : 
     706              : /*
     707              :  * Add an item to a page being built.
     708              :  *
     709              :  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
     710              :  * raises an error directly.
     711              :  *
     712              :  * Note that our nbtsort.c caller does not know yet if the page will be
     713              :  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
     714              :  * caller.  Page that turns out to be the rightmost on its level is fixed by
     715              :  * calling _bt_slideleft().
     716              :  */
     717              : static void
     718      5791878 : _bt_sortaddtup(Page page,
     719              :                Size itemsize,
     720              :                const IndexTupleData *itup,
     721              :                OffsetNumber itup_off,
     722              :                bool newfirstdataitem)
     723              : {
     724              :     IndexTupleData trunctuple;
     725              : 
     726      5791878 :     if (newfirstdataitem)
     727              :     {
     728         1429 :         trunctuple = *itup;
     729         1429 :         trunctuple.t_info = sizeof(IndexTupleData);
     730         1429 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
     731         1429 :         itup = &trunctuple;
     732         1429 :         itemsize = sizeof(IndexTupleData);
     733              :     }
     734              : 
     735      5791878 :     if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
     736            0 :         elog(ERROR, "failed to add item to the index page");
     737      5791878 : }
     738              : 
     739              : /*----------
     740              :  * Add an item to a disk page from the sort output (or add a posting list
     741              :  * item formed from the sort output).
     742              :  *
     743              :  * We must be careful to observe the page layout conventions of nbtsearch.c:
     744              :  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
     745              :  * - on non-leaf pages, the key portion of the first item need not be
     746              :  *   stored, we should store only the link.
     747              :  *
     748              :  * A leaf page being built looks like:
     749              :  *
     750              :  * +----------------+---------------------------------+
     751              :  * | PageHeaderData | linp0 linp1 linp2 ...           |
     752              :  * +-----------+----+---------------------------------+
     753              :  * | ... linpN |                                      |
     754              :  * +-----------+--------------------------------------+
     755              :  * |     ^ last                                       |
     756              :  * |                                                  |
     757              :  * +-------------+------------------------------------+
     758              :  * |             | itemN ...                          |
     759              :  * +-------------+------------------+-----------------+
     760              :  * |          ... item3 item2 item1 | "special space" |
     761              :  * +--------------------------------+-----------------+
     762              :  *
     763              :  * Contrast this with the diagram in bufpage.h; note the mismatch
     764              :  * between linps and items.  This is because we reserve linp0 as a
     765              :  * placeholder for the pointer to the "high key" item; when we have
     766              :  * filled up the page, we will set linp0 to point to itemN and clear
     767              :  * linpN.  On the other hand, if we find this is the last (rightmost)
     768              :  * page, we leave the items alone and slide the linp array over.  If
     769              :  * the high key is to be truncated, offset 1 is deleted, and we insert
     770              :  * the truncated high key at offset 1.
     771              :  *
     772              :  * 'last' pointer indicates the last offset added to the page.
     773              :  *
     774              :  * 'truncextra' is the size of the posting list in itup, if any.  This
     775              :  * information is stashed for the next call here, when we may benefit
     776              :  * from considering the impact of truncating away the posting list on
     777              :  * the page before deciding to finish the page off.  Posting lists are
     778              :  * often relatively large, so it is worth going to the trouble of
     779              :  * accounting for the saving from truncating away the posting list of
     780              :  * the tuple that becomes the high key (that may be the only way to
     781              :  * get close to target free space on the page).  Note that this is
     782              :  * only used for the soft fillfactor-wise limit, not the critical hard
     783              :  * limit.
     784              :  *----------
     785              :  */
     786              : static void
     787      5771728 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
     788              :              Size truncextra)
     789              : {
     790              :     BulkWriteBuffer nbuf;
     791              :     Page        npage;
     792              :     BlockNumber nblkno;
     793              :     OffsetNumber last_off;
     794              :     Size        last_truncextra;
     795              :     Size        pgspc;
     796              :     Size        itupsz;
     797              :     bool        isleaf;
     798              : 
     799              :     /*
     800              :      * This is a handy place to check for cancel interrupts during the btree
     801              :      * load phase of index creation.
     802              :      */
     803      5771728 :     CHECK_FOR_INTERRUPTS();
     804              : 
     805      5771728 :     nbuf = state->btps_buf;
     806      5771728 :     npage = (Page) nbuf;
     807      5771728 :     nblkno = state->btps_blkno;
     808      5771728 :     last_off = state->btps_lastoff;
     809      5771728 :     last_truncextra = state->btps_lastextra;
     810      5771728 :     state->btps_lastextra = truncextra;
     811              : 
     812      5771728 :     pgspc = PageGetFreeSpace(npage);
     813      5771728 :     itupsz = IndexTupleSize(itup);
     814      5771728 :     itupsz = MAXALIGN(itupsz);
     815              :     /* Leaf case has slightly different rules due to suffix truncation */
     816      5771728 :     isleaf = (state->btps_level == 0);
     817              : 
     818              :     /*
     819              :      * Check whether the new item can fit on a btree page on current level at
     820              :      * all.
     821              :      *
     822              :      * Every newly built index will treat heap TID as part of the keyspace,
     823              :      * which imposes the requirement that new high keys must occasionally have
     824              :      * a heap TID appended within _bt_truncate().  That may leave a new pivot
     825              :      * tuple one or two MAXALIGN() quantums larger than the original
     826              :      * firstright tuple it's derived from.  v4 deals with the problem by
     827              :      * decreasing the limit on the size of tuples inserted on the leaf level
     828              :      * by the same small amount.  Enforce the new v4+ limit on the leaf level,
     829              :      * and the old limit on internal levels, since pivot tuples may need to
     830              :      * make use of the reserved space.  This should never fail on internal
     831              :      * pages.
     832              :      */
     833      5771728 :     if (unlikely(itupsz > BTMaxItemSize))
     834          132 :         _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
     835              :                              itup);
     836              : 
     837              :     /*
     838              :      * Check to see if current page will fit new item, with space left over to
     839              :      * append a heap TID during suffix truncation when page is a leaf page.
     840              :      *
     841              :      * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
     842              :      * high key with heap TID when finishing off a leaf page, since we rely on
     843              :      * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
     844              :      * internal pages we can always fit 3 pivot tuples with larger internal
     845              :      * page tuple limit (includes page high key).
     846              :      *
     847              :      * Most of the time, a page is only "full" in the sense that the soft
     848              :      * fillfactor-wise limit has been exceeded.  However, we must always leave
     849              :      * at least two items plus a high key on each page before starting a new
     850              :      * page.  Disregard fillfactor and insert on "full" current page if we
     851              :      * don't have the minimum number of items yet.  (Note that we deliberately
     852              :      * assume that suffix truncation neither enlarges nor shrinks new high key
     853              :      * when applying soft limit, except when last tuple has a posting list.)
     854              :      */
     855              :     Assert(last_truncextra == 0 || isleaf);
     856      5771728 :     if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
     857      5771150 :         (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
     858              :     {
     859              :         /*
     860              :          * Finish off the page and write it out.
     861              :          */
     862        20150 :         BulkWriteBuffer obuf = nbuf;
     863        20150 :         Page        opage = npage;
     864        20150 :         BlockNumber oblkno = nblkno;
     865              :         ItemId      ii;
     866              :         ItemId      hii;
     867              :         IndexTuple  oitup;
     868              : 
     869              :         /* Create new page of same level */
     870        20150 :         nbuf = _bt_blnewpage(wstate, state->btps_level);
     871        20150 :         npage = (Page) nbuf;
     872              : 
     873              :         /* and assign it a page position */
     874        20150 :         nblkno = wstate->btws_pages_alloced++;
     875              : 
     876              :         /*
     877              :          * We copy the last item on the page into the new page, and then
     878              :          * rearrange the old page so that the 'last item' becomes its high key
     879              :          * rather than a true data item.  There had better be at least two
     880              :          * items on the page already, else the page would be empty of useful
     881              :          * data.
     882              :          */
     883              :         Assert(last_off > P_FIRSTKEY);
     884        20150 :         ii = PageGetItemId(opage, last_off);
     885        20150 :         oitup = (IndexTuple) PageGetItem(opage, ii);
     886        20150 :         _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
     887        20150 :                        !isleaf);
     888              : 
     889              :         /*
     890              :          * Move 'last' into the high key position on opage.  _bt_blnewpage()
     891              :          * allocated empty space for a line pointer when opage was first
     892              :          * created, so this is a matter of rearranging already-allocated space
     893              :          * on page, and initializing high key line pointer. (Actually, leaf
     894              :          * pages must also swap oitup with a truncated version of oitup, which
     895              :          * is sometimes larger than oitup, though never by more than the space
     896              :          * needed to append a heap TID.)
     897              :          */
     898        20150 :         hii = PageGetItemId(opage, P_HIKEY);
     899        20150 :         *hii = *ii;
     900        20150 :         ItemIdSetUnused(ii);    /* redundant */
     901        20150 :         ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
     902              : 
     903        20150 :         if (isleaf)
     904              :         {
     905              :             IndexTuple  lastleft;
     906              :             IndexTuple  truncated;
     907              : 
     908              :             /*
     909              :              * Truncate away any unneeded attributes from high key on leaf
     910              :              * level.  This is only done at the leaf level because downlinks
     911              :              * in internal pages are either negative infinity items, or get
     912              :              * their contents from copying from one level down.  See also:
     913              :              * _bt_split().
     914              :              *
     915              :              * We don't try to bias our choice of split point to make it more
     916              :              * likely that _bt_truncate() can truncate away more attributes,
     917              :              * whereas the split point used within _bt_split() is chosen much
     918              :              * more delicately.  Even still, the lastleft and firstright
     919              :              * tuples passed to _bt_truncate() here are at least not fully
     920              :              * equal to each other when deduplication is used, unless there is
     921              :              * a large group of duplicates (also, unique index builds usually
     922              :              * have few or no spool2 duplicates).  When the split point is
     923              :              * between two unequal tuples, _bt_truncate() will avoid including
     924              :              * a heap TID in the new high key, which is the most important
     925              :              * benefit of suffix truncation.
     926              :              *
     927              :              * Overwrite the old item with new truncated high key directly.
     928              :              * oitup is already located at the physical beginning of tuple
     929              :              * space, so this should directly reuse the existing tuple space.
     930              :              */
     931        20088 :             ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
     932        20088 :             lastleft = (IndexTuple) PageGetItem(opage, ii);
     933              : 
     934              :             Assert(IndexTupleSize(oitup) > last_truncextra);
     935        20088 :             truncated = _bt_truncate(wstate->index, lastleft, oitup,
     936              :                                      wstate->inskey);
     937        20088 :             if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
     938            0 :                 elog(ERROR, "failed to add high key to the index page");
     939        20088 :             pfree(truncated);
     940              : 
     941              :             /* oitup should continue to point to the page's high key */
     942        20088 :             hii = PageGetItemId(opage, P_HIKEY);
     943        20088 :             oitup = (IndexTuple) PageGetItem(opage, hii);
     944              :         }
     945              : 
     946              :         /*
     947              :          * Link the old page into its parent, using its low key.  If we don't
     948              :          * have a parent, we have to create one; this adds a new btree level.
     949              :          */
     950        20150 :         if (state->btps_next == NULL)
     951         1367 :             state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
     952              : 
     953              :         Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
     954              :                 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
     955              :                 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
     956              :                P_LEFTMOST(BTPageGetOpaque(opage)));
     957              :         Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
     958              :                !P_LEFTMOST(BTPageGetOpaque(opage)));
     959        20150 :         BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
     960        20150 :         _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
     961        20150 :         pfree(state->btps_lowkey);
     962              : 
     963              :         /*
     964              :          * Save a copy of the high key from the old page.  It is also the low
     965              :          * key for the new page.
     966              :          */
     967        20150 :         state->btps_lowkey = CopyIndexTuple(oitup);
     968              : 
     969              :         /*
     970              :          * Set the sibling links for both pages.
     971              :          */
     972              :         {
     973        20150 :             BTPageOpaque oopaque = BTPageGetOpaque(opage);
     974        20150 :             BTPageOpaque nopaque = BTPageGetOpaque(npage);
     975              : 
     976        20150 :             oopaque->btpo_next = nblkno;
     977        20150 :             nopaque->btpo_prev = oblkno;
     978        20150 :             nopaque->btpo_next = P_NONE; /* redundant */
     979              :         }
     980              : 
     981              :         /*
     982              :          * Write out the old page. _bt_blwritepage takes ownership of the
     983              :          * 'opage' buffer.
     984              :          */
     985        20150 :         _bt_blwritepage(wstate, obuf, oblkno);
     986              : 
     987              :         /*
     988              :          * Reset last_off to point to new page
     989              :          */
     990        20150 :         last_off = P_FIRSTKEY;
     991              :     }
     992              : 
     993              :     /*
     994              :      * By here, either original page is still the current page, or a new page
     995              :      * was created that became the current page.  Either way, the current page
     996              :      * definitely has space for new item.
     997              :      *
     998              :      * If the new item is the first for its page, it must also be the first
     999              :      * item on its entire level.  On later same-level pages, a low key for a
    1000              :      * page will be copied from the prior page in the code above.  Generate a
    1001              :      * minus infinity low key here instead.
    1002              :      */
    1003      5771728 :     if (last_off == P_HIKEY)
    1004              :     {
    1005              :         Assert(state->btps_lowkey == NULL);
    1006         6649 :         state->btps_lowkey = palloc0_object(IndexTupleData);
    1007         6649 :         state->btps_lowkey->t_info = sizeof(IndexTupleData);
    1008         6649 :         BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
    1009              :     }
    1010              : 
    1011              :     /*
    1012              :      * Add the new item into the current page.
    1013              :      */
    1014      5771728 :     last_off = OffsetNumberNext(last_off);
    1015      5771728 :     _bt_sortaddtup(npage, itupsz, itup, last_off,
    1016      5771728 :                    !isleaf && last_off == P_FIRSTKEY);
    1017              : 
    1018      5771728 :     state->btps_buf = nbuf;
    1019      5771728 :     state->btps_blkno = nblkno;
    1020      5771728 :     state->btps_lastoff = last_off;
    1021      5771728 : }
    1022              : 
    1023              : /*
    1024              :  * Finalize pending posting list tuple, and add it to the index.  Final tuple
    1025              :  * is based on saved base tuple, and saved list of heap TIDs.
    1026              :  *
    1027              :  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
    1028              :  * using _bt_buildadd().
    1029              :  */
    1030              : static void
    1031      2379209 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
    1032              :                               BTDedupState dstate)
    1033              : {
    1034              :     Assert(dstate->nitems > 0);
    1035              : 
    1036      2379209 :     if (dstate->nitems == 1)
    1037      2359022 :         _bt_buildadd(wstate, state, dstate->base, 0);
    1038              :     else
    1039              :     {
    1040              :         IndexTuple  postingtuple;
    1041              :         Size        truncextra;
    1042              : 
    1043              :         /* form a tuple with a posting list */
    1044        20187 :         postingtuple = _bt_form_posting(dstate->base,
    1045        20187 :                                         dstate->htids,
    1046              :                                         dstate->nhtids);
    1047              :         /* Calculate posting list overhead */
    1048        20187 :         truncextra = IndexTupleSize(postingtuple) -
    1049        20187 :             BTreeTupleGetPostingOffset(postingtuple);
    1050              : 
    1051        20187 :         _bt_buildadd(wstate, state, postingtuple, truncextra);
    1052        20187 :         pfree(postingtuple);
    1053              :     }
    1054              : 
    1055      2379209 :     dstate->nmaxitems = 0;
    1056      2379209 :     dstate->nhtids = 0;
    1057      2379209 :     dstate->nitems = 0;
    1058      2379209 :     dstate->phystupsize = 0;
    1059      2379209 : }
    1060              : 
    1061              : /*
    1062              :  * Finish writing out the completed btree.
    1063              :  */
    1064              : static void
    1065        26128 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
    1066              : {
    1067              :     BTPageState *s;
    1068        26128 :     BlockNumber rootblkno = P_NONE;
    1069        26128 :     uint32      rootlevel = 0;
    1070              :     BulkWriteBuffer metabuf;
    1071              : 
    1072              :     /*
    1073              :      * Each iteration of this loop completes one more level of the tree.
    1074              :      */
    1075        32777 :     for (s = state; s != NULL; s = s->btps_next)
    1076              :     {
    1077              :         BlockNumber blkno;
    1078              :         BTPageOpaque opaque;
    1079              : 
    1080         6649 :         blkno = s->btps_blkno;
    1081         6649 :         opaque = BTPageGetOpaque((Page) s->btps_buf);
    1082              : 
    1083              :         /*
    1084              :          * We have to link the last page on this level to somewhere.
    1085              :          *
    1086              :          * If we're at the top, it's the root, so attach it to the metapage.
    1087              :          * Otherwise, add an entry for it to its parent using its low key.
    1088              :          * This may cause the last page of the parent level to split, but
    1089              :          * that's not a problem -- we haven't gotten to it yet.
    1090              :          */
    1091         6649 :         if (s->btps_next == NULL)
    1092              :         {
    1093         5282 :             opaque->btpo_flags |= BTP_ROOT;
    1094         5282 :             rootblkno = blkno;
    1095         5282 :             rootlevel = s->btps_level;
    1096              :         }
    1097              :         else
    1098              :         {
    1099              :             Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
    1100              :                     IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1101              :                     BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
    1102              :                    P_LEFTMOST(opaque));
    1103              :             Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
    1104              :                    !P_LEFTMOST(opaque));
    1105         1367 :             BTreeTupleSetDownLink(s->btps_lowkey, blkno);
    1106         1367 :             _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
    1107         1367 :             pfree(s->btps_lowkey);
    1108         1367 :             s->btps_lowkey = NULL;
    1109              :         }
    1110              : 
    1111              :         /*
    1112              :          * This is the rightmost page, so the ItemId array needs to be slid
    1113              :          * back one slot.  Then we can dump out the page.
    1114              :          */
    1115         6649 :         _bt_slideleft((Page) s->btps_buf);
    1116         6649 :         _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
    1117         6649 :         s->btps_buf = NULL;      /* writepage took ownership of the buffer */
    1118              :     }
    1119              : 
    1120              :     /*
    1121              :      * As the last step in the process, construct the metapage and make it
    1122              :      * point to the new root (unless we had no data at all, in which case it's
    1123              :      * set to point to "P_NONE").  This changes the index to the "valid" state
    1124              :      * by filling in a valid magic number in the metapage.
    1125              :      */
    1126        26128 :     metabuf = smgr_bulk_get_buf(wstate->bulkstate);
    1127        26128 :     _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
    1128        26128 :                      wstate->inskey->allequalimage);
    1129        26128 :     _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
    1130        26128 : }
    1131              : 
/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 *
 * Three mutually exclusive loading strategies, chosen below:
 *   1. merge      -- btspool2 exists (unique-index dead-tuple spool); merge
 *                    the two sorted streams on the fly.
 *   2. deduplicate -- single spool, equal tuples folded into posting lists.
 *   3. plain      -- single spool, tuples appended one at a time.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
    BTPageState *state = NULL;
    bool        merge = (btspool2 != NULL);
    IndexTuple  itup,
                itup2 = NULL;
    bool        load1;
    TupleDesc   tupdes = RelationGetDescr(wstate->index);
    int         i,
                keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
    SortSupport sortKeys;
    int64       tuples_done = 0;
    bool        deduplicate;

    wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);

    /*
     * Deduplicate only when every key column's opclass is "equalimage",
     * the index is not unique, and the deduplicate_items reloption allows
     * it (BTGetDeduplicateItems).
     */
    deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
        BTGetDeduplicateItems(wstate->index);

    if (merge)
    {
        /*
         * Another BTSpool for dead tuples exists. Now we have to merge
         * btspool and btspool2.
         */

        /* the preparation of merge */
        itup = tuplesort_getindextuple(btspool->sortstate, true);
        itup2 = tuplesort_getindextuple(btspool2->sortstate, true);

        /* Prepare SortSupport data for each column */
        sortKeys = palloc0_array(SortSupportData, keysz);

        for (i = 0; i < keysz; i++)
        {
            SortSupport sortKey = sortKeys + i;
            ScanKey     scanKey = wstate->inskey->scankeys + i;
            bool        reverse;

            sortKey->ssup_cxt = CurrentMemoryContext;
            sortKey->ssup_collation = scanKey->sk_collation;
            sortKey->ssup_nulls_first =
                (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
            sortKey->ssup_attno = scanKey->sk_attno;
            /* Abbreviation is not supported here */
            sortKey->abbreviate = false;

            Assert(sortKey->ssup_attno != 0);

            reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;

            PrepareSortSupportFromIndexRel(wstate->index, reverse, sortKey);
        }

        /* Merge loop: each iteration emits one tuple from one spool */
        for (;;)
        {
            load1 = true;       /* load BTSpool next ? */
            if (itup2 == NULL)
            {
                /* spool 2 exhausted; stop once spool 1 is exhausted too */
                if (itup == NULL)
                    break;
            }
            else if (itup != NULL)
            {
                /* Both streams live: compare leading tuples column by column */
                int32       compare = 0;

                for (i = 1; i <= keysz; i++)
                {
                    SortSupport entry;
                    Datum       attrDatum1,
                                attrDatum2;
                    bool        isNull1,
                                isNull2;

                    entry = sortKeys + i - 1;
                    attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
                    attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);

                    compare = ApplySortComparator(attrDatum1, isNull1,
                                                  attrDatum2, isNull2,
                                                  entry);
                    if (compare > 0)
                    {
                        /* itup sorts after itup2: take from spool 2 first */
                        load1 = false;
                        break;
                    }
                    else if (compare < 0)
                        break;  /* itup sorts first; later columns moot */
                }

                /*
                 * If key values are equal, we sort on ItemPointer.  This is
                 * required for btree indexes, since heap TID is treated as an
                 * implicit last key attribute in order to ensure that all
                 * keys in the index are physically unique.
                 */
                if (compare == 0)
                {
                    compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
                    Assert(compare != 0);
                    if (compare > 0)
                        load1 = false;
                }
            }
            else
                load1 = false;  /* spool 1 exhausted; drain spool 2 */

            /* When we see first tuple, create first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            if (load1)
            {
                _bt_buildadd(wstate, state, itup, 0);
                itup = tuplesort_getindextuple(btspool->sortstate, true);
            }
            else
            {
                _bt_buildadd(wstate, state, itup2, 0);
                itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
            }

            /* Report progress */
            pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
                                         ++tuples_done);
        }
        pfree(sortKeys);
    }
    else if (deduplicate)
    {
        /* merge is unnecessary, deduplicate into posting lists */
        BTDedupState dstate;

        dstate = palloc_object(BTDedupStateData);
        dstate->deduplicate = true; /* unused */
        dstate->nmaxitems = 0;   /* unused */
        dstate->maxpostingsize = 0; /* set later */
        /* Metadata about base tuple of current pending posting list */
        dstate->base = NULL;
        dstate->baseoff = InvalidOffsetNumber;   /* unused */
        dstate->basetupsize = 0;
        /* Metadata about current pending posting list TIDs */
        dstate->htids = NULL;
        dstate->nhtids = 0;
        dstate->nitems = 0;
        dstate->phystupsize = 0; /* unused */
        dstate->nintervals = 0; /* unused */

        while ((itup = tuplesort_getindextuple(btspool->sortstate,
                                               true)) != NULL)
        {
            /* When we see first tuple, create first index page */
            if (state == NULL)
            {
                state = _bt_pagestate(wstate, 0);

                /*
                 * Limit size of posting list tuples to 1/10 space we want to
                 * leave behind on the page, plus space for final item's line
                 * pointer.  This is equal to the space that we'd like to
                 * leave behind on each leaf page when fillfactor is 90,
                 * allowing us to get close to fillfactor% space utilization
                 * when there happen to be a great many duplicates.  (This
                 * makes higher leaf fillfactor settings ineffective when
                 * building indexes that have many duplicates, but packing
                 * leaf pages full with few very large tuples doesn't seem
                 * like a useful goal.)
                 */
                dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
                    sizeof(ItemIdData);
                Assert(dstate->maxpostingsize <= BTMaxItemSize &&
                       dstate->maxpostingsize <= INDEX_SIZE_MASK);
                dstate->htids = palloc(dstate->maxpostingsize);

                /* start new pending posting list with itup copy */
                _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
                                        InvalidOffsetNumber);
            }
            else if (_bt_keep_natts_fast(wstate->index, dstate->base,
                                         itup) > keysz &&
                     _bt_dedup_save_htid(dstate, itup))
            {
                /*
                 * Tuple is equal to base tuple of pending posting list.  Heap
                 * TID from itup has been saved in state.
                 */
            }
            else
            {
                /*
                 * Tuple is not equal to pending posting list tuple, or
                 * _bt_dedup_save_htid() opted to not merge current item into
                 * pending posting list.
                 */
                _bt_sort_dedup_finish_pending(wstate, state, dstate);
                pfree(dstate->base);

                /* start new pending posting list with itup copy */
                _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
                                        InvalidOffsetNumber);
            }

            /* Report progress */
            pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
                                         ++tuples_done);
        }

        if (state)
        {
            /*
             * Handle the last item (there must be a last item when the
             * tuplesort returned one or more tuples)
             */
            _bt_sort_dedup_finish_pending(wstate, state, dstate);
            pfree(dstate->base);
            pfree(dstate->htids);
        }

        pfree(dstate);
    }
    else
    {
        /* merging and deduplication are both unnecessary */
        while ((itup = tuplesort_getindextuple(btspool->sortstate,
                                               true)) != NULL)
        {
            /* When we see first tuple, create first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            _bt_buildadd(wstate, state, itup, 0);

            /* Report progress */
            pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
                                         ++tuples_done);
        }
    }

    /* Close down final pages and write the metapage */
    _bt_uppershutdown(wstate, state);
    smgr_bulk_finish(wstate->bulkstate);
}
    1379              : 
/*
 * Create parallel context, and launch workers for leader.
 *
 * buildstate argument should be initialized (with the exception of the
 * tuplesort state in spools, which may later be created based on shared
 * state initially set up here).
 *
 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
 *
 * request is the target number of parallel worker processes to launch.
 *
 * Sets buildstate's BTLeader, which caller must use to shut down parallel
 * mode by passing it to _bt_end_parallel() at the very end of its index
 * build.  If not even a single worker process can be launched, this is
 * never set, and caller should proceed with a serial index build.
 *
 * NOTE: the shm_toc_estimate_* calls below must stay in sync with the
 * matching shm_toc_allocate/shm_toc_insert calls later in this function;
 * each reserved chunk/key is consumed exactly once.
 */
static void
_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
{
    ParallelContext *pcxt;
    int         scantuplesortstates;
    Snapshot    snapshot;
    Size        estbtshared;
    Size        estsort;
    BTShared   *btshared;
    Sharedsort *sharedsort;
    Sharedsort *sharedsort2;
    BTSpool    *btspool = buildstate->spool;
    BTLeader   *btleader = palloc0_object(BTLeader);
    WalUsage   *walusage;
    BufferUsage *bufferusage;
    bool        leaderparticipates = true;
    int         querylen;

#ifdef DISABLE_LEADER_PARTICIPATION
    leaderparticipates = false;
#endif

    /*
     * Enter parallel mode, and create context for parallel build of btree
     * index
     */
    EnterParallelMode();
    Assert(request > 0);
    pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
                                 request);

    /* Leader participation adds one more tuplesort participant */
    scantuplesortstates = leaderparticipates ? request + 1 : request;

    /*
     * Prepare for scan of the base relation.  In a normal index build, we use
     * SnapshotAny because we must retrieve all tuples and do our own time
     * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
     * concurrent build, we take a regular MVCC snapshot and index whatever's
     * live according to that.
     */
    if (!isconcurrent)
        snapshot = SnapshotAny;
    else
        snapshot = RegisterSnapshot(GetTransactionSnapshot());

    /*
     * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
     * PARALLEL_KEY_TUPLESORT tuplesort workspace
     */
    estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
    shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    estsort = tuplesort_estimate_shared(scantuplesortstates);
    shm_toc_estimate_chunk(&pcxt->estimator, estsort);

    /*
     * Unique case requires a second spool, and so we may have to account for
     * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
     */
    if (!btspool->isunique)
        shm_toc_estimate_keys(&pcxt->estimator, 2);
    else
    {
        shm_toc_estimate_chunk(&pcxt->estimator, estsort);
        shm_toc_estimate_keys(&pcxt->estimator, 3);
    }

    /*
     * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
     * and PARALLEL_KEY_BUFFER_USAGE.
     *
     * If there are no extensions loaded that care, we could skip this.  We
     * have no way of knowing whether anyone's looking at pgWalUsage or
     * pgBufferUsage, so do it unconditionally.
     */
    shm_toc_estimate_chunk(&pcxt->estimator,
                           mul_size(sizeof(WalUsage), pcxt->nworkers));
    shm_toc_estimate_keys(&pcxt->estimator, 1);
    shm_toc_estimate_chunk(&pcxt->estimator,
                           mul_size(sizeof(BufferUsage), pcxt->nworkers));
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    if (debug_query_string)
    {
        querylen = strlen(debug_query_string);
        shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
        shm_toc_estimate_keys(&pcxt->estimator, 1);
    }
    else
        querylen = 0;           /* keep compiler quiet */

    /* Everyone's had a chance to ask for space, so now create the DSM */
    InitializeParallelDSM(pcxt);

    /* If no DSM segment was available, back out (do serial build) */
    if (pcxt->seg == NULL)
    {
        if (IsMVCCSnapshot(snapshot))
            UnregisterSnapshot(snapshot);
        DestroyParallelContext(pcxt);
        ExitParallelMode();
        return;
    }

    /* Store shared build state, for which we reserved space */
    btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    /* Initialize immutable state */
    btshared->heaprelid = RelationGetRelid(btspool->heap);
    btshared->indexrelid = RelationGetRelid(btspool->index);
    btshared->isunique = btspool->isunique;
    btshared->nulls_not_distinct = btspool->nulls_not_distinct;
    btshared->isconcurrent = isconcurrent;
    btshared->scantuplesortstates = scantuplesortstates;
    btshared->queryid = pgstat_get_my_query_id();
    ConditionVariableInit(&btshared->workersdonecv);
    SpinLockInit(&btshared->mutex);
    /* Initialize mutable state */
    btshared->nparticipantsdone = 0;
    btshared->reltuples = 0.0;
    btshared->havedead = false;
    btshared->indtuples = 0.0;
    btshared->brokenhotchain = false;
    table_parallelscan_initialize(btspool->heap,
                                  ParallelTableScanFromBTShared(btshared),
                                  snapshot);

    /*
     * Store shared tuplesort-private state, for which we reserved space.
     * Then, initialize opaque state using tuplesort routine.
     */
    sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    tuplesort_initialize_shared(sharedsort, scantuplesortstates,
                                pcxt->seg);

    shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

    /* Unique case requires a second spool, and associated shared state */
    if (!btspool->isunique)
        sharedsort2 = NULL;
    else
    {
        /*
         * Store additional shared tuplesort-private state, for which we
         * reserved space.  Then, initialize opaque state using tuplesort
         * routine.
         */
        sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
        tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
                                    pcxt->seg);

        shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    }

    /* Store query string for workers */
    if (debug_query_string)
    {
        char       *sharedquery;

        sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
        memcpy(sharedquery, debug_query_string, querylen + 1);
        shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    }

    /*
     * Allocate space for each worker's WalUsage and BufferUsage; no need to
     * initialize.
     */
    walusage = shm_toc_allocate(pcxt->toc,
                                mul_size(sizeof(WalUsage), pcxt->nworkers));
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    bufferusage = shm_toc_allocate(pcxt->toc,
                                   mul_size(sizeof(BufferUsage), pcxt->nworkers));
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);

    /* Launch workers, saving status for leader/caller */
    LaunchParallelWorkers(pcxt);
    btleader->pcxt = pcxt;
    btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    if (leaderparticipates)
        btleader->nparticipanttuplesorts++;
    btleader->btshared = btshared;
    btleader->sharedsort = sharedsort;
    btleader->sharedsort2 = sharedsort2;
    btleader->snapshot = snapshot;
    btleader->walusage = walusage;
    btleader->bufferusage = bufferusage;

    /* If no workers were successfully launched, back out (do serial build) */
    if (pcxt->nworkers_launched == 0)
    {
        /* _bt_end_parallel also releases snapshot and exits parallel mode */
        _bt_end_parallel(btleader);
        return;
    }

    /* Save leader state now that it's clear build will be parallel */
    buildstate->btleader = btleader;

    /* Join heap scan ourselves */
    if (leaderparticipates)
        _bt_leader_participate_as_worker(buildstate);

    /*
     * Caller needs to wait for all launched workers when we return.  Make
     * sure that the failure-to-start case will not hang forever.
     */
    WaitForParallelWorkersToAttach(pcxt);
}
    1604              : 
    1605              : /*
    1606              :  * Shut down workers, destroy parallel context, and end parallel mode.
    1607              :  */
    1608              : static void
    1609           80 : _bt_end_parallel(BTLeader *btleader)
    1610              : {
    1611              :     int         i;
    1612              : 
    1613              :     /* Shutdown worker processes */
    1614           80 :     WaitForParallelWorkersToFinish(btleader->pcxt);
    1615              : 
    1616              :     /*
    1617              :      * Next, accumulate WAL usage.  (This must wait for the workers to finish,
    1618              :      * or we might get incomplete data.)
    1619              :      */
    1620          163 :     for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
    1621           83 :         InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
    1622              : 
    1623              :     /* Free last reference to MVCC snapshot, if one was used */
    1624           80 :     if (IsMVCCSnapshot(btleader->snapshot))
    1625            0 :         UnregisterSnapshot(btleader->snapshot);
    1626           80 :     DestroyParallelContext(btleader->pcxt);
    1627           80 :     ExitParallelMode();
    1628           80 : }
    1629              : 
    1630              : /*
    1631              :  * Returns size of shared memory required to store state for a parallel
    1632              :  * btree index build based on the snapshot its parallel scan will use.
    1633              :  */
    1634              : static Size
    1635           80 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    1636              : {
    1637              :     /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    1638           80 :     return add_size(BUFFERALIGN(sizeof(BTShared)),
    1639              :                     table_parallelscan_estimate(heap, snapshot));
    1640              : }
    1641              : 
    1642              : /*
    1643              :  * Within leader, wait for end of heap scan.
    1644              :  *
    1645              :  * When called, parallel heap scan started by _bt_begin_parallel() will
    1646              :  * already be underway within worker processes (when leader participates
    1647              :  * as a worker, we should end up here just as workers are finishing).
    1648              :  *
    1649              :  * Fills in fields needed for ambuild statistics, and lets caller set
    1650              :  * field indicating that some worker encountered a broken HOT chain.
    1651              :  *
    1652              :  * Returns the total number of heap tuples scanned.
    1653              :  */
static double
_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
{
    BTShared   *btshared = buildstate->btleader->btshared;
    int         nparticipanttuplesorts;
    double      reltuples;

    /* Total number of participants we must wait for, leader included */
    nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    for (;;)
    {
        /* Poll shared completion count under the spinlock */
        SpinLockAcquire(&btshared->mutex);
        if (btshared->nparticipantsdone == nparticipanttuplesorts)
        {
            /*
             * All participants are done; copy out the accumulated ambuild
             * statistics while still holding the mutex, so we see a
             * consistent snapshot of the workers' contributions.
             */
            buildstate->havedead = btshared->havedead;
            buildstate->indtuples = btshared->indtuples;
            *brokenhotchain = btshared->brokenhotchain;
            reltuples = btshared->reltuples;
            SpinLockRelease(&btshared->mutex);
            break;
        }
        SpinLockRelease(&btshared->mutex);

        /* Not done yet -- sleep until a worker signals completion */
        ConditionVariableSleep(&btshared->workersdonecv,
                               WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    }

    /* Detach from the condition variable now that the wait is over */
    ConditionVariableCancelSleep();

    return reltuples;
}
    1684              : 
    1685              : /*
    1686              :  * Within leader, participate as a parallel worker.
    1687              :  */
    1688              : static void
    1689           80 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
    1690              : {
    1691           80 :     BTLeader   *btleader = buildstate->btleader;
    1692              :     BTSpool    *leaderworker;
    1693              :     BTSpool    *leaderworker2;
    1694              :     int         sortmem;
    1695              : 
    1696              :     /* Allocate memory and initialize private spool */
    1697           80 :     leaderworker = palloc0_object(BTSpool);
    1698           80 :     leaderworker->heap = buildstate->spool->heap;
    1699           80 :     leaderworker->index = buildstate->spool->index;
    1700           80 :     leaderworker->isunique = buildstate->spool->isunique;
    1701           80 :     leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
    1702              : 
    1703              :     /* Initialize second spool, if required */
    1704           80 :     if (!btleader->btshared->isunique)
    1705           45 :         leaderworker2 = NULL;
    1706              :     else
    1707              :     {
    1708              :         /* Allocate memory for worker's own private secondary spool */
    1709           35 :         leaderworker2 = palloc0_object(BTSpool);
    1710              : 
    1711              :         /* Initialize worker's own secondary spool */
    1712           35 :         leaderworker2->heap = leaderworker->heap;
    1713           35 :         leaderworker2->index = leaderworker->index;
    1714           35 :         leaderworker2->isunique = false;
    1715              :     }
    1716              : 
    1717              :     /*
    1718              :      * Might as well use reliable figure when doling out maintenance_work_mem
    1719              :      * (when requested number of workers were not launched, this will be
    1720              :      * somewhat higher than it is for other workers).
    1721              :      */
    1722           80 :     sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
    1723              : 
    1724              :     /* Perform work common to all participants */
    1725           80 :     _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
    1726              :                                btleader->sharedsort, btleader->sharedsort2,
    1727              :                                sortmem, true);
    1728              : 
    1729              : #ifdef BTREE_BUILD_STATS
    1730              :     if (log_btree_build_stats)
    1731              :     {
    1732              :         ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
    1733              :         ResetUsage();
    1734              :     }
    1735              : #endif                          /* BTREE_BUILD_STATS */
    1736           80 : }
    1737              : 
    1738              : /*
    1739              :  * Perform work within a launched parallel process.
    1740              :  */
    1741              : void
    1742           83 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
    1743              : {
    1744              :     char       *sharedquery;
    1745              :     BTSpool    *btspool;
    1746              :     BTSpool    *btspool2;
    1747              :     BTShared   *btshared;
    1748              :     Sharedsort *sharedsort;
    1749              :     Sharedsort *sharedsort2;
    1750              :     Relation    heapRel;
    1751              :     Relation    indexRel;
    1752              :     LOCKMODE    heapLockmode;
    1753              :     LOCKMODE    indexLockmode;
    1754              :     WalUsage   *walusage;
    1755              :     BufferUsage *bufferusage;
    1756              :     int         sortmem;
    1757              : 
    1758              : #ifdef BTREE_BUILD_STATS
    1759              :     if (log_btree_build_stats)
    1760              :         ResetUsage();
    1761              : #endif                          /* BTREE_BUILD_STATS */
    1762              : 
    1763              :     /*
    1764              :      * The only possible status flag that can be set to the parallel worker is
    1765              :      * PROC_IN_SAFE_IC.
    1766              :      */
    1767              :     Assert((MyProc->statusFlags == 0) ||
    1768              :            (MyProc->statusFlags == PROC_IN_SAFE_IC));
    1769              : 
    1770              :     /* Set debug_query_string for individual workers first */
    1771           83 :     sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    1772           83 :     debug_query_string = sharedquery;
    1773              : 
    1774              :     /* Report the query string from leader */
    1775           83 :     pgstat_report_activity(STATE_RUNNING, debug_query_string);
    1776              : 
    1777              :     /* Look up nbtree shared state */
    1778           83 :     btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    1779              : 
    1780              :     /* Open relations using lock modes known to be obtained by index.c */
    1781           83 :     if (!btshared->isconcurrent)
    1782              :     {
    1783           83 :         heapLockmode = ShareLock;
    1784           83 :         indexLockmode = AccessExclusiveLock;
    1785              :     }
    1786              :     else
    1787              :     {
    1788            0 :         heapLockmode = ShareUpdateExclusiveLock;
    1789            0 :         indexLockmode = RowExclusiveLock;
    1790              :     }
    1791              : 
    1792              :     /* Track query ID */
    1793           83 :     pgstat_report_query_id(btshared->queryid, false);
    1794              : 
    1795              :     /* Open relations within worker */
    1796           83 :     heapRel = table_open(btshared->heaprelid, heapLockmode);
    1797           83 :     indexRel = index_open(btshared->indexrelid, indexLockmode);
    1798              : 
    1799              :     /* Initialize worker's own spool */
    1800           83 :     btspool = palloc0_object(BTSpool);
    1801           83 :     btspool->heap = heapRel;
    1802           83 :     btspool->index = indexRel;
    1803           83 :     btspool->isunique = btshared->isunique;
    1804           83 :     btspool->nulls_not_distinct = btshared->nulls_not_distinct;
    1805              : 
    1806              :     /* Look up shared state private to tuplesort.c */
    1807           83 :     sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    1808           83 :     tuplesort_attach_shared(sharedsort, seg);
    1809           83 :     if (!btshared->isunique)
    1810              :     {
    1811           48 :         btspool2 = NULL;
    1812           48 :         sharedsort2 = NULL;
    1813              :     }
    1814              :     else
    1815              :     {
    1816              :         /* Allocate memory for worker's own private secondary spool */
    1817           35 :         btspool2 = palloc0_object(BTSpool);
    1818              : 
    1819              :         /* Initialize worker's own secondary spool */
    1820           35 :         btspool2->heap = btspool->heap;
    1821           35 :         btspool2->index = btspool->index;
    1822           35 :         btspool2->isunique = false;
    1823              :         /* Look up shared state private to tuplesort.c */
    1824           35 :         sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
    1825           35 :         tuplesort_attach_shared(sharedsort2, seg);
    1826              :     }
    1827              : 
    1828              :     /* Prepare to track buffer usage during parallel execution */
    1829           83 :     InstrStartParallelQuery();
    1830              : 
    1831              :     /* Perform sorting of spool, and possibly a spool2 */
    1832           83 :     sortmem = maintenance_work_mem / btshared->scantuplesortstates;
    1833           83 :     _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
    1834              :                                sharedsort2, sortmem, false);
    1835              : 
    1836              :     /* Report WAL/buffer usage during parallel execution */
    1837           83 :     bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    1838           83 :     walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    1839           83 :     InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
    1840           83 :                           &walusage[ParallelWorkerNumber]);
    1841              : 
    1842              : #ifdef BTREE_BUILD_STATS
    1843              :     if (log_btree_build_stats)
    1844              :     {
    1845              :         ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
    1846              :         ResetUsage();
    1847              :     }
    1848              : #endif                          /* BTREE_BUILD_STATS */
    1849              : 
    1850           83 :     index_close(indexRel, indexLockmode);
    1851           83 :     table_close(heapRel, heapLockmode);
    1852           83 : }
    1853              : 
    1854              : /*
    1855              :  * Perform a worker's portion of a parallel sort.
    1856              :  *
    1857              :  * This generates a tuplesort for passed btspool, and a second tuplesort
    1858              :  * state if a second btspool is need (i.e. for unique index builds).  All
    1859              :  * other spool fields should already be set when this is called.
    1860              :  *
    1861              :  * sortmem is the amount of working memory to use within each worker,
    1862              :  * expressed in KBs.
    1863              :  *
    1864              :  * When this returns, workers are done, and need only release resources.
    1865              :  */
    1866              : static void
    1867          163 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
    1868              :                            BTShared *btshared, Sharedsort *sharedsort,
    1869              :                            Sharedsort *sharedsort2, int sortmem, bool progress)
    1870              : {
    1871              :     SortCoordinate coordinate;
    1872              :     BTBuildState buildstate;
    1873              :     TableScanDesc scan;
    1874              :     double      reltuples;
    1875              :     IndexInfo  *indexInfo;
    1876              : 
    1877              :     /* Initialize local tuplesort coordination state */
    1878          163 :     coordinate = palloc0_object(SortCoordinateData);
    1879          163 :     coordinate->isWorker = true;
    1880          163 :     coordinate->nParticipants = -1;
    1881          163 :     coordinate->sharedsort = sharedsort;
    1882              : 
    1883              :     /* Begin "partial" tuplesort */
    1884          326 :     btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
    1885              :                                                      btspool->index,
    1886          163 :                                                      btspool->isunique,
    1887          163 :                                                      btspool->nulls_not_distinct,
    1888              :                                                      sortmem, coordinate,
    1889              :                                                      TUPLESORT_NONE);
    1890              : 
    1891              :     /*
    1892              :      * Just as with serial case, there may be a second spool.  If so, a
    1893              :      * second, dedicated spool2 partial tuplesort is required.
    1894              :      */
    1895          163 :     if (btspool2)
    1896              :     {
    1897              :         SortCoordinate coordinate2;
    1898              : 
    1899              :         /*
    1900              :          * We expect that the second one (for dead tuples) won't get very
    1901              :          * full, so we give it only work_mem (unless sortmem is less for
    1902              :          * worker).  Worker processes are generally permitted to allocate
    1903              :          * work_mem independently.
    1904              :          */
    1905           70 :         coordinate2 = palloc0_object(SortCoordinateData);
    1906           70 :         coordinate2->isWorker = true;
    1907           70 :         coordinate2->nParticipants = -1;
    1908           70 :         coordinate2->sharedsort = sharedsort2;
    1909           70 :         btspool2->sortstate =
    1910           70 :             tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
    1911              :                                         Min(sortmem, work_mem), coordinate2,
    1912              :                                         false);
    1913              :     }
    1914              : 
    1915              :     /* Fill in buildstate for _bt_build_callback() */
    1916          163 :     buildstate.isunique = btshared->isunique;
    1917          163 :     buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
    1918          163 :     buildstate.havedead = false;
    1919          163 :     buildstate.heap = btspool->heap;
    1920          163 :     buildstate.spool = btspool;
    1921          163 :     buildstate.spool2 = btspool2;
    1922          163 :     buildstate.indtuples = 0;
    1923          163 :     buildstate.btleader = NULL;
    1924              : 
    1925              :     /* Join parallel scan */
    1926          163 :     indexInfo = BuildIndexInfo(btspool->index);
    1927          163 :     indexInfo->ii_Concurrent = btshared->isconcurrent;
    1928          163 :     scan = table_beginscan_parallel(btspool->heap,
    1929              :                                     ParallelTableScanFromBTShared(btshared));
    1930          163 :     reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
    1931              :                                        true, progress, _bt_build_callback,
    1932              :                                        &buildstate, scan);
    1933              : 
    1934              :     /* Execute this worker's part of the sort */
    1935          163 :     if (progress)
    1936           80 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1937              :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_1);
    1938          163 :     tuplesort_performsort(btspool->sortstate);
    1939          163 :     if (btspool2)
    1940              :     {
    1941           70 :         if (progress)
    1942           35 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1943              :                                          PROGRESS_BTREE_PHASE_PERFORMSORT_2);
    1944           70 :         tuplesort_performsort(btspool2->sortstate);
    1945              :     }
    1946              : 
    1947              :     /*
    1948              :      * Done.  Record ambuild statistics, and whether we encountered a broken
    1949              :      * HOT chain.
    1950              :      */
    1951          163 :     SpinLockAcquire(&btshared->mutex);
    1952          163 :     btshared->nparticipantsdone++;
    1953          163 :     btshared->reltuples += reltuples;
    1954          163 :     if (buildstate.havedead)
    1955            0 :         btshared->havedead = true;
    1956          163 :     btshared->indtuples += buildstate.indtuples;
    1957          163 :     if (indexInfo->ii_BrokenHotChain)
    1958            0 :         btshared->brokenhotchain = true;
    1959          163 :     SpinLockRelease(&btshared->mutex);
    1960              : 
    1961              :     /* Notify leader */
    1962          163 :     ConditionVariableSignal(&btshared->workersdonecv);
    1963              : 
    1964              :     /* We can end tuplesorts immediately */
    1965          163 :     tuplesort_end(btspool->sortstate);
    1966          163 :     if (btspool2)
    1967           70 :         tuplesort_end(btspool2->sortstate);
    1968          163 : }
        

Generated by: LCOV version 2.0-1