LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsort.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 96.9 % 555 538
Test Date: 2026-03-27 22:16:19 Functions: 100.0 % 22 22
Legend: Lines: hit / not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * nbtsort.c
       4              :  *      Build a btree from sorted input by loading leaf pages sequentially.
       5              :  *
       6              :  * NOTES
       7              :  *
       8              :  * We use tuplesort.c to sort the given index tuples into order.
       9              :  * Then we scan the index tuples in order and build the btree pages
      10              :  * for each level.  We load source tuples into leaf-level pages.
      11              :  * Whenever we fill a page at one level, we add a link to it to its
      12              :  * parent level (starting a new parent level if necessary).  When
      13              :  * done, we write out each final page on each level, adding it to
      14              :  * its parent level.  When we have only one page on a level, it must be
      15              :  * the root -- it can be attached to the btree metapage and we are done.
      16              :  *
      17              :  * It is not wise to pack the pages entirely full, since then *any*
      18              :  * insertion would cause a split (and not only of the leaf page; the need
      19              :  * for a split would cascade right up the tree).  The steady-state load
      20              :  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
      21              :  * pages to the user-controllable fill factor (default 90%) while upper pages
      22              :  * are always packed to 70%.  This gives us reasonable density (there aren't
      23              :  * many upper pages if the keys are reasonable-size) without risking a lot of
      24              :  * cascading splits during early insertions.
      25              :  *
      26              :  * We use the bulk smgr loading facility to bypass the buffer cache and
      27              :  * WAL-log the pages efficiently.
      28              :  *
      29              :  * This code isn't concerned about the FSM at all. The caller is responsible
      30              :  * for initializing that.
      31              :  *
      32              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      33              :  * Portions Copyright (c) 1994, Regents of the University of California
      34              :  *
      35              :  * IDENTIFICATION
      36              :  *    src/backend/access/nbtree/nbtsort.c
      37              :  *
      38              :  *-------------------------------------------------------------------------
      39              :  */
      40              : 
      41              : #include "postgres.h"
      42              : 
      43              : #include "access/nbtree.h"
      44              : #include "access/parallel.h"
      45              : #include "access/relscan.h"
      46              : #include "access/table.h"
      47              : #include "access/tableam.h"
      48              : #include "access/xact.h"
      49              : #include "catalog/index.h"
      50              : #include "commands/progress.h"
      51              : #include "executor/instrument.h"
      52              : #include "miscadmin.h"
      53              : #include "pgstat.h"
      54              : #include "storage/bulk_write.h"
      55              : #include "storage/condition_variable.h"
      56              : #include "storage/proc.h"
      57              : #include "tcop/tcopprot.h"
      58              : #include "utils/rel.h"
      59              : #include "utils/sortsupport.h"
      60              : #include "utils/tuplesort.h"
      61              : #include "utils/wait_event.h"
      62              : 
      63              : 
/*
 * Magic numbers for parallel state sharing.
 *
 * These keys identify the per-build entries in the parallel DSM segment's
 * shm_toc; workers use them to look up the state the leader allocated.
 */
#define PARALLEL_KEY_BTREE_SHARED       UINT64CONST(0xA000000000000001)
#define PARALLEL_KEY_TUPLESORT          UINT64CONST(0xA000000000000002)
#define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
#define PARALLEL_KEY_QUERY_TEXT         UINT64CONST(0xA000000000000004)
#define PARALLEL_KEY_WAL_USAGE          UINT64CONST(0xA000000000000005)
#define PARALLEL_KEY_BUFFER_USAGE       UINT64CONST(0xA000000000000006)

/*
 * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
 * parallel index builds.  This may be useful as a debugging aid.
 */
/* #define DISABLE_LEADER_PARTICIPATION */
      77              : 
      78              : /*
      79              :  * Status record for spooling/sorting phase.  (Note we may have two of
      80              :  * these due to the special requirements for uniqueness-checking with
      81              :  * dead tuples.)
      82              :  */
      83              : typedef struct BTSpool
      84              : {
      85              :     Tuplesortstate *sortstate;  /* state data for tuplesort.c */
      86              :     Relation    heap;
      87              :     Relation    index;
      88              :     bool        isunique;
      89              :     bool        nulls_not_distinct;
      90              : } BTSpool;
      91              : 
      92              : /*
      93              :  * Status for index builds performed in parallel.  This is allocated in a
      94              :  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
      95              :  * entry, private to tuplesort.c but allocated by this module on its behalf.
      96              :  */
      97              : typedef struct BTShared
      98              : {
      99              :     /*
     100              :      * These fields are not modified during the sort.  They primarily exist
     101              :      * for the benefit of worker processes that need to create BTSpool state
     102              :      * corresponding to that used by the leader.
     103              :      */
     104              :     Oid         heaprelid;
     105              :     Oid         indexrelid;
     106              :     bool        isunique;
     107              :     bool        nulls_not_distinct;
     108              :     bool        isconcurrent;
     109              :     int         scantuplesortstates;
     110              : 
     111              :     /* Query ID, for report in worker processes */
     112              :     int64       queryid;
     113              : 
     114              :     /*
     115              :      * workersdonecv is used to monitor the progress of workers.  All parallel
     116              :      * participants must indicate that they are done before leader can use
     117              :      * mutable state that workers maintain during scan (and before leader can
     118              :      * proceed to tuplesort_performsort()).
     119              :      */
     120              :     ConditionVariable workersdonecv;
     121              : 
     122              :     /*
     123              :      * mutex protects all fields before heapdesc.
     124              :      *
     125              :      * These fields contain status information of interest to B-Tree index
     126              :      * builds that must work just the same when an index is built in parallel.
     127              :      */
     128              :     slock_t     mutex;
     129              : 
     130              :     /*
     131              :      * Mutable state that is maintained by workers, and reported back to
     132              :      * leader at end of parallel scan.
     133              :      *
     134              :      * nparticipantsdone is number of worker processes finished.
     135              :      *
     136              :      * reltuples is the total number of input heap tuples.
     137              :      *
     138              :      * havedead indicates if RECENTLY_DEAD tuples were encountered during
     139              :      * build.
     140              :      *
     141              :      * indtuples is the total number of tuples that made it into the index.
     142              :      *
     143              :      * brokenhotchain indicates if any worker detected a broken HOT chain
     144              :      * during build.
     145              :      */
     146              :     int         nparticipantsdone;
     147              :     double      reltuples;
     148              :     bool        havedead;
     149              :     double      indtuples;
     150              :     bool        brokenhotchain;
     151              : 
     152              :     /*
     153              :      * ParallelTableScanDescData data follows. Can't directly embed here, as
     154              :      * implementations of the parallel table scan desc interface might need
     155              :      * stronger alignment.
     156              :      */
     157              : } BTShared;
     158              : 
     159              : /*
     160              :  * Return pointer to a BTShared's parallel table scan.
     161              :  *
     162              :  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
     163              :  * MAXALIGN.
     164              :  */
     165              : #define ParallelTableScanFromBTShared(shared) \
     166              :     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
     167              : 
     168              : /*
     169              :  * Status for leader in parallel index build.
     170              :  */
     171              : typedef struct BTLeader
     172              : {
     173              :     /* parallel context itself */
     174              :     ParallelContext *pcxt;
     175              : 
     176              :     /*
     177              :      * nparticipanttuplesorts is the exact number of worker processes
     178              :      * successfully launched, plus one leader process if it participates as a
     179              :      * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     180              :      * participating as a worker).
     181              :      */
     182              :     int         nparticipanttuplesorts;
     183              : 
     184              :     /*
     185              :      * Leader process convenience pointers to shared state (leader avoids TOC
     186              :      * lookups).
     187              :      *
     188              :      * btshared is the shared state for entire build.  sharedsort is the
     189              :      * shared, tuplesort-managed state passed to each process tuplesort.
     190              :      * sharedsort2 is the corresponding btspool2 shared state, used only when
     191              :      * building unique indexes.  snapshot is the snapshot used by the scan iff
     192              :      * an MVCC snapshot is required.
     193              :      */
     194              :     BTShared   *btshared;
     195              :     Sharedsort *sharedsort;
     196              :     Sharedsort *sharedsort2;
     197              :     Snapshot    snapshot;
     198              :     WalUsage   *walusage;
     199              :     BufferUsage *bufferusage;
     200              : } BTLeader;
     201              : 
     202              : /*
     203              :  * Working state for btbuild and its callback.
     204              :  *
     205              :  * When parallel CREATE INDEX is used, there is a BTBuildState for each
     206              :  * participant.
     207              :  */
     208              : typedef struct BTBuildState
     209              : {
     210              :     bool        isunique;
     211              :     bool        nulls_not_distinct;
     212              :     bool        havedead;
     213              :     Relation    heap;
     214              :     BTSpool    *spool;
     215              : 
     216              :     /*
     217              :      * spool2 is needed only when the index is a unique index. Dead tuples are
     218              :      * put into spool2 instead of spool in order to avoid uniqueness check.
     219              :      */
     220              :     BTSpool    *spool2;
     221              :     double      indtuples;
     222              : 
     223              :     /*
     224              :      * btleader is only present when a parallel index build is performed, and
     225              :      * only in the leader process. (Actually, only the leader has a
     226              :      * BTBuildState.  Workers have their own spool and spool2, though.)
     227              :      */
     228              :     BTLeader   *btleader;
     229              : } BTBuildState;
     230              : 
     231              : /*
     232              :  * Status record for a btree page being built.  We have one of these
     233              :  * for each active tree level.
     234              :  */
     235              : typedef struct BTPageState
     236              : {
     237              :     BulkWriteBuffer btps_buf;   /* workspace for page building */
     238              :     BlockNumber btps_blkno;     /* block # to write this page at */
     239              :     IndexTuple  btps_lowkey;    /* page's strict lower bound pivot tuple */
     240              :     OffsetNumber btps_lastoff;  /* last item offset loaded */
     241              :     Size        btps_lastextra; /* last item's extra posting list space */
     242              :     uint32      btps_level;     /* tree level (0 = leaf) */
     243              :     Size        btps_full;      /* "full" if less than this much free space */
     244              :     struct BTPageState *btps_next;  /* link to parent level, if any */
     245              : } BTPageState;
     246              : 
     247              : /*
     248              :  * Overall status record for index writing phase.
     249              :  */
     250              : typedef struct BTWriteState
     251              : {
     252              :     Relation    heap;
     253              :     Relation    index;
     254              :     BulkWriteState *bulkstate;
     255              :     BTScanInsert inskey;        /* generic insertion scankey */
     256              :     BlockNumber btws_pages_alloced; /* # pages allocated */
     257              : } BTWriteState;
     258              : 
     259              : 
/* Forward declarations for functions local to this file */
static double _bt_spools_heapscan(Relation heap, Relation index,
                                  BTBuildState *buildstate, IndexInfo *indexInfo);
static void _bt_spooldestroy(BTSpool *btspool);
static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
                      const Datum *values, const bool *isnull);
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
                               bool *isnull, bool tupleIsAlive, void *state);
static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page rightmostpage);
static void _bt_sortaddtup(Page page, Size itemsize,
                           const IndexTupleData *itup, OffsetNumber itup_off,
                           bool newfirstdataitem);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
                         IndexTuple itup, Size truncextra);
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
                                          BTPageState *state,
                                          BTDedupState dstate);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
                     BTSpool *btspool, BTSpool *btspool2);
static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
                               int request);
static void _bt_end_parallel(BTLeader *btleader);
static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
static double _bt_parallel_heapscan(BTBuildState *buildstate,
                                    bool *brokenhotchain);
static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
                                       BTShared *btshared, Sharedsort *sharedsort,
                                       Sharedsort *sharedsort2, int sortmem,
                                       bool progress);
     293              : 
     294              : 
     295              : /*
     296              :  *  btbuild() -- build a new btree index.
     297              :  */
     298              : IndexBuildResult *
     299        30934 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     300              : {
     301              :     IndexBuildResult *result;
     302              :     BTBuildState buildstate;
     303              :     double      reltuples;
     304              : 
     305              : #ifdef BTREE_BUILD_STATS
     306              :     if (log_btree_build_stats)
     307              :         ResetUsage();
     308              : #endif                          /* BTREE_BUILD_STATS */
     309              : 
     310        30934 :     buildstate.isunique = indexInfo->ii_Unique;
     311        30934 :     buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     312        30934 :     buildstate.havedead = false;
     313        30934 :     buildstate.heap = heap;
     314        30934 :     buildstate.spool = NULL;
     315        30934 :     buildstate.spool2 = NULL;
     316        30934 :     buildstate.indtuples = 0;
     317        30934 :     buildstate.btleader = NULL;
     318              : 
     319              :     /*
     320              :      * We expect to be called exactly once for any index relation. If that's
     321              :      * not the case, big trouble's what we have.
     322              :      */
     323        30934 :     if (RelationGetNumberOfBlocks(index) != 0)
     324            0 :         elog(ERROR, "index \"%s\" already contains data",
     325              :              RelationGetRelationName(index));
     326              : 
     327        30934 :     reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
     328              : 
     329              :     /*
     330              :      * Finish the build by (1) completing the sort of the spool file, (2)
     331              :      * inserting the sorted tuples into btree pages and (3) building the upper
     332              :      * levels.  Finally, it may also be necessary to end use of parallelism.
     333              :      */
     334        30926 :     _bt_leafbuild(buildstate.spool, buildstate.spool2);
     335        30870 :     _bt_spooldestroy(buildstate.spool);
     336        30870 :     if (buildstate.spool2)
     337           10 :         _bt_spooldestroy(buildstate.spool2);
     338        30870 :     if (buildstate.btleader)
     339          108 :         _bt_end_parallel(buildstate.btleader);
     340              : 
     341        30870 :     result = palloc_object(IndexBuildResult);
     342              : 
     343        30870 :     result->heap_tuples = reltuples;
     344        30870 :     result->index_tuples = buildstate.indtuples;
     345              : 
     346              : #ifdef BTREE_BUILD_STATS
     347              :     if (log_btree_build_stats)
     348              :     {
     349              :         ShowUsage("BTREE BUILD STATS");
     350              :         ResetUsage();
     351              :     }
     352              : #endif                          /* BTREE_BUILD_STATS */
     353              : 
     354        30870 :     return result;
     355              : }
     356              : 
     357              : /*
     358              :  * Create and initialize one or two spool structures, and save them in caller's
     359              :  * buildstate argument.  May also fill-in fields within indexInfo used by index
     360              :  * builds.
     361              :  *
     362              :  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
     363              :  * routine encapsulates all aspects of managing parallelism.  Caller need only
     364              :  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
     365              :  *
     366              :  * Returns the total number of heap tuples scanned.
     367              :  */
     368              : static double
     369        30934 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
     370              :                     IndexInfo *indexInfo)
     371              : {
     372        30934 :     BTSpool    *btspool = palloc0_object(BTSpool);
     373        30934 :     SortCoordinate coordinate = NULL;
     374        30934 :     double      reltuples = 0;
     375              : 
     376              :     /*
     377              :      * We size the sort area as maintenance_work_mem rather than work_mem to
     378              :      * speed index creation.  This should be OK since a single backend can't
     379              :      * run multiple index creations in parallel (see also: notes on
     380              :      * parallelism and maintenance_work_mem below).
     381              :      */
     382        30934 :     btspool->heap = heap;
     383        30934 :     btspool->index = index;
     384        30934 :     btspool->isunique = indexInfo->ii_Unique;
     385        30934 :     btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     386              : 
     387              :     /* Save as primary spool */
     388        30934 :     buildstate->spool = btspool;
     389              : 
     390              :     /* Report table scan phase started */
     391        30934 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     392              :                                  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
     393              : 
     394              :     /* Attempt to launch parallel worker scan when required */
     395        30934 :     if (indexInfo->ii_ParallelWorkers > 0)
     396          108 :         _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
     397              :                            indexInfo->ii_ParallelWorkers);
     398              : 
     399              :     /*
     400              :      * If parallel build requested and at least one worker process was
     401              :      * successfully launched, set up coordination state
     402              :      */
     403        30934 :     if (buildstate->btleader)
     404              :     {
     405          108 :         coordinate = palloc0_object(SortCoordinateData);
     406          108 :         coordinate->isWorker = false;
     407          108 :         coordinate->nParticipants =
     408          108 :             buildstate->btleader->nparticipanttuplesorts;
     409          108 :         coordinate->sharedsort = buildstate->btleader->sharedsort;
     410              :     }
     411              : 
     412              :     /*
     413              :      * Begin serial/leader tuplesort.
     414              :      *
     415              :      * In cases where parallelism is involved, the leader receives the same
     416              :      * share of maintenance_work_mem as a serial sort (it is generally treated
     417              :      * in the same way as a serial sort once we return).  Parallel worker
     418              :      * Tuplesortstates will have received only a fraction of
     419              :      * maintenance_work_mem, though.
     420              :      *
     421              :      * We rely on the lifetime of the Leader Tuplesortstate almost not
     422              :      * overlapping with any worker Tuplesortstate's lifetime.  There may be
     423              :      * some small overlap, but that's okay because we rely on leader
     424              :      * Tuplesortstate only allocating a small, fixed amount of memory here.
     425              :      * When its tuplesort_performsort() is called (by our caller), and
     426              :      * significant amounts of memory are likely to be used, all workers must
     427              :      * have already freed almost all memory held by their Tuplesortstates
     428              :      * (they are about to go away completely, too).  The overall effect is
     429              :      * that maintenance_work_mem always represents an absolute high watermark
     430              :      * on the amount of memory used by a CREATE INDEX operation, regardless of
     431              :      * the use of parallelism or any other factor.
     432              :      */
     433        61868 :     buildstate->spool->sortstate =
     434        30934 :         tuplesort_begin_index_btree(heap, index, buildstate->isunique,
     435        30934 :                                     buildstate->nulls_not_distinct,
     436              :                                     maintenance_work_mem, coordinate,
     437              :                                     TUPLESORT_NONE);
     438              : 
     439              :     /*
     440              :      * If building a unique index, put dead tuples in a second spool to keep
     441              :      * them out of the uniqueness check.  We expect that the second spool (for
     442              :      * dead tuples) won't get very full, so we give it only work_mem.
     443              :      */
     444        30934 :     if (indexInfo->ii_Unique)
     445              :     {
     446        24719 :         BTSpool    *btspool2 = palloc0_object(BTSpool);
     447        24719 :         SortCoordinate coordinate2 = NULL;
     448              : 
     449              :         /* Initialize secondary spool */
     450        24719 :         btspool2->heap = heap;
     451        24719 :         btspool2->index = index;
     452        24719 :         btspool2->isunique = false;
     453              :         /* Save as secondary spool */
     454        24719 :         buildstate->spool2 = btspool2;
     455              : 
     456        24719 :         if (buildstate->btleader)
     457              :         {
     458              :             /*
     459              :              * Set up non-private state that is passed to
     460              :              * tuplesort_begin_index_btree() about the basic high level
     461              :              * coordination of a parallel sort.
     462              :              */
     463           48 :             coordinate2 = palloc0_object(SortCoordinateData);
     464           48 :             coordinate2->isWorker = false;
     465           48 :             coordinate2->nParticipants =
     466           48 :                 buildstate->btleader->nparticipanttuplesorts;
     467           48 :             coordinate2->sharedsort = buildstate->btleader->sharedsort2;
     468              :         }
     469              : 
     470              :         /*
     471              :          * We expect that the second one (for dead tuples) won't get very
     472              :          * full, so we give it only work_mem
     473              :          */
     474        24719 :         buildstate->spool2->sortstate =
     475        24719 :             tuplesort_begin_index_btree(heap, index, false, false, work_mem,
     476              :                                         coordinate2, TUPLESORT_NONE);
     477              :     }
     478              : 
     479              :     /* Fill spool using either serial or parallel heap scan */
     480        30934 :     if (!buildstate->btleader)
     481        30826 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     482              :                                            _bt_build_callback, buildstate,
     483              :                                            NULL);
     484              :     else
     485          108 :         reltuples = _bt_parallel_heapscan(buildstate,
     486              :                                           &indexInfo->ii_BrokenHotChain);
     487              : 
     488              :     /*
     489              :      * Set the progress target for the next phase.  Reset the block number
     490              :      * values set by table_index_build_scan
     491              :      */
     492              :     {
     493        30926 :         const int   progress_index[] = {
     494              :             PROGRESS_CREATEIDX_TUPLES_TOTAL,
     495              :             PROGRESS_SCAN_BLOCKS_TOTAL,
     496              :             PROGRESS_SCAN_BLOCKS_DONE
     497              :         };
     498        30926 :         const int64 progress_vals[] = {
     499        30926 :             buildstate->indtuples,
     500              :             0, 0
     501              :         };
     502              : 
     503        30926 :         pgstat_progress_update_multi_param(3, progress_index, progress_vals);
     504              :     }
     505              : 
     506              :     /* okay, all heap tuples are spooled */
     507        30926 :     if (buildstate->spool2 && !buildstate->havedead)
     508              :     {
     509              :         /* spool2 turns out to be unnecessary */
     510        24709 :         _bt_spooldestroy(buildstate->spool2);
     511        24709 :         buildstate->spool2 = NULL;
     512              :     }
     513              : 
     514        30926 :     return reltuples;
     515              : }
     516              : 
     517              : /*
     518              :  * clean up a spool structure and its substructures.
     519              :  */
     520              : static void
     521        55589 : _bt_spooldestroy(BTSpool *btspool)
     522              : {
     523        55589 :     tuplesort_end(btspool->sortstate);
     524        55589 :     pfree(btspool);
     525        55589 : }
     526              : 
     527              : /*
     528              :  * spool an index entry into the sort file.
     529              :  */
     530              : static void
     531      7606729 : _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
     532              : {
     533      7606729 :     tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
     534              :                                   self, values, isnull);
     535      7606729 : }
     536              : 
     537              : /*
     538              :  * given a spool loaded by successive calls to _bt_spool,
     539              :  * create an entire btree.
     540              :  */
     541              : static void
     542        30926 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     543              : {
     544              :     BTWriteState wstate;
     545              : 
     546              : #ifdef BTREE_BUILD_STATS
     547              :     if (log_btree_build_stats)
     548              :     {
     549              :         ShowUsage("BTREE BUILD (Spool) STATISTICS");
     550              :         ResetUsage();
     551              :     }
     552              : #endif                          /* BTREE_BUILD_STATS */
     553              : 
     554              :     /* Execute the sort */
     555        30926 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     556              :                                  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
     557        30926 :     tuplesort_performsort(btspool->sortstate);
     558        30870 :     if (btspool2)
     559              :     {
     560           10 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     561              :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_2);
     562           10 :         tuplesort_performsort(btspool2->sortstate);
     563              :     }
     564              : 
     565        30870 :     wstate.heap = btspool->heap;
     566        30870 :     wstate.index = btspool->index;
     567        30870 :     wstate.inskey = _bt_mkscankey(wstate.index, NULL);
     568              :     /* _bt_mkscankey() won't set allequalimage without metapage */
     569        30870 :     wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
     570              : 
     571              :     /* reserve the metapage */
     572        30870 :     wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
     573              : 
     574        30870 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     575              :                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
     576        30870 :     _bt_load(&wstate, btspool, btspool2);
     577        30870 : }
     578              : 
     579              : /*
     580              :  * Per-tuple callback for table_index_build_scan
     581              :  */
     582              : static void
     583      7606729 : _bt_build_callback(Relation index,
     584              :                    ItemPointer tid,
     585              :                    Datum *values,
     586              :                    bool *isnull,
     587              :                    bool tupleIsAlive,
     588              :                    void *state)
     589              : {
     590      7606729 :     BTBuildState *buildstate = (BTBuildState *) state;
     591              : 
     592              :     /*
     593              :      * insert the index tuple into the appropriate spool file for subsequent
     594              :      * processing
     595              :      */
     596      7606729 :     if (tupleIsAlive || buildstate->spool2 == NULL)
     597      7606479 :         _bt_spool(buildstate->spool, tid, values, isnull);
     598              :     else
     599              :     {
     600              :         /* dead tuples are put into spool2 */
     601          250 :         buildstate->havedead = true;
     602          250 :         _bt_spool(buildstate->spool2, tid, values, isnull);
     603              :     }
     604              : 
     605      7606729 :     buildstate->indtuples += 1;
     606      7606729 : }
     607              : 
     608              : /*
     609              :  * allocate workspace for a new, clean btree page, not linked to any siblings.
     610              :  */
     611              : static BulkWriteBuffer
     612        30953 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
     613              : {
     614              :     BulkWriteBuffer buf;
     615              :     Page        page;
     616              :     BTPageOpaque opaque;
     617              : 
     618        30953 :     buf = smgr_bulk_get_buf(wstate->bulkstate);
     619        30953 :     page = (Page) buf;
     620              : 
     621              :     /* Zero the page and set up standard page header info */
     622        30953 :     _bt_pageinit(page, BLCKSZ);
     623              : 
     624              :     /* Initialize BT opaque state */
     625        30953 :     opaque = BTPageGetOpaque(page);
     626        30953 :     opaque->btpo_prev = opaque->btpo_next = P_NONE;
     627        30953 :     opaque->btpo_level = level;
     628        30953 :     opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     629        30953 :     opaque->btpo_cycleid = 0;
     630              : 
     631              :     /* Make the P_HIKEY line pointer appear allocated */
     632        30953 :     ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
     633              : 
     634        30953 :     return buf;
     635              : }
     636              : 
     637              : /*
     638              :  * emit a completed btree page, and release the working storage.
     639              :  */
     640              : static void
     641        61823 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
     642              : {
     643        61823 :     smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
     644              :     /* smgr_bulk_write took ownership of 'buf' */
     645        61823 : }
     646              : 
     647              : /*
     648              :  * allocate and initialize a new BTPageState.  the returned structure
     649              :  * is suitable for immediate use by _bt_buildadd.
     650              :  */
     651              : static BTPageState *
     652         7491 : _bt_pagestate(BTWriteState *wstate, uint32 level)
     653              : {
     654         7491 :     BTPageState *state = palloc0_object(BTPageState);
     655              : 
     656              :     /* create initial page for level */
     657         7491 :     state->btps_buf = _bt_blnewpage(wstate, level);
     658              : 
     659              :     /* and assign it a page position */
     660         7491 :     state->btps_blkno = wstate->btws_pages_alloced++;
     661              : 
     662         7491 :     state->btps_lowkey = NULL;
     663              :     /* initialize lastoff so first item goes into P_FIRSTKEY */
     664         7491 :     state->btps_lastoff = P_HIKEY;
     665         7491 :     state->btps_lastextra = 0;
     666         7491 :     state->btps_level = level;
     667              :     /* set "full" threshold based on level.  See notes at head of file. */
     668         7491 :     if (level > 0)
     669         1479 :         state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
     670              :     else
     671         6012 :         state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
     672              : 
     673              :     /* no parent level, yet */
     674         7491 :     state->btps_next = NULL;
     675              : 
     676         7491 :     return state;
     677              : }
     678              : 
     679              : /*
     680              :  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
     681              :  * P_HIKEY, overwriting P_HIKEY).
     682              :  *
     683              :  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
     684              :  * rightmost page on its level is not supposed to get a high key.  Now that
     685              :  * it's clear that this page is a rightmost page, remove the unneeded empty
     686              :  * P_HIKEY line pointer space.
     687              :  */
     688              : static void
     689         7491 : _bt_slideleft(Page rightmostpage)
     690              : {
     691              :     OffsetNumber off;
     692              :     OffsetNumber maxoff;
     693              :     ItemId      previi;
     694              : 
     695         7491 :     maxoff = PageGetMaxOffsetNumber(rightmostpage);
     696              :     Assert(maxoff >= P_FIRSTKEY);
     697         7491 :     previi = PageGetItemId(rightmostpage, P_HIKEY);
     698       443916 :     for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
     699              :     {
     700       436425 :         ItemId      thisii = PageGetItemId(rightmostpage, off);
     701              : 
     702       436425 :         *previi = *thisii;
     703       436425 :         previi = thisii;
     704              :     }
     705         7491 :     ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
     706         7491 : }
     707              : 
     708              : /*
     709              :  * Add an item to a page being built.
     710              :  *
     711              :  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
     712              :  * raises an error directly.
     713              :  *
     714              :  * Note that our nbtsort.c caller does not know yet if the page will be
     715              :  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
     716              :  * caller.  Page that turns out to be the rightmost on its level is fixed by
     717              :  * calling _bt_slideleft().
     718              :  */
     719              : static void
     720      6652160 : _bt_sortaddtup(Page page,
     721              :                Size itemsize,
     722              :                const IndexTupleData *itup,
     723              :                OffsetNumber itup_off,
     724              :                bool newfirstdataitem)
     725              : {
     726              :     IndexTupleData trunctuple;
     727              : 
     728      6652160 :     if (newfirstdataitem)
     729              :     {
     730         1558 :         trunctuple = *itup;
     731         1558 :         trunctuple.t_info = sizeof(IndexTupleData);
     732         1558 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
     733         1558 :         itup = &trunctuple;
     734         1558 :         itemsize = sizeof(IndexTupleData);
     735              :     }
     736              : 
     737      6652160 :     if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
     738            0 :         elog(ERROR, "failed to add item to the index page");
     739      6652160 : }
     740              : 
     741              : /*----------
     742              :  * Add an item to a disk page from the sort output (or add a posting list
     743              :  * item formed from the sort output).
     744              :  *
     745              :  * We must be careful to observe the page layout conventions of nbtsearch.c:
     746              :  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
     747              :  * - on non-leaf pages, the key portion of the first item need not be
     748              :  *   stored, we should store only the link.
     749              :  *
     750              :  * A leaf page being built looks like:
     751              :  *
     752              :  * +----------------+---------------------------------+
     753              :  * | PageHeaderData | linp0 linp1 linp2 ...           |
     754              :  * +-----------+----+---------------------------------+
     755              :  * | ... linpN |                                      |
     756              :  * +-----------+--------------------------------------+
     757              :  * |     ^ last                                       |
     758              :  * |                                                  |
     759              :  * +-------------+------------------------------------+
     760              :  * |             | itemN ...                          |
     761              :  * +-------------+------------------+-----------------+
     762              :  * |          ... item3 item2 item1 | "special space" |
     763              :  * +--------------------------------+-----------------+
     764              :  *
     765              :  * Contrast this with the diagram in bufpage.h; note the mismatch
     766              :  * between linps and items.  This is because we reserve linp0 as a
     767              :  * placeholder for the pointer to the "high key" item; when we have
     768              :  * filled up the page, we will set linp0 to point to itemN and clear
     769              :  * linpN.  On the other hand, if we find this is the last (rightmost)
     770              :  * page, we leave the items alone and slide the linp array over.  If
     771              :  * the high key is to be truncated, offset 1 is deleted, and we insert
     772              :  * the truncated high key at offset 1.
     773              :  *
     774              :  * 'last' pointer indicates the last offset added to the page.
     775              :  *
     776              :  * 'truncextra' is the size of the posting list in itup, if any.  This
     777              :  * information is stashed for the next call here, when we may benefit
     778              :  * from considering the impact of truncating away the posting list on
     779              :  * the page before deciding to finish the page off.  Posting lists are
     780              :  * often relatively large, so it is worth going to the trouble of
     781              :  * accounting for the saving from truncating away the posting list of
     782              :  * the tuple that becomes the high key (that may be the only way to
     783              :  * get close to target free space on the page).  Note that this is
     784              :  * only used for the soft fillfactor-wise limit, not the critical hard
     785              :  * limit.
     786              :  *----------
     787              :  */
     788              : static void
     789      6628698 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
     790              :              Size truncextra)
     791              : {
     792              :     BulkWriteBuffer nbuf;
     793              :     Page        npage;
     794              :     BlockNumber nblkno;
     795              :     OffsetNumber last_off;
     796              :     Size        last_truncextra;
     797              :     Size        pgspc;
     798              :     Size        itupsz;
     799              :     bool        isleaf;
     800              : 
     801              :     /*
     802              :      * This is a handy place to check for cancel interrupts during the btree
     803              :      * load phase of index creation.
     804              :      */
     805      6628698 :     CHECK_FOR_INTERRUPTS();
     806              : 
     807      6628698 :     nbuf = state->btps_buf;
     808      6628698 :     npage = (Page) nbuf;
     809      6628698 :     nblkno = state->btps_blkno;
     810      6628698 :     last_off = state->btps_lastoff;
     811      6628698 :     last_truncextra = state->btps_lastextra;
     812      6628698 :     state->btps_lastextra = truncextra;
     813              : 
     814      6628698 :     pgspc = PageGetFreeSpace(npage);
     815      6628698 :     itupsz = IndexTupleSize(itup);
     816      6628698 :     itupsz = MAXALIGN(itupsz);
     817              :     /* Leaf case has slightly different rules due to suffix truncation */
     818      6628698 :     isleaf = (state->btps_level == 0);
     819              : 
     820              :     /*
     821              :      * Check whether the new item can fit on a btree page on current level at
     822              :      * all.
     823              :      *
     824              :      * Every newly built index will treat heap TID as part of the keyspace,
     825              :      * which imposes the requirement that new high keys must occasionally have
     826              :      * a heap TID appended within _bt_truncate().  That may leave a new pivot
     827              :      * tuple one or two MAXALIGN() quantums larger than the original
     828              :      * firstright tuple it's derived from.  v4 deals with the problem by
     829              :      * decreasing the limit on the size of tuples inserted on the leaf level
     830              :      * by the same small amount.  Enforce the new v4+ limit on the leaf level,
     831              :      * and the old limit on internal levels, since pivot tuples may need to
     832              :      * make use of the reserved space.  This should never fail on internal
     833              :      * pages.
     834              :      */
     835      6628698 :     if (unlikely(itupsz > BTMaxItemSize))
     836          176 :         _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
     837              :                              itup);
     838              : 
     839              :     /*
     840              :      * Check to see if current page will fit new item, with space left over to
     841              :      * append a heap TID during suffix truncation when page is a leaf page.
     842              :      *
     843              :      * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
     844              :      * high key with heap TID when finishing off a leaf page, since we rely on
     845              :      * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
     846              :      * internal pages we can always fit 3 pivot tuples with larger internal
     847              :      * page tuple limit (includes page high key).
     848              :      *
     849              :      * Most of the time, a page is only "full" in the sense that the soft
     850              :      * fillfactor-wise limit has been exceeded.  However, we must always leave
     851              :      * at least two items plus a high key on each page before starting a new
     852              :      * page.  Disregard fillfactor and insert on "full" current page if we
     853              :      * don't have the minimum number of items yet.  (Note that we deliberately
     854              :      * assume that suffix truncation neither enlarges nor shrinks new high key
     855              :      * when applying soft limit, except when last tuple has a posting list.)
     856              :      */
     857              :     Assert(last_truncextra == 0 || isleaf);
     858      6628698 :     if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
     859      6627847 :         (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
     860              :     {
     861              :         /*
     862              :          * Finish off the page and write it out.
     863              :          */
     864        23462 :         BulkWriteBuffer obuf = nbuf;
     865        23462 :         Page        opage = npage;
     866        23462 :         BlockNumber oblkno = nblkno;
     867              :         ItemId      ii;
     868              :         ItemId      hii;
     869              :         IndexTuple  oitup;
     870              : 
     871              :         /* Create new page of same level */
     872        23462 :         nbuf = _bt_blnewpage(wstate, state->btps_level);
     873        23462 :         npage = (Page) nbuf;
     874              : 
     875              :         /* and assign it a page position */
     876        23462 :         nblkno = wstate->btws_pages_alloced++;
     877              : 
     878              :         /*
     879              :          * We copy the last item on the page into the new page, and then
     880              :          * rearrange the old page so that the 'last item' becomes its high key
     881              :          * rather than a true data item.  There had better be at least two
     882              :          * items on the page already, else the page would be empty of useful
     883              :          * data.
     884              :          */
     885              :         Assert(last_off > P_FIRSTKEY);
     886        23462 :         ii = PageGetItemId(opage, last_off);
     887        23462 :         oitup = (IndexTuple) PageGetItem(opage, ii);
     888        23462 :         _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
     889        23462 :                        !isleaf);
     890              : 
     891              :         /*
     892              :          * Move 'last' into the high key position on opage.  _bt_blnewpage()
     893              :          * allocated empty space for a line pointer when opage was first
     894              :          * created, so this is a matter of rearranging already-allocated space
     895              :          * on page, and initializing high key line pointer. (Actually, leaf
     896              :          * pages must also swap oitup with a truncated version of oitup, which
     897              :          * is sometimes larger than oitup, though never by more than the space
     898              :          * needed to append a heap TID.)
     899              :          */
     900        23462 :         hii = PageGetItemId(opage, P_HIKEY);
     901        23462 :         *hii = *ii;
     902        23462 :         ItemIdSetUnused(ii);    /* redundant */
     903        23462 :         ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
     904              : 
     905        23462 :         if (isleaf)
     906              :         {
     907              :             IndexTuple  lastleft;
     908              :             IndexTuple  truncated;
     909              : 
     910              :             /*
     911              :              * Truncate away any unneeded attributes from high key on leaf
     912              :              * level.  This is only done at the leaf level because downlinks
     913              :              * in internal pages are either negative infinity items, or get
     914              :              * their contents from copying from one level down.  See also:
     915              :              * _bt_split().
     916              :              *
     917              :              * We don't try to bias our choice of split point to make it more
     918              :              * likely that _bt_truncate() can truncate away more attributes,
     919              :              * whereas the split point used within _bt_split() is chosen much
     920              :              * more delicately.  Even still, the lastleft and firstright
     921              :              * tuples passed to _bt_truncate() here are at least not fully
     922              :              * equal to each other when deduplication is used, unless there is
     923              :              * a large group of duplicates (also, unique index builds usually
     924              :              * have few or no spool2 duplicates).  When the split point is
     925              :              * between two unequal tuples, _bt_truncate() will avoid including
     926              :              * a heap TID in the new high key, which is the most important
     927              :              * benefit of suffix truncation.
     928              :              *
     929              :              * Overwrite the old item with new truncated high key directly.
     930              :              * oitup is already located at the physical beginning of tuple
     931              :              * space, so this should directly reuse the existing tuple space.
     932              :              */
     933        23383 :             ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
     934        23383 :             lastleft = (IndexTuple) PageGetItem(opage, ii);
     935              : 
     936              :             Assert(IndexTupleSize(oitup) > last_truncextra);
     937        23383 :             truncated = _bt_truncate(wstate->index, lastleft, oitup,
     938              :                                      wstate->inskey);
     939        23383 :             if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
     940            0 :                 elog(ERROR, "failed to add high key to the index page");
     941        23383 :             pfree(truncated);
     942              : 
     943              :             /* oitup should continue to point to the page's high key */
     944        23383 :             hii = PageGetItemId(opage, P_HIKEY);
     945        23383 :             oitup = (IndexTuple) PageGetItem(opage, hii);
     946              :         }
     947              : 
     948              :         /*
     949              :          * Link the old page into its parent, using its low key.  If we don't
     950              :          * have a parent, we have to create one; this adds a new btree level.
     951              :          */
     952        23462 :         if (state->btps_next == NULL)
     953         1479 :             state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
     954              : 
     955              :         Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
     956              :                 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
     957              :                 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
     958              :                P_LEFTMOST(BTPageGetOpaque(opage)));
     959              :         Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
     960              :                !P_LEFTMOST(BTPageGetOpaque(opage)));
     961        23462 :         BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
     962        23462 :         _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
     963        23462 :         pfree(state->btps_lowkey);
     964              : 
     965              :         /*
     966              :          * Save a copy of the high key from the old page.  It is also the low
     967              :          * key for the new page.
     968              :          */
     969        23462 :         state->btps_lowkey = CopyIndexTuple(oitup);
     970              : 
     971              :         /*
     972              :          * Set the sibling links for both pages.
     973              :          */
     974              :         {
     975        23462 :             BTPageOpaque oopaque = BTPageGetOpaque(opage);
     976        23462 :             BTPageOpaque nopaque = BTPageGetOpaque(npage);
     977              : 
     978        23462 :             oopaque->btpo_next = nblkno;
     979        23462 :             nopaque->btpo_prev = oblkno;
     980        23462 :             nopaque->btpo_next = P_NONE; /* redundant */
     981              :         }
     982              : 
     983              :         /*
     984              :          * Write out the old page. _bt_blwritepage takes ownership of the
     985              :          * 'opage' buffer.
     986              :          */
     987        23462 :         _bt_blwritepage(wstate, obuf, oblkno);
     988              : 
     989              :         /*
     990              :          * Reset last_off to point to new page
     991              :          */
     992        23462 :         last_off = P_FIRSTKEY;
     993              :     }
     994              : 
     995              :     /*
     996              :      * By here, either original page is still the current page, or a new page
     997              :      * was created that became the current page.  Either way, the current page
     998              :      * definitely has space for new item.
     999              :      *
    1000              :      * If the new item is the first for its page, it must also be the first
    1001              :      * item on its entire level.  On later same-level pages, a low key for a
    1002              :      * page will be copied from the prior page in the code above.  Generate a
    1003              :      * minus infinity low key here instead.
    1004              :      */
    1005      6628698 :     if (last_off == P_HIKEY)
    1006              :     {
    1007              :         Assert(state->btps_lowkey == NULL);
    1008         7491 :         state->btps_lowkey = palloc0_object(IndexTupleData);
    1009         7491 :         state->btps_lowkey->t_info = sizeof(IndexTupleData);
    1010         7491 :         BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
    1011              :     }
    1012              : 
    1013              :     /*
    1014              :      * Add the new item into the current page.
    1015              :      */
    1016      6628698 :     last_off = OffsetNumberNext(last_off);
    1017      6628698 :     _bt_sortaddtup(npage, itupsz, itup, last_off,
    1018      6628698 :                    !isleaf && last_off == P_FIRSTKEY);
    1019              : 
    1020      6628698 :     state->btps_buf = nbuf;
    1021      6628698 :     state->btps_blkno = nblkno;
    1022      6628698 :     state->btps_lastoff = last_off;
    1023      6628698 : }
    1024              : 
    1025              : /*
    1026              :  * Finalize pending posting list tuple, and add it to the index.  Final tuple
    1027              :  * is based on saved base tuple, and saved list of heap TIDs.
    1028              :  *
    1029              :  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
    1030              :  * using _bt_buildadd().
    1031              :  */
    1032              : static void
    1033      3070081 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
    1034              :                               BTDedupState dstate)
    1035              : {
    1036              :     Assert(dstate->nitems > 0);
    1037              : 
    1038      3070081 :     if (dstate->nitems == 1)
    1039      3044406 :         _bt_buildadd(wstate, state, dstate->base, 0);
    1040              :     else
    1041              :     {
    1042              :         IndexTuple  postingtuple;
    1043              :         Size        truncextra;
    1044              : 
    1045              :         /* form a tuple with a posting list */
    1046        25675 :         postingtuple = _bt_form_posting(dstate->base,
    1047        25675 :                                         dstate->htids,
    1048              :                                         dstate->nhtids);
    1049              :         /* Calculate posting list overhead */
    1050        25675 :         truncextra = IndexTupleSize(postingtuple) -
    1051        25675 :             BTreeTupleGetPostingOffset(postingtuple);
    1052              : 
    1053        25675 :         _bt_buildadd(wstate, state, postingtuple, truncextra);
    1054        25675 :         pfree(postingtuple);
    1055              :     }
    1056              : 
    1057      3070081 :     dstate->nmaxitems = 0;
    1058      3070081 :     dstate->nhtids = 0;
    1059      3070081 :     dstate->nitems = 0;
    1060      3070081 :     dstate->phystupsize = 0;
    1061      3070081 : }
    1062              : 
    1063              : /*
    1064              :  * Finish writing out the completed btree.
    1065              :  */
    1066              : static void
    1067        30870 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
    1068              : {
    1069              :     BTPageState *s;
    1070        30870 :     BlockNumber rootblkno = P_NONE;
    1071        30870 :     uint32      rootlevel = 0;
    1072              :     BulkWriteBuffer metabuf;
    1073              : 
    1074              :     /*
    1075              :      * Each iteration of this loop completes one more level of the tree.
    1076              :      */
    1077        38361 :     for (s = state; s != NULL; s = s->btps_next)
    1078              :     {
    1079              :         BlockNumber blkno;
    1080              :         BTPageOpaque opaque;
    1081              : 
    1082         7491 :         blkno = s->btps_blkno;
    1083         7491 :         opaque = BTPageGetOpaque((Page) s->btps_buf);
    1084              : 
    1085              :         /*
    1086              :          * We have to link the last page on this level to somewhere.
    1087              :          *
    1088              :          * If we're at the top, it's the root, so attach it to the metapage.
    1089              :          * Otherwise, add an entry for it to its parent using its low key.
    1090              :          * This may cause the last page of the parent level to split, but
    1091              :          * that's not a problem -- we haven't gotten to it yet.
    1092              :          */
    1093         7491 :         if (s->btps_next == NULL)
    1094              :         {
    1095         6012 :             opaque->btpo_flags |= BTP_ROOT;
    1096         6012 :             rootblkno = blkno;
    1097         6012 :             rootlevel = s->btps_level;
    1098              :         }
    1099              :         else
    1100              :         {
    1101              :             Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
    1102              :                     IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1103              :                     BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
    1104              :                    P_LEFTMOST(opaque));
    1105              :             Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
    1106              :                    !P_LEFTMOST(opaque));
    1107         1479 :             BTreeTupleSetDownLink(s->btps_lowkey, blkno);
    1108         1479 :             _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
    1109         1479 :             pfree(s->btps_lowkey);
    1110         1479 :             s->btps_lowkey = NULL;
    1111              :         }
    1112              : 
    1113              :         /*
    1114              :          * This is the rightmost page, so the ItemId array needs to be slid
    1115              :          * back one slot.  Then we can dump out the page.
    1116              :          */
    1117         7491 :         _bt_slideleft((Page) s->btps_buf);
    1118         7491 :         _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
    1119         7491 :         s->btps_buf = NULL;      /* writepage took ownership of the buffer */
    1120              :     }
    1121              : 
    1122              :     /*
    1123              :      * As the last step in the process, construct the metapage and make it
    1124              :      * point to the new root (unless we had no data at all, in which case it's
    1125              :      * set to point to "P_NONE").  This changes the index to the "valid" state
    1126              :      * by filling in a valid magic number in the metapage.
    1127              :      */
    1128        30870 :     metabuf = smgr_bulk_get_buf(wstate->bulkstate);
    1129        30870 :     _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
    1130        30870 :                      wstate->inskey->allequalimage);
    1131        30870 :     _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
    1132        30870 : }
    1133              : 
    1134              : /*
    1135              :  * Read tuples in correct sort order from tuplesort, and load them into
    1136              :  * btree leaves.
    1137              :  */
    1138              : static void
    1139        30870 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
    1140              : {
    1141        30870 :     BTPageState *state = NULL;
    1142        30870 :     bool        merge = (btspool2 != NULL);
    1143              :     IndexTuple  itup,
    1144        30870 :                 itup2 = NULL;
    1145              :     bool        load1;
    1146        30870 :     TupleDesc   tupdes = RelationGetDescr(wstate->index);
    1147              :     int         i,
    1148        30870 :                 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
    1149              :     SortSupport sortKeys;
    1150        30870 :     int64       tuples_done = 0;
    1151              :     bool        deduplicate;
    1152              : 
    1153        30870 :     wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
    1154              : 
    1155        36884 :     deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
    1156         6014 :         BTGetDeduplicateItems(wstate->index);
    1157              : 
    1158        30870 :     if (merge)
    1159              :     {
    1160              :         /*
    1161              :          * Another BTSpool for dead tuples exists. Now we have to merge
    1162              :          * btspool and btspool2.
    1163              :          */
    1164              : 
    1165              :         /* the preparation of merge */
    1166           10 :         itup = tuplesort_getindextuple(btspool->sortstate, true);
    1167           10 :         itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1168              : 
    1169              :         /* Prepare SortSupport data for each column */
    1170           10 :         sortKeys = palloc0_array(SortSupportData, keysz);
    1171              : 
    1172           21 :         for (i = 0; i < keysz; i++)
    1173              :         {
    1174           11 :             SortSupport sortKey = sortKeys + i;
    1175           11 :             ScanKey     scanKey = wstate->inskey->scankeys + i;
    1176              :             bool        reverse;
    1177              : 
    1178           11 :             sortKey->ssup_cxt = CurrentMemoryContext;
    1179           11 :             sortKey->ssup_collation = scanKey->sk_collation;
    1180           11 :             sortKey->ssup_nulls_first =
    1181           11 :                 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
    1182           11 :             sortKey->ssup_attno = scanKey->sk_attno;
    1183              :             /* Abbreviation is not supported here */
    1184           11 :             sortKey->abbreviate = false;
    1185              : 
    1186              :             Assert(sortKey->ssup_attno != 0);
    1187              : 
    1188           11 :             reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
    1189              : 
    1190           11 :             PrepareSortSupportFromIndexRel(wstate->index, reverse, sortKey);
    1191              :         }
    1192              : 
    1193              :         for (;;)
    1194              :         {
    1195         1768 :             load1 = true;       /* load BTSpool next ? */
    1196         1768 :             if (itup2 == NULL)
    1197              :             {
    1198           79 :                 if (itup == NULL)
    1199           10 :                     break;
    1200              :             }
    1201         1689 :             else if (itup != NULL)
    1202              :             {
    1203         1594 :                 int32       compare = 0;
    1204              : 
    1205         1717 :                 for (i = 1; i <= keysz; i++)
    1206              :                 {
    1207              :                     SortSupport entry;
    1208              :                     Datum       attrDatum1,
    1209              :                                 attrDatum2;
    1210              :                     bool        isNull1,
    1211              :                                 isNull2;
    1212              : 
    1213         1620 :                     entry = sortKeys + i - 1;
    1214         1620 :                     attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
    1215         1620 :                     attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
    1216              : 
    1217         1620 :                     compare = ApplySortComparator(attrDatum1, isNull1,
    1218              :                                                   attrDatum2, isNull2,
    1219              :                                                   entry);
    1220         1620 :                     if (compare > 0)
    1221              :                     {
    1222          139 :                         load1 = false;
    1223         1497 :                         break;
    1224              :                     }
    1225         1481 :                     else if (compare < 0)
    1226         1358 :                         break;
    1227              :                 }
    1228              : 
    1229              :                 /*
    1230              :                  * If key values are equal, we sort on ItemPointer.  This is
    1231              :                  * required for btree indexes, since heap TID is treated as an
    1232              :                  * implicit last key attribute in order to ensure that all
    1233              :                  * keys in the index are physically unique.
    1234              :                  */
    1235         1594 :                 if (compare == 0)
    1236              :                 {
    1237           97 :                     compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
    1238              :                     Assert(compare != 0);
    1239           97 :                     if (compare > 0)
    1240           16 :                         load1 = false;
    1241              :                 }
    1242              :             }
    1243              :             else
    1244           95 :                 load1 = false;
    1245              : 
    1246              :             /* When we see first tuple, create first index page */
    1247         1758 :             if (state == NULL)
    1248           10 :                 state = _bt_pagestate(wstate, 0);
    1249              : 
    1250         1758 :             if (load1)
    1251              :             {
    1252         1508 :                 _bt_buildadd(wstate, state, itup, 0);
    1253         1508 :                 itup = tuplesort_getindextuple(btspool->sortstate, true);
    1254              :             }
    1255              :             else
    1256              :             {
    1257          250 :                 _bt_buildadd(wstate, state, itup2, 0);
    1258          250 :                 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1259              :             }
    1260              : 
    1261              :             /* Report progress */
    1262         1758 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1263              :                                          ++tuples_done);
    1264              :         }
    1265           10 :         pfree(sortKeys);
    1266              :     }
    1267        30860 :     else if (deduplicate)
    1268              :     {
    1269              :         /* merge is unnecessary, deduplicate into posting lists */
    1270              :         BTDedupState dstate;
    1271              : 
    1272         6014 :         dstate = palloc_object(BTDedupStateData);
    1273         6014 :         dstate->deduplicate = true; /* unused */
    1274         6014 :         dstate->nmaxitems = 0;   /* unused */
    1275         6014 :         dstate->maxpostingsize = 0; /* set later */
    1276              :         /* Metadata about base tuple of current pending posting list */
    1277         6014 :         dstate->base = NULL;
    1278         6014 :         dstate->baseoff = InvalidOffsetNumber;   /* unused */
    1279         6014 :         dstate->basetupsize = 0;
    1280              :         /* Metadata about current pending posting list TIDs */
    1281         6014 :         dstate->htids = NULL;
    1282         6014 :         dstate->nhtids = 0;
    1283         6014 :         dstate->nitems = 0;
    1284         6014 :         dstate->phystupsize = 0; /* unused */
    1285         6014 :         dstate->nintervals = 0; /* unused */
    1286              : 
    1287      4078815 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1288      4078815 :                                                true)) != NULL)
    1289              :         {
    1290              :             /* When we see first tuple, create first index page */
    1291      4072801 :             if (state == NULL)
    1292              :             {
    1293         1720 :                 state = _bt_pagestate(wstate, 0);
    1294              : 
    1295              :                 /*
    1296              :                  * Limit size of posting list tuples to 1/10 space we want to
    1297              :                  * leave behind on the page, plus space for final item's line
    1298              :                  * pointer.  This is equal to the space that we'd like to
    1299              :                  * leave behind on each leaf page when fillfactor is 90,
    1300              :                  * allowing us to get close to fillfactor% space utilization
    1301              :                  * when there happen to be a great many duplicates.  (This
    1302              :                  * makes higher leaf fillfactor settings ineffective when
    1303              :                  * building indexes that have many duplicates, but packing
    1304              :                  * leaf pages full with few very large tuples doesn't seem
    1305              :                  * like a useful goal.)
    1306              :                  */
    1307         1720 :                 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
    1308              :                     sizeof(ItemIdData);
    1309              :                 Assert(dstate->maxpostingsize <= BTMaxItemSize &&
    1310              :                        dstate->maxpostingsize <= INDEX_SIZE_MASK);
    1311         1720 :                 dstate->htids = palloc(dstate->maxpostingsize);
    1312              : 
    1313              :                 /* start new pending posting list with itup copy */
    1314         1720 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1315              :                                         InvalidOffsetNumber);
    1316              :             }
    1317      4071081 :             else if (_bt_keep_natts_fast(wstate->index, dstate->base,
    1318      1008727 :                                          itup) > keysz &&
    1319      1008727 :                      _bt_dedup_save_htid(dstate, itup))
    1320              :             {
    1321              :                 /*
    1322              :                  * Tuple is equal to base tuple of pending posting list.  Heap
    1323              :                  * TID from itup has been saved in state.
    1324              :                  */
    1325              :             }
    1326              :             else
    1327              :             {
    1328              :                 /*
    1329              :                  * Tuple is not equal to pending posting list tuple, or
    1330              :                  * _bt_dedup_save_htid() opted to not merge current item into
    1331              :                  * pending posting list.
    1332              :                  */
    1333      3068361 :                 _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1334      3068361 :                 pfree(dstate->base);
    1335              : 
    1336              :                 /* start new pending posting list with itup copy */
    1337      3068361 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1338              :                                         InvalidOffsetNumber);
    1339              :             }
    1340              : 
    1341              :             /* Report progress */
    1342      4072801 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1343              :                                          ++tuples_done);
    1344              :         }
    1345              : 
    1346         6014 :         if (state)
    1347              :         {
    1348              :             /*
    1349              :              * Handle the last item (there must be a last item when the
    1350              :              * tuplesort returned one or more tuples)
    1351              :              */
    1352         1720 :             _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1353         1720 :             pfree(dstate->base);
    1354         1720 :             pfree(dstate->htids);
    1355              :         }
    1356              : 
    1357         6014 :         pfree(dstate);
    1358              :     }
    1359              :     else
    1360              :     {
    1361              :         /* merging and deduplication are both unnecessary */
    1362      3556764 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1363      3556764 :                                                true)) != NULL)
    1364              :         {
    1365              :             /* When we see first tuple, create first index page */
    1366      3531918 :             if (state == NULL)
    1367         4282 :                 state = _bt_pagestate(wstate, 0);
    1368              : 
    1369      3531918 :             _bt_buildadd(wstate, state, itup, 0);
    1370              : 
    1371              :             /* Report progress */
    1372      3531918 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1373              :                                          ++tuples_done);
    1374              :         }
    1375              :     }
    1376              : 
    1377              :     /* Close down final pages and write the metapage */
    1378        30870 :     _bt_uppershutdown(wstate, state);
    1379        30870 :     smgr_bulk_finish(wstate->bulkstate);
    1380        30870 : }
    1381              : 
    1382              : /*
    1383              :  * Create parallel context, and launch workers for leader.
    1384              :  *
    1385              :  * buildstate argument should be initialized (with the exception of the
    1386              :  * tuplesort state in spools, which may later be created based on shared
    1387              :  * state initially set up here).
    1388              :  *
    1389              :  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
    1390              :  *
    1391              :  * request is the target number of parallel worker processes to launch.
    1392              :  *
    1393              :  * Sets buildstate's BTLeader, which caller must use to shut down parallel
    1394              :  * mode by passing it to _bt_end_parallel() at the very end of its index
    1395              :  * build.  If not even a single worker process can be launched, this is
    1396              :  * never set, and caller should proceed with a serial index build.
    1397              :  */
    1398              : static void
    1399          108 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
    1400              : {
    1401              :     ParallelContext *pcxt;
    1402              :     int         scantuplesortstates;
    1403              :     Snapshot    snapshot;
    1404              :     Size        estbtshared;
    1405              :     Size        estsort;
    1406              :     BTShared   *btshared;
    1407              :     Sharedsort *sharedsort;
    1408              :     Sharedsort *sharedsort2;
    1409          108 :     BTSpool    *btspool = buildstate->spool;
    1410          108 :     BTLeader   *btleader = palloc0_object(BTLeader);
    1411              :     WalUsage   *walusage;
    1412              :     BufferUsage *bufferusage;
    1413          108 :     bool        leaderparticipates = true;
    1414              :     int         querylen;
    1415              : 
    1416              : #ifdef DISABLE_LEADER_PARTICIPATION
    1417              :     leaderparticipates = false;
    1418              : #endif
    1419              : 
    1420              :     /*
    1421              :      * Enter parallel mode, and create context for parallel build of btree
    1422              :      * index
    1423              :      */
    1424          108 :     EnterParallelMode();
    1425              :     Assert(request > 0);
    1426          108 :     pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
    1427              :                                  request);
    1428              : 
    1429          108 :     scantuplesortstates = leaderparticipates ? request + 1 : request;
    1430              : 
    1431              :     /*
    1432              :      * Prepare for scan of the base relation.  In a normal index build, we use
    1433              :      * SnapshotAny because we must retrieve all tuples and do our own time
    1434              :      * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
    1435              :      * concurrent build, we take a regular MVCC snapshot and index whatever's
    1436              :      * live according to that.
    1437              :      */
    1438          108 :     if (!isconcurrent)
    1439          108 :         snapshot = SnapshotAny;
    1440              :     else
    1441            0 :         snapshot = RegisterSnapshot(GetTransactionSnapshot());
    1442              : 
    1443              :     /*
    1444              :      * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
    1445              :      * PARALLEL_KEY_TUPLESORT tuplesort workspace
    1446              :      */
    1447          108 :     estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
    1448          108 :     shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    1449          108 :     estsort = tuplesort_estimate_shared(scantuplesortstates);
    1450          108 :     shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1451              : 
    1452              :     /*
    1453              :      * Unique case requires a second spool, and so we may have to account for
    1454              :      * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
    1455              :      */
    1456          108 :     if (!btspool->isunique)
    1457           60 :         shm_toc_estimate_keys(&pcxt->estimator, 2);
    1458              :     else
    1459              :     {
    1460           48 :         shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1461           48 :         shm_toc_estimate_keys(&pcxt->estimator, 3);
    1462              :     }
    1463              : 
    1464              :     /*
    1465              :      * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
    1466              :      * and PARALLEL_KEY_BUFFER_USAGE.
    1467              :      *
    1468              :      * If there are no extensions loaded that care, we could skip this.  We
    1469              :      * have no way of knowing whether anyone's looking at pgWalUsage or
    1470              :      * pgBufferUsage, so do it unconditionally.
    1471              :      */
    1472          108 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1473              :                            mul_size(sizeof(WalUsage), pcxt->nworkers));
    1474          108 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1475          108 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1476              :                            mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1477          108 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1478              : 
    1479              :     /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    1480          108 :     if (debug_query_string)
    1481              :     {
    1482          108 :         querylen = strlen(debug_query_string);
    1483          108 :         shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
    1484          108 :         shm_toc_estimate_keys(&pcxt->estimator, 1);
    1485              :     }
    1486              :     else
    1487            0 :         querylen = 0;           /* keep compiler quiet */
    1488              : 
    1489              :     /* Everyone's had a chance to ask for space, so now create the DSM */
    1490          108 :     InitializeParallelDSM(pcxt);
    1491              : 
    1492              :     /* If no DSM segment was available, back out (do serial build) */
    1493          108 :     if (pcxt->seg == NULL)
    1494              :     {
    1495            0 :         if (IsMVCCSnapshot(snapshot))
    1496            0 :             UnregisterSnapshot(snapshot);
    1497            0 :         DestroyParallelContext(pcxt);
    1498            0 :         ExitParallelMode();
    1499            0 :         return;
    1500              :     }
    1501              : 
    1502              :     /* Store shared build state, for which we reserved space */
    1503          108 :     btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    1504              :     /* Initialize immutable state */
    1505          108 :     btshared->heaprelid = RelationGetRelid(btspool->heap);
    1506          108 :     btshared->indexrelid = RelationGetRelid(btspool->index);
    1507          108 :     btshared->isunique = btspool->isunique;
    1508          108 :     btshared->nulls_not_distinct = btspool->nulls_not_distinct;
    1509          108 :     btshared->isconcurrent = isconcurrent;
    1510          108 :     btshared->scantuplesortstates = scantuplesortstates;
    1511          108 :     btshared->queryid = pgstat_get_my_query_id();
    1512          108 :     ConditionVariableInit(&btshared->workersdonecv);
    1513          108 :     SpinLockInit(&btshared->mutex);
    1514              :     /* Initialize mutable state */
    1515          108 :     btshared->nparticipantsdone = 0;
    1516          108 :     btshared->reltuples = 0.0;
    1517          108 :     btshared->havedead = false;
    1518          108 :     btshared->indtuples = 0.0;
    1519          108 :     btshared->brokenhotchain = false;
    1520          108 :     table_parallelscan_initialize(btspool->heap,
    1521              :                                   ParallelTableScanFromBTShared(btshared),
    1522              :                                   snapshot);
    1523              : 
    1524              :     /*
    1525              :      * Store shared tuplesort-private state, for which we reserved space.
    1526              :      * Then, initialize opaque state using tuplesort routine.
    1527              :      */
    1528          108 :     sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1529          108 :     tuplesort_initialize_shared(sharedsort, scantuplesortstates,
    1530              :                                 pcxt->seg);
    1531              : 
    1532          108 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    1533          108 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
    1534              : 
    1535              :     /* Unique case requires a second spool, and associated shared state */
    1536          108 :     if (!btspool->isunique)
    1537           60 :         sharedsort2 = NULL;
    1538              :     else
    1539              :     {
    1540              :         /*
    1541              :          * Store additional shared tuplesort-private state, for which we
    1542              :          * reserved space.  Then, initialize opaque state using tuplesort
    1543              :          * routine.
    1544              :          */
    1545           48 :         sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1546           48 :         tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
    1547              :                                     pcxt->seg);
    1548              : 
    1549           48 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    1550              :     }
    1551              : 
    1552              :     /* Store query string for workers */
    1553          108 :     if (debug_query_string)
    1554              :     {
    1555              :         char       *sharedquery;
    1556              : 
    1557          108 :         sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
    1558          108 :         memcpy(sharedquery, debug_query_string, querylen + 1);
    1559          108 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    1560              :     }
    1561              : 
    1562              :     /*
    1563              :      * Allocate space for each worker's WalUsage and BufferUsage; no need to
    1564              :      * initialize.
    1565              :      */
    1566          108 :     walusage = shm_toc_allocate(pcxt->toc,
    1567          108 :                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    1568          108 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    1569          108 :     bufferusage = shm_toc_allocate(pcxt->toc,
    1570          108 :                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1571          108 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
    1572              : 
    1573              :     /* Launch workers, saving status for leader/caller */
    1574          108 :     LaunchParallelWorkers(pcxt);
    1575          108 :     btleader->pcxt = pcxt;
    1576          108 :     btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    1577          108 :     if (leaderparticipates)
    1578          108 :         btleader->nparticipanttuplesorts++;
    1579          108 :     btleader->btshared = btshared;
    1580          108 :     btleader->sharedsort = sharedsort;
    1581          108 :     btleader->sharedsort2 = sharedsort2;
    1582          108 :     btleader->snapshot = snapshot;
    1583          108 :     btleader->walusage = walusage;
    1584          108 :     btleader->bufferusage = bufferusage;
    1585              : 
    1586              :     /* If no workers were successfully launched, back out (do serial build) */
    1587          108 :     if (pcxt->nworkers_launched == 0)
    1588              :     {
    1589            0 :         _bt_end_parallel(btleader);
    1590            0 :         return;
    1591              :     }
    1592              : 
    1593              :     /* Save leader state now that it's clear build will be parallel */
    1594          108 :     buildstate->btleader = btleader;
    1595              : 
    1596              :     /* Join heap scan ourselves */
    1597          108 :     if (leaderparticipates)
    1598          108 :         _bt_leader_participate_as_worker(buildstate);
    1599              : 
    1600              :     /*
    1601              :      * Caller needs to wait for all launched workers when we return.  Make
    1602              :      * sure that the failure-to-start case will not hang forever.
    1603              :      */
    1604          108 :     WaitForParallelWorkersToAttach(pcxt);
    1605              : }
    1606              : 
    1607              : /*
    1608              :  * Shut down workers, destroy parallel context, and end parallel mode.
    1609              :  */
    1610              : static void
    1611          108 : _bt_end_parallel(BTLeader *btleader)
    1612              : {
    1613              :     int         i;
    1614              : 
    1615              :     /* Shutdown worker processes */
    1616          108 :     WaitForParallelWorkersToFinish(btleader->pcxt);
    1617              : 
    1618              :     /*
    1619              :      * Next, accumulate WAL usage.  (This must wait for the workers to finish,
    1620              :      * or we might get incomplete data.)
    1621              :      */
    1622          220 :     for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
    1623          112 :         InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
    1624              : 
    1625              :     /* Free last reference to MVCC snapshot, if one was used */
    1626          108 :     if (IsMVCCSnapshot(btleader->snapshot))
    1627            0 :         UnregisterSnapshot(btleader->snapshot);
    1628          108 :     DestroyParallelContext(btleader->pcxt);
    1629          108 :     ExitParallelMode();
    1630          108 : }
    1631              : 
    1632              : /*
    1633              :  * Returns size of shared memory required to store state for a parallel
    1634              :  * btree index build based on the snapshot its parallel scan will use.
    1635              :  */
    1636              : static Size
    1637          108 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    1638              : {
    1639              :     /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    1640          108 :     return add_size(BUFFERALIGN(sizeof(BTShared)),
    1641              :                     table_parallelscan_estimate(heap, snapshot));
    1642              : }
    1643              : 
    1644              : /*
    1645              :  * Within leader, wait for end of heap scan.
    1646              :  *
    1647              :  * When called, parallel heap scan started by _bt_begin_parallel() will
    1648              :  * already be underway within worker processes (when leader participates
    1649              :  * as a worker, we should end up here just as workers are finishing).
    1650              :  *
    1651              :  * Fills in fields needed for ambuild statistics, and lets caller set
    1652              :  * field indicating that some worker encountered a broken HOT chain.
    1653              :  *
    1654              :  * Returns the total number of heap tuples scanned.
    1655              :  */
    1656              : static double
    1657          108 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
    1658              : {
    1659          108 :     BTShared   *btshared = buildstate->btleader->btshared;
    1660              :     int         nparticipanttuplesorts;
    1661              :     double      reltuples;
    1662              : 
    1663          108 :     nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    1664              :     for (;;)
    1665              :     {
    1666          276 :         SpinLockAcquire(&btshared->mutex);
    1667          276 :         if (btshared->nparticipantsdone == nparticipanttuplesorts)
    1668              :         {
    1669          108 :             buildstate->havedead = btshared->havedead;
    1670          108 :             buildstate->indtuples = btshared->indtuples;
    1671          108 :             *brokenhotchain = btshared->brokenhotchain;
    1672          108 :             reltuples = btshared->reltuples;
    1673          108 :             SpinLockRelease(&btshared->mutex);
    1674          108 :             break;
    1675              :         }
    1676          168 :         SpinLockRelease(&btshared->mutex);
    1677              : 
    1678          168 :         ConditionVariableSleep(&btshared->workersdonecv,
    1679              :                                WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    1680              :     }
    1681              : 
    1682          108 :     ConditionVariableCancelSleep();
    1683              : 
    1684          108 :     return reltuples;
    1685              : }
    1686              : 
    1687              : /*
    1688              :  * Within leader, participate as a parallel worker.
    1689              :  */
    1690              : static void
    1691          108 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
    1692              : {
    1693          108 :     BTLeader   *btleader = buildstate->btleader;
    1694              :     BTSpool    *leaderworker;
    1695              :     BTSpool    *leaderworker2;
    1696              :     int         sortmem;
    1697              : 
    1698              :     /* Allocate memory and initialize private spool */
    1699          108 :     leaderworker = palloc0_object(BTSpool);
    1700          108 :     leaderworker->heap = buildstate->spool->heap;
    1701          108 :     leaderworker->index = buildstate->spool->index;
    1702          108 :     leaderworker->isunique = buildstate->spool->isunique;
    1703          108 :     leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
    1704              : 
    1705              :     /* Initialize second spool, if required */
    1706          108 :     if (!btleader->btshared->isunique)
    1707           60 :         leaderworker2 = NULL;
    1708              :     else
    1709              :     {
    1710              :         /* Allocate memory for worker's own private secondary spool */
    1711           48 :         leaderworker2 = palloc0_object(BTSpool);
    1712              : 
    1713              :         /* Initialize worker's own secondary spool */
    1714           48 :         leaderworker2->heap = leaderworker->heap;
    1715           48 :         leaderworker2->index = leaderworker->index;
    1716           48 :         leaderworker2->isunique = false;
    1717              :     }
    1718              : 
    1719              :     /*
    1720              :      * Might as well use reliable figure when doling out maintenance_work_mem
    1721              :      * (when requested number of workers were not launched, this will be
    1722              :      * somewhat higher than it is for other workers).
    1723              :      */
    1724          108 :     sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
    1725              : 
    1726              :     /* Perform work common to all participants */
    1727          108 :     _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
    1728              :                                btleader->sharedsort, btleader->sharedsort2,
    1729              :                                sortmem, true);
    1730              : 
    1731              : #ifdef BTREE_BUILD_STATS
    1732              :     if (log_btree_build_stats)
    1733              :     {
    1734              :         ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
    1735              :         ResetUsage();
    1736              :     }
    1737              : #endif                          /* BTREE_BUILD_STATS */
    1738          108 : }
    1739              : 
/*
 * Perform work within a launched parallel process.
 *
 * This is the per-worker entry point for a parallel btree build.  The
 * worker attaches to the shared state placed in the DSM segment by the
 * leader, opens the heap and index relations, joins the parallel heap
 * scan and sort via _bt_parallel_scan_and_sort(), and finally reports
 * its WAL/buffer usage back through shared memory before exiting.
 *
 * NB: the sequence of shm_toc lookups and tuplesort_attach_shared calls
 * below mirrors the shm_toc_insert/initialize calls made by the leader
 * in _bt_begin_parallel().
 */
void
_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *sharedquery;
	BTSpool    *btspool;
	BTSpool    *btspool2;
	BTShared   *btshared;
	Sharedsort *sharedsort;
	Sharedsort *sharedsort2;
	Relation	heapRel;
	Relation	indexRel;
	LOCKMODE	heapLockmode;
	LOCKMODE	indexLockmode;
	WalUsage   *walusage;
	BufferUsage *bufferusage;
	int			sortmem;

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
		ResetUsage();
#endif							/* BTREE_BUILD_STATS */

	/*
	 * The only possible status flag that can be set to the parallel worker is
	 * PROC_IN_SAFE_IC.
	 */
	Assert((MyProc->statusFlags == 0) ||
		   (MyProc->statusFlags == PROC_IN_SAFE_IC));

	/*
	 * Set debug_query_string for individual workers first.  The query text
	 * is optional (the leader only stores it when debug_query_string was
	 * set), hence the missing-ok lookup.
	 */
	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
	debug_query_string = sharedquery;

	/* Report the query string from leader */
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/* Look up nbtree shared state */
	btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);

	/* Open relations using lock modes known to be obtained by index.c */
	if (!btshared->isconcurrent)
	{
		heapLockmode = ShareLock;
		indexLockmode = AccessExclusiveLock;
	}
	else
	{
		/* CREATE INDEX CONCURRENTLY case: weaker locks */
		heapLockmode = ShareUpdateExclusiveLock;
		indexLockmode = RowExclusiveLock;
	}

	/* Track query ID */
	pgstat_report_query_id(btshared->queryid, false);

	/* Open relations within worker */
	heapRel = table_open(btshared->heaprelid, heapLockmode);
	indexRel = index_open(btshared->indexrelid, indexLockmode);

	/* Initialize worker's own spool */
	btspool = palloc0_object(BTSpool);
	btspool->heap = heapRel;
	btspool->index = indexRel;
	btspool->isunique = btshared->isunique;
	btspool->nulls_not_distinct = btshared->nulls_not_distinct;

	/* Look up shared state private to tuplesort.c */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);
	if (!btshared->isunique)
	{
		btspool2 = NULL;
		sharedsort2 = NULL;
	}
	else
	{
		/* Allocate memory for worker's own private secondary spool */
		btspool2 = palloc0_object(BTSpool);

		/* Initialize worker's own secondary spool */
		btspool2->heap = btspool->heap;
		btspool2->index = btspool->index;
		btspool2->isunique = false;
		/* Look up shared state private to tuplesort.c */
		sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
		tuplesort_attach_shared(sharedsort2, seg);
	}

	/* Prepare to track buffer usage during parallel execution */
	InstrStartParallelQuery();

	/*
	 * Perform sorting of spool, and possibly a spool2.  Divide
	 * maintenance_work_mem evenly among all participants, matching the
	 * leader's calculation.
	 */
	sortmem = maintenance_work_mem / btshared->scantuplesortstates;
	_bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
							   sharedsort2, sortmem, false);

	/* Report WAL/buffer usage during parallel execution */
	bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
	walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
	InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
						  &walusage[ParallelWorkerNumber]);

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
		ResetUsage();
	}
#endif							/* BTREE_BUILD_STATS */

	index_close(indexRel, indexLockmode);
	table_close(heapRel, heapLockmode);
}
    1855              : 
    1856              : /*
    1857              :  * Perform a worker's portion of a parallel sort.
    1858              :  *
    1859              :  * This generates a tuplesort for passed btspool, and a second tuplesort
    1860              :  * state if a second btspool is need (i.e. for unique index builds).  All
    1861              :  * other spool fields should already be set when this is called.
    1862              :  *
    1863              :  * sortmem is the amount of working memory to use within each worker,
    1864              :  * expressed in KBs.
    1865              :  *
    1866              :  * When this returns, workers are done, and need only release resources.
    1867              :  */
    1868              : static void
    1869          220 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
    1870              :                            BTShared *btshared, Sharedsort *sharedsort,
    1871              :                            Sharedsort *sharedsort2, int sortmem, bool progress)
    1872              : {
    1873              :     SortCoordinate coordinate;
    1874              :     BTBuildState buildstate;
    1875              :     TableScanDesc scan;
    1876              :     double      reltuples;
    1877              :     IndexInfo  *indexInfo;
    1878              : 
    1879              :     /* Initialize local tuplesort coordination state */
    1880          220 :     coordinate = palloc0_object(SortCoordinateData);
    1881          220 :     coordinate->isWorker = true;
    1882          220 :     coordinate->nParticipants = -1;
    1883          220 :     coordinate->sharedsort = sharedsort;
    1884              : 
    1885              :     /* Begin "partial" tuplesort */
    1886          440 :     btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
    1887              :                                                      btspool->index,
    1888          220 :                                                      btspool->isunique,
    1889          220 :                                                      btspool->nulls_not_distinct,
    1890              :                                                      sortmem, coordinate,
    1891              :                                                      TUPLESORT_NONE);
    1892              : 
    1893              :     /*
    1894              :      * Just as with serial case, there may be a second spool.  If so, a
    1895              :      * second, dedicated spool2 partial tuplesort is required.
    1896              :      */
    1897          220 :     if (btspool2)
    1898              :     {
    1899              :         SortCoordinate coordinate2;
    1900              : 
    1901              :         /*
    1902              :          * We expect that the second one (for dead tuples) won't get very
    1903              :          * full, so we give it only work_mem (unless sortmem is less for
    1904              :          * worker).  Worker processes are generally permitted to allocate
    1905              :          * work_mem independently.
    1906              :          */
    1907           96 :         coordinate2 = palloc0_object(SortCoordinateData);
    1908           96 :         coordinate2->isWorker = true;
    1909           96 :         coordinate2->nParticipants = -1;
    1910           96 :         coordinate2->sharedsort = sharedsort2;
    1911           96 :         btspool2->sortstate =
    1912           96 :             tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
    1913              :                                         Min(sortmem, work_mem), coordinate2,
    1914              :                                         false);
    1915              :     }
    1916              : 
    1917              :     /* Fill in buildstate for _bt_build_callback() */
    1918          220 :     buildstate.isunique = btshared->isunique;
    1919          220 :     buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
    1920          220 :     buildstate.havedead = false;
    1921          220 :     buildstate.heap = btspool->heap;
    1922          220 :     buildstate.spool = btspool;
    1923          220 :     buildstate.spool2 = btspool2;
    1924          220 :     buildstate.indtuples = 0;
    1925          220 :     buildstate.btleader = NULL;
    1926              : 
    1927              :     /* Join parallel scan */
    1928          220 :     indexInfo = BuildIndexInfo(btspool->index);
    1929          220 :     indexInfo->ii_Concurrent = btshared->isconcurrent;
    1930          220 :     scan = table_beginscan_parallel(btspool->heap,
    1931              :                                     ParallelTableScanFromBTShared(btshared));
    1932          220 :     reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
    1933              :                                        true, progress, _bt_build_callback,
    1934              :                                        &buildstate, scan);
    1935              : 
    1936              :     /* Execute this worker's part of the sort */
    1937          220 :     if (progress)
    1938          108 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1939              :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_1);
    1940          220 :     tuplesort_performsort(btspool->sortstate);
    1941          220 :     if (btspool2)
    1942              :     {
    1943           96 :         if (progress)
    1944           48 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1945              :                                          PROGRESS_BTREE_PHASE_PERFORMSORT_2);
    1946           96 :         tuplesort_performsort(btspool2->sortstate);
    1947              :     }
    1948              : 
    1949              :     /*
    1950              :      * Done.  Record ambuild statistics, and whether we encountered a broken
    1951              :      * HOT chain.
    1952              :      */
    1953          220 :     SpinLockAcquire(&btshared->mutex);
    1954          220 :     btshared->nparticipantsdone++;
    1955          220 :     btshared->reltuples += reltuples;
    1956          220 :     if (buildstate.havedead)
    1957            0 :         btshared->havedead = true;
    1958          220 :     btshared->indtuples += buildstate.indtuples;
    1959          220 :     if (indexInfo->ii_BrokenHotChain)
    1960            0 :         btshared->brokenhotchain = true;
    1961          220 :     SpinLockRelease(&btshared->mutex);
    1962              : 
    1963              :     /* Notify leader */
    1964          220 :     ConditionVariableSignal(&btshared->workersdonecv);
    1965              : 
    1966              :     /* We can end tuplesorts immediately */
    1967          220 :     tuplesort_end(btspool->sortstate);
    1968          220 :     if (btspool2)
    1969           96 :         tuplesort_end(btspool2->sortstate);
    1970          220 : }
        

Generated by: LCOV version 2.0-1