LCOV - code coverage report
Current view: top level - src/backend/access/nbtree - nbtsort.c (source / functions)
Test:         PostgreSQL 19devel
Date:         2025-10-31 10:19:07

              Hit   Total   Coverage
Lines:        538   555      96.9 %
Functions:     22    22     100.0 %

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * nbtsort.c
       4             :  *      Build a btree from sorted input by loading leaf pages sequentially.
       5             :  *
       6             :  * NOTES
       7             :  *
       8             :  * We use tuplesort.c to sort the given index tuples into order.
       9             :  * Then we scan the index tuples in order and build the btree pages
      10             :  * for each level.  We load source tuples into leaf-level pages.
      11             :  * Whenever we fill a page at one level, we add a link to it to its
      12             :  * parent level (starting a new parent level if necessary).  When
      13             :  * done, we write out each final page on each level, adding it to
      14             :  * its parent level.  When we have only one page on a level, it must be
      15             :  * the root -- it can be attached to the btree metapage and we are done.
      16             :  *
      17             :  * It is not wise to pack the pages entirely full, since then *any*
      18             :  * insertion would cause a split (and not only of the leaf page; the need
      19             :  * for a split would cascade right up the tree).  The steady-state load
      20             :  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
      21             :  * pages to the user-controllable fill factor (default 90%) while upper pages
      22             :  * are always packed to 70%.  This gives us reasonable density (there aren't
      23             :  * many upper pages if the keys are reasonable-size) without risking a lot of
      24             :  * cascading splits during early insertions.
      25             :  *
      26             :  * We use the bulk smgr loading facility to bypass the buffer cache and
      27             :  * WAL-log the pages efficiently.
      28             :  *
      29             :  * This code isn't concerned about the FSM at all. The caller is responsible
      30             :  * for initializing that.
      31             :  *
      32             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      33             :  * Portions Copyright (c) 1994, Regents of the University of California
      34             :  *
      35             :  * IDENTIFICATION
      36             :  *    src/backend/access/nbtree/nbtsort.c
      37             :  *
      38             :  *-------------------------------------------------------------------------
      39             :  */
      40             : 
      41             : #include "postgres.h"
      42             : 
      43             : #include "access/nbtree.h"
      44             : #include "access/parallel.h"
      45             : #include "access/relscan.h"
      46             : #include "access/table.h"
      47             : #include "access/tableam.h"
      48             : #include "access/xact.h"
      49             : #include "catalog/index.h"
      50             : #include "commands/progress.h"
      51             : #include "executor/instrument.h"
      52             : #include "miscadmin.h"
      53             : #include "pgstat.h"
      54             : #include "storage/bulk_write.h"
      55             : #include "tcop/tcopprot.h"
      56             : #include "utils/rel.h"
      57             : #include "utils/sortsupport.h"
      58             : #include "utils/tuplesort.h"
      59             : 
      60             : 
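/*
 * Illustrative sketch only, not part of nbtsort.c: the fill-factor rule
 * described in the file header translates into a free-space threshold in
 * bytes.  With the default BLCKSZ of 8192, the default leaf fillfactor of
 * 90 (BTREE_DEFAULT_FILLFACTOR) works out to 819 bytes, and the fixed 70%
 * internal-page fillfactor (BTREE_NONLEAF_FILLFACTOR) to 2457 bytes;
 * compare the btps_full computation in _bt_pagestate() below.
 */
static inline Size
illustrative_full_threshold(int fillfactor)
{
    /* a page counts as "full" once less than this much free space remains */
    return BLCKSZ * (100 - fillfactor) / 100;
}
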
      61             : /* Magic numbers for parallel state sharing */
      62             : #define PARALLEL_KEY_BTREE_SHARED       UINT64CONST(0xA000000000000001)
      63             : #define PARALLEL_KEY_TUPLESORT          UINT64CONST(0xA000000000000002)
      64             : #define PARALLEL_KEY_TUPLESORT_SPOOL2   UINT64CONST(0xA000000000000003)
      65             : #define PARALLEL_KEY_QUERY_TEXT         UINT64CONST(0xA000000000000004)
      66             : #define PARALLEL_KEY_WAL_USAGE          UINT64CONST(0xA000000000000005)
      67             : #define PARALLEL_KEY_BUFFER_USAGE       UINT64CONST(0xA000000000000006)
      68             : 
      69             : /*
      70             :  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
      71             :  * parallel index builds.  This may be useful as a debugging aid.
      72             : #undef DISABLE_LEADER_PARTICIPATION
      73             :  */
      74             : 
      75             : /*
      76             :  * Status record for spooling/sorting phase.  (Note we may have two of
      77             :  * these due to the special requirements for uniqueness-checking with
      78             :  * dead tuples.)
      79             :  */
      80             : typedef struct BTSpool
      81             : {
      82             :     Tuplesortstate *sortstate;  /* state data for tuplesort.c */
      83             :     Relation    heap;
      84             :     Relation    index;
      85             :     bool        isunique;
      86             :     bool        nulls_not_distinct;
      87             : } BTSpool;
      88             : 
      89             : /*
      90             :  * Status for index builds performed in parallel.  This is allocated in a
      91             :  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
      92             :  * entry, private to tuplesort.c but allocated by this module on its behalf.
      93             :  */
      94             : typedef struct BTShared
      95             : {
      96             :     /*
      97             :      * These fields are not modified during the sort.  They primarily exist
      98             :      * for the benefit of worker processes that need to create BTSpool state
      99             :      * corresponding to that used by the leader.
     100             :      */
     101             :     Oid         heaprelid;
     102             :     Oid         indexrelid;
     103             :     bool        isunique;
     104             :     bool        nulls_not_distinct;
     105             :     bool        isconcurrent;
     106             :     int         scantuplesortstates;
     107             : 
     108             :     /* Query ID, for report in worker processes */
     109             :     int64       queryid;
     110             : 
     111             :     /*
     112             :      * workersdonecv is used to monitor the progress of workers.  All parallel
     113             :      * participants must indicate that they are done before leader can use
     114             :      * mutable state that workers maintain during scan (and before leader can
     115             :      * proceed to tuplesort_performsort()).
     116             :      */
     117             :     ConditionVariable workersdonecv;
     118             : 
     119             :     /*
     120             :      * mutex protects all fields before heapdesc.
     121             :      *
     122             :      * These fields contain status information of interest to B-Tree index
     123             :      * builds that must work just the same when an index is built in parallel.
     124             :      */
     125             :     slock_t     mutex;
     126             : 
     127             :     /*
     128             :      * Mutable state that is maintained by workers, and reported back to
     129             :      * leader at end of parallel scan.
     130             :      *
     131             :      * nparticipantsdone is number of worker processes finished.
     132             :      *
     133             :      * reltuples is the total number of input heap tuples.
     134             :      *
     135             :      * havedead indicates if RECENTLY_DEAD tuples were encountered during
     136             :      * build.
     137             :      *
     138             :      * indtuples is the total number of tuples that made it into the index.
     139             :      *
     140             :      * brokenhotchain indicates if any worker detected a broken HOT chain
     141             :      * during build.
     142             :      */
     143             :     int         nparticipantsdone;
     144             :     double      reltuples;
     145             :     bool        havedead;
     146             :     double      indtuples;
     147             :     bool        brokenhotchain;
     148             : 
     149             :     /*
     150             :      * ParallelTableScanDescData data follows. Can't directly embed here, as
     151             :      * implementations of the parallel table scan desc interface might need
     152             :      * stronger alignment.
     153             :      */
     154             : } BTShared;
     155             : 
     156             : /*
     157             :  * Return pointer to a BTShared's parallel table scan.
     158             :  *
     159             :  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
     160             :  * MAXALIGN.
     161             :  */
     162             : #define ParallelTableScanFromBTShared(shared) \
     163             :     (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
     164             : 
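/*
 * Illustrative sketch only, not part of nbtsort.c as listed here: roughly
 * how a parallel participant locates the shared BTShared state and the
 * parallel table scan descriptor embedded after it.  The real code lives
 * in the worker and leader-participation paths further down in this file;
 * 'toc' and 'heapRel' stand in for the participant's shm_toc and its
 * opened heap relation.
 */
static inline TableScanDesc
illustrative_attach_parallel_scan(shm_toc *toc, Relation heapRel)
{
    BTShared   *btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);

    return table_beginscan_parallel(heapRel,
                                    ParallelTableScanFromBTShared(btshared));
}
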
     165             : /*
     166             :  * Status for leader in parallel index build.
     167             :  */
     168             : typedef struct BTLeader
     169             : {
     170             :     /* parallel context itself */
     171             :     ParallelContext *pcxt;
     172             : 
     173             :     /*
     174             :      * nparticipanttuplesorts is the exact number of worker processes
     175             :      * successfully launched, plus one leader process if it participates as a
     176             :      * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
     177             :      * participating as a worker).
     178             :      */
     179             :     int         nparticipanttuplesorts;
     180             : 
     181             :     /*
     182             :      * Leader process convenience pointers to shared state (leader avoids TOC
     183             :      * lookups).
     184             :      *
     185             :      * btshared is the shared state for entire build.  sharedsort is the
     186             :      * shared, tuplesort-managed state passed to each process tuplesort.
     187             :      * sharedsort2 is the corresponding btspool2 shared state, used only when
     188             :      * building unique indexes.  snapshot is the snapshot used by the scan iff
     189             :      * an MVCC snapshot is required.
     190             :      */
     191             :     BTShared   *btshared;
     192             :     Sharedsort *sharedsort;
     193             :     Sharedsort *sharedsort2;
     194             :     Snapshot    snapshot;
     195             :     WalUsage   *walusage;
     196             :     BufferUsage *bufferusage;
     197             : } BTLeader;
     198             : 
     199             : /*
     200             :  * Working state for btbuild and its callback.
     201             :  *
     202             :  * When parallel CREATE INDEX is used, there is a BTBuildState for each
     203             :  * participant.
     204             :  */
     205             : typedef struct BTBuildState
     206             : {
     207             :     bool        isunique;
     208             :     bool        nulls_not_distinct;
     209             :     bool        havedead;
     210             :     Relation    heap;
     211             :     BTSpool    *spool;
     212             : 
     213             :     /*
     214             :      * spool2 is needed only when the index is a unique index. Dead tuples are
     215             :      * put into spool2 instead of spool in order to avoid uniqueness check.
     216             :      */
     217             :     BTSpool    *spool2;
     218             :     double      indtuples;
     219             : 
     220             :     /*
     221             :      * btleader is only present when a parallel index build is performed, and
     222             :      * only in the leader process. (Actually, only the leader has a
     223             :      * BTBuildState.  Workers have their own spool and spool2, though.)
     224             :      */
     225             :     BTLeader   *btleader;
     226             : } BTBuildState;
     227             : 
     228             : /*
     229             :  * Status record for a btree page being built.  We have one of these
     230             :  * for each active tree level.
     231             :  */
     232             : typedef struct BTPageState
     233             : {
     234             :     BulkWriteBuffer btps_buf;   /* workspace for page building */
     235             :     BlockNumber btps_blkno;     /* block # to write this page at */
     236             :     IndexTuple  btps_lowkey;    /* page's strict lower bound pivot tuple */
     237             :     OffsetNumber btps_lastoff;  /* last item offset loaded */
     238             :     Size        btps_lastextra; /* last item's extra posting list space */
     239             :     uint32      btps_level;     /* tree level (0 = leaf) */
     240             :     Size        btps_full;      /* "full" if less than this much free space */
     241             :     struct BTPageState *btps_next;  /* link to parent level, if any */
     242             : } BTPageState;
     243             : 
     244             : /*
     245             :  * Overall status record for index writing phase.
     246             :  */
     247             : typedef struct BTWriteState
     248             : {
     249             :     Relation    heap;
     250             :     Relation    index;
     251             :     BulkWriteState *bulkstate;
     252             :     BTScanInsert inskey;        /* generic insertion scankey */
     253             :     BlockNumber btws_pages_alloced; /* # pages allocated */
     254             : } BTWriteState;
     255             : 
     256             : 
     257             : static double _bt_spools_heapscan(Relation heap, Relation index,
     258             :                                   BTBuildState *buildstate, IndexInfo *indexInfo);
     259             : static void _bt_spooldestroy(BTSpool *btspool);
     260             : static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
     261             :                       const Datum *values, const bool *isnull);
     262             : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
     263             : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
     264             :                                bool *isnull, bool tupleIsAlive, void *state);
     265             : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
     266             : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
     267             : static void _bt_slideleft(Page rightmostpage);
     268             : static void _bt_sortaddtup(Page page, Size itemsize,
     269             :                            const IndexTupleData *itup, OffsetNumber itup_off,
     270             :                            bool newfirstdataitem);
     271             : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
     272             :                          IndexTuple itup, Size truncextra);
     273             : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
     274             :                                           BTPageState *state,
     275             :                                           BTDedupState dstate);
     276             : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
     277             : static void _bt_load(BTWriteState *wstate,
     278             :                      BTSpool *btspool, BTSpool *btspool2);
     279             : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
     280             :                                int request);
     281             : static void _bt_end_parallel(BTLeader *btleader);
     282             : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
     283             : static double _bt_parallel_heapscan(BTBuildState *buildstate,
     284             :                                     bool *brokenhotchain);
     285             : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
     286             : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
     287             :                                        BTShared *btshared, Sharedsort *sharedsort,
     288             :                                        Sharedsort *sharedsort2, int sortmem,
     289             :                                        bool progress);
     290             : 
     291             : 
     292             : /*
     293             :  *  btbuild() -- build a new btree index.
     294             :  */
     295             : IndexBuildResult *
     296       50732 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
     297             : {
     298             :     IndexBuildResult *result;
     299             :     BTBuildState buildstate;
     300             :     double      reltuples;
     301             : 
     302             : #ifdef BTREE_BUILD_STATS
     303             :     if (log_btree_build_stats)
     304             :         ResetUsage();
     305             : #endif                          /* BTREE_BUILD_STATS */
     306             : 
     307       50732 :     buildstate.isunique = indexInfo->ii_Unique;
     308       50732 :     buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     309       50732 :     buildstate.havedead = false;
     310       50732 :     buildstate.heap = heap;
     311       50732 :     buildstate.spool = NULL;
     312       50732 :     buildstate.spool2 = NULL;
     313       50732 :     buildstate.indtuples = 0;
     314       50732 :     buildstate.btleader = NULL;
     315             : 
     316             :     /*
     317             :      * We expect to be called exactly once for any index relation. If that's
     318             :      * not the case, big trouble's what we have.
     319             :      */
     320       50732 :     if (RelationGetNumberOfBlocks(index) != 0)
     321           0 :         elog(ERROR, "index \"%s\" already contains data",
     322             :              RelationGetRelationName(index));
     323             : 
     324       50732 :     reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
     325             : 
     326             :     /*
     327             :      * Finish the build by (1) completing the sort of the spool file, (2)
     328             :      * inserting the sorted tuples into btree pages and (3) building the upper
     329             :      * levels.  Finally, it may also be necessary to end use of parallelism.
     330             :      */
     331       50720 :     _bt_leafbuild(buildstate.spool, buildstate.spool2);
     332       50636 :     _bt_spooldestroy(buildstate.spool);
     333       50636 :     if (buildstate.spool2)
     334          26 :         _bt_spooldestroy(buildstate.spool2);
     335       50636 :     if (buildstate.btleader)
     336         148 :         _bt_end_parallel(buildstate.btleader);
     337             : 
     338       50636 :     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
     339             : 
     340       50636 :     result->heap_tuples = reltuples;
     341       50636 :     result->index_tuples = buildstate.indtuples;
     342             : 
     343             : #ifdef BTREE_BUILD_STATS
     344             :     if (log_btree_build_stats)
     345             :     {
     346             :         ShowUsage("BTREE BUILD STATS");
     347             :         ResetUsage();
     348             :     }
     349             : #endif                          /* BTREE_BUILD_STATS */
     350             : 
     351       50636 :     return result;
     352             : }
     353             : 
     354             : /*
     355             :  * Create and initialize one or two spool structures, and save them in caller's
     356             :  * buildstate argument.  May also fill-in fields within indexInfo used by index
     357             :  * builds.
     358             :  *
     359             :  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
     360             :  * routine encapsulates all aspects of managing parallelism.  Caller need only
     361             :  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
     362             :  *
     363             :  * Returns the total number of heap tuples scanned.
     364             :  */
     365             : static double
     366       50732 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
     367             :                     IndexInfo *indexInfo)
     368             : {
     369       50732 :     BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
     370       50732 :     SortCoordinate coordinate = NULL;
     371       50732 :     double      reltuples = 0;
     372             : 
     373             :     /*
     374             :      * We size the sort area as maintenance_work_mem rather than work_mem to
     375             :      * speed index creation.  This should be OK since a single backend can't
     376             :      * run multiple index creations in parallel (see also: notes on
     377             :      * parallelism and maintenance_work_mem below).
     378             :      */
     379       50732 :     btspool->heap = heap;
     380       50732 :     btspool->index = index;
     381       50732 :     btspool->isunique = indexInfo->ii_Unique;
     382       50732 :     btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
     383             : 
     384             :     /* Save as primary spool */
     385       50732 :     buildstate->spool = btspool;
     386             : 
     387             :     /* Report table scan phase started */
     388       50732 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     389             :                                  PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
     390             : 
     391             :     /* Attempt to launch parallel worker scan when required */
     392       50732 :     if (indexInfo->ii_ParallelWorkers > 0)
     393         148 :         _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
     394             :                            indexInfo->ii_ParallelWorkers);
     395             : 
     396             :     /*
     397             :      * If parallel build requested and at least one worker process was
     398             :      * successfully launched, set up coordination state
     399             :      */
     400       50732 :     if (buildstate->btleader)
     401             :     {
     402         148 :         coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     403         148 :         coordinate->isWorker = false;
     404         148 :         coordinate->nParticipants =
     405         148 :             buildstate->btleader->nparticipanttuplesorts;
     406         148 :         coordinate->sharedsort = buildstate->btleader->sharedsort;
     407             :     }
     408             : 
     409             :     /*
     410             :      * Begin serial/leader tuplesort.
     411             :      *
     412             :      * In cases where parallelism is involved, the leader receives the same
     413             :      * share of maintenance_work_mem as a serial sort (it is generally treated
     414             :      * in the same way as a serial sort once we return).  Parallel worker
     415             :      * Tuplesortstates will have received only a fraction of
     416             :      * maintenance_work_mem, though.
     417             :      *
     418             :      * We rely on the lifetime of the Leader Tuplesortstate almost not
     419             :      * overlapping with any worker Tuplesortstate's lifetime.  There may be
     420             :      * some small overlap, but that's okay because we rely on leader
     421             :      * Tuplesortstate only allocating a small, fixed amount of memory here.
     422             :      * When its tuplesort_performsort() is called (by our caller), and
     423             :      * significant amounts of memory are likely to be used, all workers must
     424             :      * have already freed almost all memory held by their Tuplesortstates
     425             :      * (they are about to go away completely, too).  The overall effect is
     426             :      * that maintenance_work_mem always represents an absolute high watermark
     427             :      * on the amount of memory used by a CREATE INDEX operation, regardless of
     428             :      * the use of parallelism or any other factor.
     429             :      */
     430      101464 :     buildstate->spool->sortstate =
     431       50732 :         tuplesort_begin_index_btree(heap, index, buildstate->isunique,
     432       50732 :                                     buildstate->nulls_not_distinct,
     433             :                                     maintenance_work_mem, coordinate,
     434             :                                     TUPLESORT_NONE);
     435             : 
     436             :     /*
     437             :      * If building a unique index, put dead tuples in a second spool to keep
     438             :      * them out of the uniqueness check.  We expect that the second spool (for
     439             :      * dead tuples) won't get very full, so we give it only work_mem.
     440             :      */
     441       50732 :     if (indexInfo->ii_Unique)
     442             :     {
     443       41430 :         BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
     444       41430 :         SortCoordinate coordinate2 = NULL;
     445             : 
     446             :         /* Initialize secondary spool */
     447       41430 :         btspool2->heap = heap;
     448       41430 :         btspool2->index = index;
     449       41430 :         btspool2->isunique = false;
     450             :         /* Save as secondary spool */
     451       41430 :         buildstate->spool2 = btspool2;
     452             : 
     453       41430 :         if (buildstate->btleader)
     454             :         {
     455             :             /*
     456             :              * Set up non-private state that is passed to
     457             :              * tuplesort_begin_index_btree() about the basic high level
     458             :              * coordination of a parallel sort.
     459             :              */
     460          64 :             coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
     461          64 :             coordinate2->isWorker = false;
     462          64 :             coordinate2->nParticipants =
     463          64 :                 buildstate->btleader->nparticipanttuplesorts;
     464          64 :             coordinate2->sharedsort = buildstate->btleader->sharedsort2;
     465             :         }
     466             : 
     467             :         /*
     468             :          * We expect that the second one (for dead tuples) won't get very
     469             :          * full, so we give it only work_mem
     470             :          */
     471       41430 :         buildstate->spool2->sortstate =
     472       41430 :             tuplesort_begin_index_btree(heap, index, false, false, work_mem,
     473             :                                         coordinate2, TUPLESORT_NONE);
     474             :     }
     475             : 
     476             :     /* Fill spool using either serial or parallel heap scan */
     477       50732 :     if (!buildstate->btleader)
     478       50584 :         reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
     479             :                                            _bt_build_callback, buildstate,
     480             :                                            NULL);
     481             :     else
     482         148 :         reltuples = _bt_parallel_heapscan(buildstate,
     483             :                                           &indexInfo->ii_BrokenHotChain);
     484             : 
     485             :     /*
     486             :      * Set the progress target for the next phase.  Reset the block number
     487             :      * values set by table_index_build_scan
     488             :      */
     489             :     {
     490       50720 :         const int   progress_index[] = {
     491             :             PROGRESS_CREATEIDX_TUPLES_TOTAL,
     492             :             PROGRESS_SCAN_BLOCKS_TOTAL,
     493             :             PROGRESS_SCAN_BLOCKS_DONE
     494             :         };
     495       50720 :         const int64 progress_vals[] = {
     496       50720 :             buildstate->indtuples,
     497             :             0, 0
     498             :         };
     499             : 
     500       50720 :         pgstat_progress_update_multi_param(3, progress_index, progress_vals);
     501             :     }
     502             : 
     503             :     /* okay, all heap tuples are spooled */
     504       50720 :     if (buildstate->spool2 && !buildstate->havedead)
     505             :     {
     506             :         /* spool2 turns out to be unnecessary */
     507       41404 :         _bt_spooldestroy(buildstate->spool2);
     508       41404 :         buildstate->spool2 = NULL;
     509             :     }
     510             : 
     511       50720 :     return reltuples;
     512             : }
     513             : 
     514             : /*
     515             :  * clean up a spool structure and its substructures.
     516             :  */
     517             : static void
     518       92066 : _bt_spooldestroy(BTSpool *btspool)
     519             : {
     520       92066 :     tuplesort_end(btspool->sortstate);
     521       92066 :     pfree(btspool);
     522       92066 : }
     523             : 
     524             : /*
     525             :  * spool an index entry into the sort file.
     526             :  */
     527             : static void
     528    12786100 : _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
     529             : {
     530    12786100 :     tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
     531             :                                   self, values, isnull);
     532    12786100 : }
     533             : 
     534             : /*
     535             :  * given a spool loaded by successive calls to _bt_spool,
     536             :  * create an entire btree.
     537             :  */
     538             : static void
     539       50720 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
     540             : {
     541             :     BTWriteState wstate;
     542             : 
     543             : #ifdef BTREE_BUILD_STATS
     544             :     if (log_btree_build_stats)
     545             :     {
     546             :         ShowUsage("BTREE BUILD (Spool) STATISTICS");
     547             :         ResetUsage();
     548             :     }
     549             : #endif                          /* BTREE_BUILD_STATS */
     550             : 
     551             :     /* Execute the sort */
     552       50720 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     553             :                                  PROGRESS_BTREE_PHASE_PERFORMSORT_1);
     554       50720 :     tuplesort_performsort(btspool->sortstate);
     555       50636 :     if (btspool2)
     556             :     {
     557          26 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     558             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_2);
     559          26 :         tuplesort_performsort(btspool2->sortstate);
     560             :     }
     561             : 
     562       50636 :     wstate.heap = btspool->heap;
     563       50636 :     wstate.index = btspool->index;
     564       50636 :     wstate.inskey = _bt_mkscankey(wstate.index, NULL);
     565             :     /* _bt_mkscankey() won't set allequalimage without metapage */
     566       50636 :     wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
     567             : 
     568             :     /* reserve the metapage */
     569       50636 :     wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
     570             : 
     571       50636 :     pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
     572             :                                  PROGRESS_BTREE_PHASE_LEAF_LOAD);
     573       50636 :     _bt_load(&wstate, btspool, btspool2);
     574       50636 : }
     575             : 
     576             : /*
     577             :  * Per-tuple callback for table_index_build_scan
     578             :  */
     579             : static void
     580    12786100 : _bt_build_callback(Relation index,
     581             :                    ItemPointer tid,
     582             :                    Datum *values,
     583             :                    bool *isnull,
     584             :                    bool tupleIsAlive,
     585             :                    void *state)
     586             : {
     587    12786100 :     BTBuildState *buildstate = (BTBuildState *) state;
     588             : 
     589             :     /*
     590             :      * insert the index tuple into the appropriate spool file for subsequent
     591             :      * processing
     592             :      */
     593    12786100 :     if (tupleIsAlive || buildstate->spool2 == NULL)
     594    12785594 :         _bt_spool(buildstate->spool, tid, values, isnull);
     595             :     else
     596             :     {
     597             :         /* dead tuples are put into spool2 */
     598         506 :         buildstate->havedead = true;
     599         506 :         _bt_spool(buildstate->spool2, tid, values, isnull);
     600             :     }
     601             : 
     602    12786100 :     buildstate->indtuples += 1;
     603    12786100 : }
     604             : 
     605             : /*
     606             :  * allocate workspace for a new, clean btree page, not linked to any siblings.
     607             :  */
     608             : static BulkWriteBuffer
     609       53000 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
     610             : {
     611             :     BulkWriteBuffer buf;
     612             :     Page        page;
     613             :     BTPageOpaque opaque;
     614             : 
     615       53000 :     buf = smgr_bulk_get_buf(wstate->bulkstate);
     616       53000 :     page = (Page) buf;
     617             : 
     618             :     /* Zero the page and set up standard page header info */
     619       53000 :     _bt_pageinit(page, BLCKSZ);
     620             : 
     621             :     /* Initialize BT opaque state */
     622       53000 :     opaque = BTPageGetOpaque(page);
     623       53000 :     opaque->btpo_prev = opaque->btpo_next = P_NONE;
     624       53000 :     opaque->btpo_level = level;
     625       53000 :     opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
     626       53000 :     opaque->btpo_cycleid = 0;
     627             : 
     628             :     /* Make the P_HIKEY line pointer appear allocated */
     629       53000 :     ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
     630             : 
     631       53000 :     return buf;
     632             : }
     633             : 
     634             : /*
     635             :  * emit a completed btree page, and release the working storage.
     636             :  */
     637             : static void
     638      103636 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
     639             : {
     640      103636 :     smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
     641             :     /* smgr_bulk_write took ownership of 'buf' */
     642      103636 : }
     643             : 
     644             : /*
     645             :  * allocate and initialize a new BTPageState.  the returned structure
     646             :  * is suitable for immediate use by _bt_buildadd.
     647             :  */
     648             : static BTPageState *
     649       12890 : _bt_pagestate(BTWriteState *wstate, uint32 level)
     650             : {
     651       12890 :     BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
     652             : 
     653             :     /* create initial page for level */
     654       12890 :     state->btps_buf = _bt_blnewpage(wstate, level);
     655             : 
     656             :     /* and assign it a page position */
     657       12890 :     state->btps_blkno = wstate->btws_pages_alloced++;
     658             : 
     659       12890 :     state->btps_lowkey = NULL;
     660             :     /* initialize lastoff so first item goes into P_FIRSTKEY */
     661       12890 :     state->btps_lastoff = P_HIKEY;
     662       12890 :     state->btps_lastextra = 0;
     663       12890 :     state->btps_level = level;
     664             :     /* set "full" threshold based on level.  See notes at head of file. */
     665       12890 :     if (level > 0)
     666        2692 :         state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
     667             :     else
     668       10198 :         state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
     669             : 
     670             :     /* no parent level, yet */
     671       12890 :     state->btps_next = NULL;
     672             : 
     673       12890 :     return state;
     674             : }
     675             : 
     676             : /*
     677             :  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
     678             :  * P_HIKEY, overwriting P_HIKEY).
     679             :  *
     680             :  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
     681             :  * rightmost page on its level is not supposed to get a high key.  Now that
     682             :  * it's clear that this page is a rightmost page, remove the unneeded empty
     683             :  * P_HIKEY line pointer space.
     684             :  */
     685             : static void
     686       12890 : _bt_slideleft(Page rightmostpage)
     687             : {
     688             :     OffsetNumber off;
     689             :     OffsetNumber maxoff;
     690             :     ItemId      previi;
     691             : 
     692       12890 :     maxoff = PageGetMaxOffsetNumber(rightmostpage);
     693             :     Assert(maxoff >= P_FIRSTKEY);
     694       12890 :     previi = PageGetItemId(rightmostpage, P_HIKEY);
     695      761300 :     for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
     696             :     {
     697      748410 :         ItemId      thisii = PageGetItemId(rightmostpage, off);
     698             : 
     699      748410 :         *previi = *thisii;
     700      748410 :         previi = thisii;
     701             :     }
     702       12890 :     ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
     703       12890 : }
     704             : 
     705             : /*
     706             :  * Add an item to a page being built.
     707             :  *
     708             :  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
     709             :  * raises an error directly.
     710             :  *
     711             :  * Note that our nbtsort.c caller does not know yet if the page will be
     712             :  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
     713             :  * caller.  Page that turns out to be the rightmost on its level is fixed by
     714             :  * calling _bt_slideleft().
     715             :  */
     716             : static void
     717    11498890 : _bt_sortaddtup(Page page,
     718             :                Size itemsize,
     719             :                const IndexTupleData *itup,
     720             :                OffsetNumber itup_off,
     721             :                bool newfirstdataitem)
     722             : {
     723             :     IndexTupleData trunctuple;
     724             : 
     725    11498890 :     if (newfirstdataitem)
     726             :     {
     727        2816 :         trunctuple = *itup;
     728        2816 :         trunctuple.t_info = sizeof(IndexTupleData);
     729        2816 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
     730        2816 :         itup = &trunctuple;
     731        2816 :         itemsize = sizeof(IndexTupleData);
     732             :     }
     733             : 
     734    11498890 :     if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
     735           0 :         elog(ERROR, "failed to add item to the index page");
     736    11498890 : }
     737             : 
     738             : /*----------
     739             :  * Add an item to a disk page from the sort output (or add a posting list
     740             :  * item formed from the sort output).
     741             :  *
     742             :  * We must be careful to observe the page layout conventions of nbtsearch.c:
     743             :  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
     744             :  * - on non-leaf pages, the key portion of the first item need not be
     745             :  *   stored, we should store only the link.
     746             :  *
     747             :  * A leaf page being built looks like:
     748             :  *
     749             :  * +----------------+---------------------------------+
     750             :  * | PageHeaderData | linp0 linp1 linp2 ...           |
     751             :  * +-----------+----+---------------------------------+
     752             :  * | ... linpN |                                      |
     753             :  * +-----------+--------------------------------------+
     754             :  * |     ^ last                                       |
     755             :  * |                                                  |
     756             :  * +-------------+------------------------------------+
     757             :  * |             | itemN ...                          |
     758             :  * +-------------+------------------+-----------------+
     759             :  * |          ... item3 item2 item1 | "special space" |
     760             :  * +--------------------------------+-----------------+
     761             :  *
     762             :  * Contrast this with the diagram in bufpage.h; note the mismatch
     763             :  * between linps and items.  This is because we reserve linp0 as a
     764             :  * placeholder for the pointer to the "high key" item; when we have
     765             :  * filled up the page, we will set linp0 to point to itemN and clear
     766             :  * linpN.  On the other hand, if we find this is the last (rightmost)
     767             :  * page, we leave the items alone and slide the linp array over.  If
     768             :  * the high key is to be truncated, offset 1 is deleted, and we insert
     769             :  * the truncated high key at offset 1.
     770             :  *
     771             :  * 'last' pointer indicates the last offset added to the page.
     772             :  *
     773             :  * 'truncextra' is the size of the posting list in itup, if any.  This
     774             :  * information is stashed for the next call here, when we may benefit
     775             :  * from considering the impact of truncating away the posting list on
     776             :  * the page before deciding to finish the page off.  Posting lists are
     777             :  * often relatively large, so it is worth going to the trouble of
     778             :  * accounting for the saving from truncating away the posting list of
     779             :  * the tuple that becomes the high key (that may be the only way to
     780             :  * get close to target free space on the page).  Note that this is
     781             :  * only used for the soft fillfactor-wise limit, not the critical hard
     782             :  * limit.
     783             :  *----------
     784             :  */
     785             : static void
     786    11458780 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
     787             :              Size truncextra)
     788             : {
     789             :     BulkWriteBuffer nbuf;
     790             :     Page        npage;
     791             :     BlockNumber nblkno;
     792             :     OffsetNumber last_off;
     793             :     Size        last_truncextra;
     794             :     Size        pgspc;
     795             :     Size        itupsz;
     796             :     bool        isleaf;
     797             : 
     798             :     /*
     799             :      * This is a handy place to check for cancel interrupts during the btree
     800             :      * load phase of index creation.
     801             :      */
     802    11458780 :     CHECK_FOR_INTERRUPTS();
     803             : 
     804    11458780 :     nbuf = state->btps_buf;
     805    11458780 :     npage = (Page) nbuf;
     806    11458780 :     nblkno = state->btps_blkno;
     807    11458780 :     last_off = state->btps_lastoff;
     808    11458780 :     last_truncextra = state->btps_lastextra;
     809    11458780 :     state->btps_lastextra = truncextra;
     810             : 
     811    11458780 :     pgspc = PageGetFreeSpace(npage);
     812    11458780 :     itupsz = IndexTupleSize(itup);
     813    11458780 :     itupsz = MAXALIGN(itupsz);
     814             :     /* Leaf case has slightly different rules due to suffix truncation */
     815    11458780 :     isleaf = (state->btps_level == 0);
     816             : 
     817             :     /*
     818             :      * Check whether the new item can fit on a btree page on current level at
     819             :      * all.
     820             :      *
     821             :      * Every newly built index will treat heap TID as part of the keyspace,
     822             :      * which imposes the requirement that new high keys must occasionally have
     823             :      * a heap TID appended within _bt_truncate().  That may leave a new pivot
     824             :      * tuple one or two MAXALIGN() quantums larger than the original
     825             :      * firstright tuple it's derived from.  v4 deals with the problem by
     826             :      * decreasing the limit on the size of tuples inserted on the leaf level
     827             :      * by the same small amount.  Enforce the new v4+ limit on the leaf level,
     828             :      * and the old limit on internal levels, since pivot tuples may need to
     829             :      * make use of the reserved space.  This should never fail on internal
     830             :      * pages.
     831             :      */
     832    11458780 :     if (unlikely(itupsz > BTMaxItemSize))
     833         264 :         _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
     834             :                              itup);
     835             : 
     836             :     /*
     837             :      * Check to see if current page will fit new item, with space left over to
     838             :      * append a heap TID during suffix truncation when page is a leaf page.
     839             :      *
     840             :      * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
     841             :      * high key with heap TID when finishing off a leaf page, since we rely on
     842             :      * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
     843             :      * internal pages we can always fit 3 pivot tuples with larger internal
     844             :      * page tuple limit (includes page high key).
     845             :      *
     846             :      * Most of the time, a page is only "full" in the sense that the soft
     847             :      * fillfactor-wise limit has been exceeded.  However, we must always leave
     848             :      * at least two items plus a high key on each page before starting a new
     849             :      * page.  Disregard fillfactor and insert on "full" current page if we
     850             :      * don't have the minimum number of items yet.  (Note that we deliberately
     851             :      * assume that suffix truncation neither enlarges nor shrinks new high key
     852             :      * when applying soft limit, except when last tuple has a posting list.)
     853             :      */
     854             :     Assert(last_truncextra == 0 || isleaf);
     855    11458780 :     if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
     856    11457624 :         (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
     857             :     {
     858             :         /*
     859             :          * Finish off the page and write it out.
     860             :          */
     861       40110 :         BulkWriteBuffer obuf = nbuf;
     862       40110 :         Page        opage = npage;
     863       40110 :         BlockNumber oblkno = nblkno;
     864             :         ItemId      ii;
     865             :         ItemId      hii;
     866             :         IndexTuple  oitup;
     867             : 
     868             :         /* Create new page of same level */
     869       40110 :         nbuf = _bt_blnewpage(wstate, state->btps_level);
     870       40110 :         npage = (Page) nbuf;
     871             : 
     872             :         /* and assign it a page position */
     873       40110 :         nblkno = wstate->btws_pages_alloced++;
     874             : 
     875             :         /*
     876             :          * We copy the last item on the page into the new page, and then
     877             :          * rearrange the old page so that the 'last item' becomes its high key
     878             :          * rather than a true data item.  There had better be at least two
     879             :          * items on the page already, else the page would be empty of useful
     880             :          * data.
     881             :          */
     882             :         Assert(last_off > P_FIRSTKEY);
     883       40110 :         ii = PageGetItemId(opage, last_off);
     884       40110 :         oitup = (IndexTuple) PageGetItem(opage, ii);
     885       40110 :         _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
     886       40110 :                        !isleaf);
     887             : 
     888             :         /*
     889             :          * Move 'last' into the high key position on opage.  _bt_blnewpage()
     890             :          * allocated empty space for a line pointer when opage was first
     891             :          * created, so this is a matter of rearranging already-allocated space
     892             :          * on page, and initializing high key line pointer. (Actually, leaf
     893             :          * pages must also swap oitup with a truncated version of oitup, which
     894             :          * is sometimes larger than oitup, though never by more than the space
     895             :          * needed to append a heap TID.)
     896             :          */
     897       40110 :         hii = PageGetItemId(opage, P_HIKEY);
     898       40110 :         *hii = *ii;
     899       40110 :         ItemIdSetUnused(ii);    /* redundant */
     900       40110 :         ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
     901             : 
     902       40110 :         if (isleaf)
     903             :         {
     904             :             IndexTuple  lastleft;
     905             :             IndexTuple  truncated;
     906             : 
     907             :             /*
     908             :              * Truncate away any unneeded attributes from high key on leaf
     909             :              * level.  This is only done at the leaf level because downlinks
     910             :              * in internal pages are either negative infinity items, or get
     911             :              * their contents from copying from one level down.  See also:
     912             :              * _bt_split().
     913             :              *
     914             :              * We don't try to bias our choice of split point to make it more
     915             :              * likely that _bt_truncate() can truncate away more attributes,
     916             :              * whereas the split point used within _bt_split() is chosen much
     917             :              * more delicately.  Even still, the lastleft and firstright
     918             :              * tuples passed to _bt_truncate() here are at least not fully
     919             :              * equal to each other when deduplication is used, unless there is
     920             :              * a large group of duplicates (also, unique index builds usually
     921             :              * have few or no spool2 duplicates).  When the split point is
     922             :              * between two unequal tuples, _bt_truncate() will avoid including
     923             :              * a heap TID in the new high key, which is the most important
     924             :              * benefit of suffix truncation.
     925             :              *
     926             :              * Overwrite the old item with new truncated high key directly.
     927             :              * oitup is already located at the physical beginning of tuple
     928             :              * space, so this should directly reuse the existing tuple space.
     929             :              */
     930       39986 :             ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
     931       39986 :             lastleft = (IndexTuple) PageGetItem(opage, ii);
     932             : 
     933             :             Assert(IndexTupleSize(oitup) > last_truncextra);
     934       39986 :             truncated = _bt_truncate(wstate->index, lastleft, oitup,
     935             :                                      wstate->inskey);
     936       39986 :             if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
     937           0 :                 elog(ERROR, "failed to add high key to the index page");
     938       39986 :             pfree(truncated);
     939             : 
     940             :             /* oitup should continue to point to the page's high key */
     941       39986 :             hii = PageGetItemId(opage, P_HIKEY);
     942       39986 :             oitup = (IndexTuple) PageGetItem(opage, hii);
     943             :         }
     944             : 
     945             :         /*
     946             :          * Link the old page into its parent, using its low key.  If we don't
     947             :          * have a parent, we have to create one; this adds a new btree level.
     948             :          */
     949       40110 :         if (state->btps_next == NULL)
     950        2692 :             state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
     951             : 
     952             :         Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
     953             :                 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
     954             :                 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
     955             :                P_LEFTMOST(BTPageGetOpaque(opage)));
     956             :         Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
     957             :                !P_LEFTMOST(BTPageGetOpaque(opage)));
     958       40110 :         BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
     959       40110 :         _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
     960       40110 :         pfree(state->btps_lowkey);
     961             : 
     962             :         /*
     963             :          * Save a copy of the high key from the old page.  It is also the low
     964             :          * key for the new page.
     965             :          */
     966       40110 :         state->btps_lowkey = CopyIndexTuple(oitup);
     967             : 
     968             :         /*
     969             :          * Set the sibling links for both pages.
     970             :          */
     971             :         {
     972       40110 :             BTPageOpaque oopaque = BTPageGetOpaque(opage);
     973       40110 :             BTPageOpaque nopaque = BTPageGetOpaque(npage);
     974             : 
     975       40110 :             oopaque->btpo_next = nblkno;
     976       40110 :             nopaque->btpo_prev = oblkno;
     977       40110 :             nopaque->btpo_next = P_NONE; /* redundant */
     978             :         }
     979             : 
     980             :         /*
     981             :          * Write out the old page. _bt_blwritepage takes ownership of the
     982             :          * 'opage' buffer.
     983             :          */
     984       40110 :         _bt_blwritepage(wstate, obuf, oblkno);
     985             : 
     986             :         /*
     987             :          * Reset last_off to point to new page
     988             :          */
     989       40110 :         last_off = P_FIRSTKEY;
     990             :     }
     991             : 
     992             :     /*
     993             :      * By here, either the original page is still the current page, or a new
     994             :      * page was created and became the current page.  Either way, the current
     995             :      * page definitely has space for the new item.
     996             :      *
     997             :      * If the new item is the first for its page, it must also be the first
     998             :      * item on its entire level.  On later same-level pages, a low key for a
     999             :      * page will be copied from the prior page in the code above.  Generate a
    1000             :      * minus infinity low key here instead.
    1001             :      */
    1002    11458780 :     if (last_off == P_HIKEY)
    1003             :     {
    1004             :         Assert(state->btps_lowkey == NULL);
    1005       12890 :         state->btps_lowkey = palloc0(sizeof(IndexTupleData));
    1006       12890 :         state->btps_lowkey->t_info = sizeof(IndexTupleData);
    1007       12890 :         BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
    1008             :     }
    1009             : 
    1010             :     /*
    1011             :      * Add the new item into the current page.
    1012             :      */
    1013    11458780 :     last_off = OffsetNumberNext(last_off);
    1014    11458780 :     _bt_sortaddtup(npage, itupsz, itup, last_off,
    1015    11458780 :                    !isleaf && last_off == P_FIRSTKEY);
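                     :     /*
                     :      * (The final argument flags the first data item on an internal page;
                     :      * _bt_sortaddtup() stores such an item with zero key attributes, i.e.
                     :      * as the page's "minus infinity" downlink.)
                     :      */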
    1016             : 
    1017    11458780 :     state->btps_buf = nbuf;
    1018    11458780 :     state->btps_blkno = nblkno;
    1019    11458780 :     state->btps_lastoff = last_off;
    1020    11458780 : }
    1021             : 
    1022             : /*
    1023             :  * Finalize pending posting list tuple, and add it to the index.  Final tuple
    1024             :  * is based on saved base tuple, and saved list of heap TIDs.
    1025             :  *
    1026             :  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
    1027             :  * using _bt_buildadd().
    1028             :  */
    1029             : static void
    1030     4741902 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
    1031             :                               BTDedupState dstate)
    1032             : {
    1033             :     Assert(dstate->nitems > 0);
    1034             : 
    1035     4741902 :     if (dstate->nitems == 1)
    1036     4701432 :         _bt_buildadd(wstate, state, dstate->base, 0);
    1037             :     else
    1038             :     {
    1039             :         IndexTuple  postingtuple;
    1040             :         Size        truncextra;
    1041             : 
    1042             :         /* form a tuple with a posting list */
    1043       40470 :         postingtuple = _bt_form_posting(dstate->base,
    1044       40470 :                                         dstate->htids,
    1045             :                                         dstate->nhtids);
    1046             :         /* Calculate posting list overhead */
    1047       40470 :         truncextra = IndexTupleSize(postingtuple) -
    1048       40470 :             BTreeTupleGetPostingOffset(postingtuple);
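                     :         /*
                     :          * Worked example (hypothetical numbers, assuming 8-byte MAXALIGN):
                     :          * with 4 saved heap TIDs the posting list adds
                     :          * 4 * sizeof(ItemPointerData) = 24 bytes beyond the posting offset,
                     :          * so truncextra is 24.  Passing the overhead along lets the
                     :          * fillfactor check in _bt_buildadd() discount space that suffix
                     :          * truncation is expected to remove from the page's high key.
                     :          */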
    1049             : 
    1050       40470 :         _bt_buildadd(wstate, state, postingtuple, truncextra);
    1051       40470 :         pfree(postingtuple);
    1052             :     }
    1053             : 
    1054     4741902 :     dstate->nmaxitems = 0;
    1055     4741902 :     dstate->nhtids = 0;
    1056     4741902 :     dstate->nitems = 0;
    1057     4741902 :     dstate->phystupsize = 0;
    1058     4741902 : }
    1059             : 
    1060             : /*
    1061             :  * Finish writing out the completed btree.
    1062             :  */
    1063             : static void
    1064       50636 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
    1065             : {
    1066             :     BTPageState *s;
    1067       50636 :     BlockNumber rootblkno = P_NONE;
    1068       50636 :     uint32      rootlevel = 0;
    1069             :     BulkWriteBuffer metabuf;
    1070             : 
    1071             :     /*
    1072             :      * Each iteration of this loop completes one more level of the tree.
    1073             :      */
    1074       63526 :     for (s = state; s != NULL; s = s->btps_next)
    1075             :     {
    1076             :         BlockNumber blkno;
    1077             :         BTPageOpaque opaque;
    1078             : 
    1079       12890 :         blkno = s->btps_blkno;
    1080       12890 :         opaque = BTPageGetOpaque((Page) s->btps_buf);
    1081             : 
    1082             :         /*
    1083             :          * We have to link the last page on this level to somewhere.
    1084             :          *
    1085             :          * If we're at the top, it's the root, so attach it to the metapage.
    1086             :          * Otherwise, add an entry for it to its parent using its low key.
    1087             :          * This may cause the last page of the parent level to split, but
    1088             :          * that's not a problem -- we haven't gotten to it yet.
    1089             :          */
    1090       12890 :         if (s->btps_next == NULL)
    1091             :         {
    1092       10198 :             opaque->btpo_flags |= BTP_ROOT;
    1093       10198 :             rootblkno = blkno;
    1094       10198 :             rootlevel = s->btps_level;
    1095             :         }
    1096             :         else
    1097             :         {
    1098             :             Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
    1099             :                     IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
    1100             :                     BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
    1101             :                    P_LEFTMOST(opaque));
    1102             :             Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
    1103             :                    !P_LEFTMOST(opaque));
    1104        2692 :             BTreeTupleSetDownLink(s->btps_lowkey, blkno);
    1105        2692 :             _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
    1106        2692 :             pfree(s->btps_lowkey);
    1107        2692 :             s->btps_lowkey = NULL;
    1108             :         }
    1109             : 
    1110             :         /*
    1111             :          * This is the rightmost page, so the ItemId array needs to be slid
    1112             :          * back one slot.  Then we can dump out the page.
    1113             :          */
    1114       12890 :         _bt_slideleft((Page) s->btps_buf);
    1115       12890 :         _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
    1116       12890 :         s->btps_buf = NULL;      /* writepage took ownership of the buffer */
    1117             :     }
    1118             : 
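                     :     /*
                     :      * Hypothetical three-level example of the loop above: it first
                     :      * finishes the last leaf page (adding its low key to level 1), then
                     :      * the last level-1 page (adding its low key to level 2), and finally
                     :      * flags the lone level-2 page as the root, whose block number is
                     :      * recorded in the metapage below.
                     :      */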
    1119             :     /*
    1120             :      * As the last step in the process, construct the metapage and make it
    1121             :      * point to the new root (unless we had no data at all, in which case it's
    1122             :      * set to point to "P_NONE").  This changes the index to the "valid" state
    1123             :      * by filling in a valid magic number in the metapage.
    1124             :      */
    1125       50636 :     metabuf = smgr_bulk_get_buf(wstate->bulkstate);
    1126       50636 :     _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
    1127       50636 :                      wstate->inskey->allequalimage);
    1128       50636 :     _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
    1129       50636 : }
    1130             : 
    1131             : /*
    1132             :  * Read tuples in correct sort order from tuplesort, and load them into
    1133             :  * btree leaves.
    1134             :  */
    1135             : static void
    1136       50636 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
    1137             : {
    1138       50636 :     BTPageState *state = NULL;
    1139       50636 :     bool        merge = (btspool2 != NULL);
    1140             :     IndexTuple  itup,
    1141       50636 :                 itup2 = NULL;
    1142             :     bool        load1;
    1143       50636 :     TupleDesc   tupdes = RelationGetDescr(wstate->index);
    1144             :     int         i,
    1145       50636 :                 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
    1146             :     SortSupport sortKeys;
    1147       50636 :     int64       tuples_done = 0;
    1148             :     bool        deduplicate;
    1149             : 
    1150       50636 :     wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
    1151             : 
    1152       59670 :     deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
    1153        9034 :         BTGetDeduplicateItems(wstate->index);
    1154             : 
    1155       50636 :     if (merge)
    1156             :     {
    1157             :         /*
    1158             :          * Another BTSpool for dead tuples exists. Now we have to merge
    1159             :          * btspool and btspool2.
    1160             :          */
    1161             : 
    1162             :         /* the preparation of merge */
    1163          26 :         itup = tuplesort_getindextuple(btspool->sortstate, true);
    1164          26 :         itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1165             : 
    1166             :         /* Prepare SortSupport data for each column */
    1167          26 :         sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
    1168             : 
    1169          54 :         for (i = 0; i < keysz; i++)
    1170             :         {
    1171          28 :             SortSupport sortKey = sortKeys + i;
    1172          28 :             ScanKey     scanKey = wstate->inskey->scankeys + i;
    1173             :             bool        reverse;
    1174             : 
    1175          28 :             sortKey->ssup_cxt = CurrentMemoryContext;
    1176          28 :             sortKey->ssup_collation = scanKey->sk_collation;
    1177          28 :             sortKey->ssup_nulls_first =
    1178          28 :                 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
    1179          28 :             sortKey->ssup_attno = scanKey->sk_attno;
    1180             :             /* Abbreviation is not supported here */
    1181          28 :             sortKey->abbreviate = false;
    1182             : 
    1183             :             Assert(sortKey->ssup_attno != 0);
    1184             : 
    1185          28 :             reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
    1186             : 
    1187          28 :             PrepareSortSupportFromIndexRel(wstate->index, reverse, sortKey);
    1188             :         }
    1189             : 
    1190             :         for (;;)
    1191             :         {
    1192        3348 :             load1 = true;       /* load BTSpool next ? */
    1193        3348 :             if (itup2 == NULL)
    1194             :             {
    1195         158 :                 if (itup == NULL)
    1196          26 :                     break;
    1197             :             }
    1198        3190 :             else if (itup != NULL)
    1199             :             {
    1200        2992 :                 int32       compare = 0;
    1201             : 
    1202        3240 :                 for (i = 1; i <= keysz; i++)
    1203             :                 {
    1204             :                     SortSupport entry;
    1205             :                     Datum       attrDatum1,
    1206             :                                 attrDatum2;
    1207             :                     bool        isNull1,
    1208             :                                 isNull2;
    1209             : 
    1210        3044 :                     entry = sortKeys + i - 1;
    1211        3044 :                     attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
    1212        3044 :                     attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
    1213             : 
    1214        3044 :                     compare = ApplySortComparator(attrDatum1, isNull1,
    1215             :                                                   attrDatum2, isNull2,
    1216             :                                                   entry);
    1217        3044 :                     if (compare > 0)
    1218             :                     {
    1219         278 :                         load1 = false;
    1220        2796 :                         break;
    1221             :                     }
    1222        2766 :                     else if (compare < 0)
    1223        2518 :                         break;
    1224             :                 }
    1225             : 
    1226             :                 /*
    1227             :                  * If key values are equal, we sort on ItemPointer.  This is
    1228             :                  * required for btree indexes, since heap TID is treated as an
    1229             :                  * implicit last key attribute in order to ensure that all
    1230             :                  * keys in the index are physically unique.
    1231             :                  */
    1232        2992 :                 if (compare == 0)
    1233             :                 {
    1234         196 :                     compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
    1235             :                     Assert(compare != 0);
    1236         196 :                     if (compare > 0)
    1237          30 :                         load1 = false;
    1238             :                 }
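                     :                 /*
                     :                  * Example of the tie-break with made-up TIDs: for equal key
                     :                  * values, itup at heap TID (0,5) versus itup2 at (0,3) makes
                     :                  * ItemPointerCompare() return > 0, so load1 becomes false
                     :                  * and the spool2 tuple is emitted first.
                     :                  */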
    1239             :             }
    1240             :             else
    1241         198 :                 load1 = false;
    1242             : 
    1243             :             /* When we see first tuple, create first index page */
    1244        3322 :             if (state == NULL)
    1245          26 :                 state = _bt_pagestate(wstate, 0);
    1246             : 
    1247        3322 :             if (load1)
    1248             :             {
    1249        2816 :                 _bt_buildadd(wstate, state, itup, 0);
    1250        2816 :                 itup = tuplesort_getindextuple(btspool->sortstate, true);
    1251             :             }
    1252             :             else
    1253             :             {
    1254         506 :                 _bt_buildadd(wstate, state, itup2, 0);
    1255         506 :                 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
    1256             :             }
    1257             : 
    1258             :             /* Report progress */
    1259        3322 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1260             :                                          ++tuples_done);
    1261             :         }
    1262          26 :         pfree(sortKeys);
    1263             :     }
    1264       50610 :     else if (deduplicate)
    1265             :     {
    1266             :         /* merge is unnecessary, deduplicate into posting lists */
    1267             :         BTDedupState dstate;
    1268             : 
    1269        9034 :         dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
    1270        9034 :         dstate->deduplicate = true; /* unused */
    1271        9034 :         dstate->nmaxitems = 0;   /* unused */
    1272        9034 :         dstate->maxpostingsize = 0; /* set later */
    1273             :         /* Metadata about base tuple of current pending posting list */
    1274        9034 :         dstate->base = NULL;
    1275        9034 :         dstate->baseoff = InvalidOffsetNumber;   /* unused */
    1276        9034 :         dstate->basetupsize = 0;
    1277             :         /* Metadata about current pending posting list TIDs */
    1278        9034 :         dstate->htids = NULL;
    1279        9034 :         dstate->nhtids = 0;
    1280        9034 :         dstate->nitems = 0;
    1281        9034 :         dstate->phystupsize = 0; /* unused */
    1282        9034 :         dstate->nintervals = 0; /* unused */
    1283             : 
    1284     6120680 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1285     6120680 :                                                true)) != NULL)
    1286             :         {
    1287             :             /* When we see first tuple, create first index page */
    1288     6111646 :             if (state == NULL)
    1289             :             {
    1290        2460 :                 state = _bt_pagestate(wstate, 0);
    1291             : 
    1292             :                 /*
    1293             :                  * Limit size of posting list tuples to 1/10 space we want to
    1294             :                  * leave behind on the page, plus space for final item's line
    1295             :                  * pointer.  This is equal to the space that we'd like to
    1296             :                  * leave behind on each leaf page when fillfactor is 90,
    1297             :                  * allowing us to get close to fillfactor% space utilization
    1298             :                  * when there happen to be a great many duplicates.  (This
    1299             :                  * makes higher leaf fillfactor settings ineffective when
    1300             :                  * building indexes that have many duplicates, but packing
    1301             :                  * leaf pages full with few very large tuples doesn't seem
    1302             :                  * like a useful goal.)
    1303             :                  * like a useful goal.)
                     :                  */
    1304        2460 :                 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
    1305             :                     sizeof(ItemIdData);
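                     :                 /*
                     :                  * Worked example, assuming the default BLCKSZ of 8192 and
                     :                  * 8-byte MAXALIGN: BLCKSZ * 10 / 100 = 819,
                     :                  * MAXALIGN_DOWN(819) = 816, and subtracting
                     :                  * sizeof(ItemIdData) = 4 leaves a maxpostingsize of 812
                     :                  * bytes.
                     :                  */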
    1306             :                 Assert(dstate->maxpostingsize <= BTMaxItemSize &&
    1307             :                        dstate->maxpostingsize <= INDEX_SIZE_MASK);
    1308        2460 :                 dstate->htids = palloc(dstate->maxpostingsize);
    1309             : 
    1310             :                 /* start new pending posting list with itup copy */
    1311        2460 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1312             :                                         InvalidOffsetNumber);
    1313             :             }
    1314     6109186 :             else if (_bt_keep_natts_fast(wstate->index, dstate->base,
    1315     1377682 :                                          itup) > keysz &&
    1316     1377682 :                      _bt_dedup_save_htid(dstate, itup))
    1317             :             {
    1318             :                 /*
    1319             :                  * Tuple is equal to base tuple of pending posting list.  Heap
    1320             :                  * TID from itup has been saved in state.
    1321             :                  */
    1322             :             }
    1323             :             else
    1324             :             {
    1325             :                 /*
    1326             :                  * Tuple is not equal to pending posting list tuple, or
    1327             :                  * _bt_dedup_save_htid() opted to not merge current item into
    1328             :                  * pending posting list.
    1329             :                  */
    1330     4739442 :                 _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1331     4739442 :                 pfree(dstate->base);
    1332             : 
    1333             :                 /* start new pending posting list with itup copy */
    1334     4739442 :                 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
    1335             :                                         InvalidOffsetNumber);
    1336             :             }
    1337             : 
    1338             :             /* Report progress */
    1339     6111646 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1340             :                                          ++tuples_done);
    1341             :         }
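                     :         /*
                     :          * Hypothetical illustration of the loop above: sorted input
                     :          * (7,tidA) (7,tidB) (7,tidC) (8,tidD) builds one pending posting
                     :          * list for key 7 holding three TIDs, which is flushed by
                     :          * _bt_sort_dedup_finish_pending() when key 8 arrives; key 8 then
                     :          * starts a new single-item pending list, flushed below.
                     :          */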
    1342             : 
    1343        9034 :         if (state)
    1344             :         {
    1345             :             /*
    1346             :              * Handle the last item (there must be a last item when the
    1347             :              * tuplesort returned one or more tuples)
    1348             :              */
    1349        2460 :             _bt_sort_dedup_finish_pending(wstate, state, dstate);
    1350        2460 :             pfree(dstate->base);
    1351        2460 :             pfree(dstate->htids);
    1352             :         }
    1353             : 
    1354        9034 :         pfree(dstate);
    1355             :     }
    1356             :     else
    1357             :     {
    1358             :         /* merging and deduplication are both unnecessary */
    1359     6712330 :         while ((itup = tuplesort_getindextuple(btspool->sortstate,
    1360     6712330 :                                                true)) != NULL)
    1361             :         {
    1362             :             /* When we see first tuple, create first index page */
    1363     6670754 :             if (state == NULL)
    1364        7712 :                 state = _bt_pagestate(wstate, 0);
    1365             : 
    1366     6670754 :             _bt_buildadd(wstate, state, itup, 0);
    1367             : 
    1368             :             /* Report progress */
    1369     6670754 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
    1370             :                                          ++tuples_done);
    1371             :         }
    1372             :     }
    1373             : 
    1374             :     /* Close down final pages and write the metapage */
    1375       50636 :     _bt_uppershutdown(wstate, state);
    1376       50636 :     smgr_bulk_finish(wstate->bulkstate);
    1377       50636 : }
    1378             : 
    1379             : /*
    1380             :  * Create parallel context, and launch workers for leader.
    1381             :  *
    1382             :  * buildstate argument should be initialized (with the exception of the
    1383             :  * tuplesort state in spools, which may later be created based on shared
    1384             :  * state initially set up here).
    1385             :  *
    1386             :  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
    1387             :  *
    1388             :  * request is the target number of parallel worker processes to launch.
    1389             :  *
    1390             :  * Sets buildstate's BTLeader, which caller must use to shut down parallel
    1391             :  * mode by passing it to _bt_end_parallel() at the very end of its index
    1392             :  * build.  If not even a single worker process can be launched, this is
    1393             :  * never set, and caller should proceed with a serial index build.
    1394             :  */
    1395             : static void
    1396         148 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
    1397             : {
    1398             :     ParallelContext *pcxt;
    1399             :     int         scantuplesortstates;
    1400             :     Snapshot    snapshot;
    1401             :     Size        estbtshared;
    1402             :     Size        estsort;
    1403             :     BTShared   *btshared;
    1404             :     Sharedsort *sharedsort;
    1405             :     Sharedsort *sharedsort2;
    1406         148 :     BTSpool    *btspool = buildstate->spool;
    1407         148 :     BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
    1408             :     WalUsage   *walusage;
    1409             :     BufferUsage *bufferusage;
    1410         148 :     bool        leaderparticipates = true;
    1411             :     int         querylen;
    1412             : 
    1413             : #ifdef DISABLE_LEADER_PARTICIPATION
    1414             :     leaderparticipates = false;
    1415             : #endif
    1416             : 
    1417             :     /*
    1418             :      * Enter parallel mode, and create context for parallel build of btree
    1419             :      * index
    1420             :      */
    1421         148 :     EnterParallelMode();
    1422             :     Assert(request > 0);
    1423         148 :     pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
    1424             :                                  request);
    1425             : 
    1426         148 :     scantuplesortstates = leaderparticipates ? request + 1 : request;
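                     :     /*
                     :      * For example, a request for 2 workers with a participating leader
                     :      * gives scantuplesortstates = 3; workers later divide
                     :      * maintenance_work_mem by this same count.
                     :      */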
    1427             : 
    1428             :     /*
    1429             :      * Prepare for scan of the base relation.  In a normal index build, we use
    1430             :      * SnapshotAny because we must retrieve all tuples and do our own time
    1431             :      * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
    1432             :      * concurrent build, we take a regular MVCC snapshot and index whatever's
    1433             :      * live according to that.
    1434             :      */
    1435         148 :     if (!isconcurrent)
    1436         148 :         snapshot = SnapshotAny;
    1437             :     else
    1438           0 :         snapshot = RegisterSnapshot(GetTransactionSnapshot());
    1439             : 
    1440             :     /*
    1441             :      * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
    1442             :      * PARALLEL_KEY_TUPLESORT tuplesort workspace
    1443             :      */
    1444         148 :     estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
    1445         148 :     shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
    1446         148 :     estsort = tuplesort_estimate_shared(scantuplesortstates);
    1447         148 :     shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1448             : 
    1449             :     /*
    1450             :      * Unique case requires a second spool, and so we may have to account for
    1451             :      * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
    1452             :      */
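                     :     /*
                     :      * To spell out the key counts above: the non-unique case reserves
                     :      * keys only for PARALLEL_KEY_BTREE_SHARED and PARALLEL_KEY_TUPLESORT,
                     :      * while the unique case reserves a third key (and a second
                     :      * estsort-sized chunk) for PARALLEL_KEY_TUPLESORT_SPOOL2.
                     :      */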
    1453         148 :     if (!btspool->isunique)
    1454          84 :         shm_toc_estimate_keys(&pcxt->estimator, 2);
    1455             :     else
    1456             :     {
    1457          64 :         shm_toc_estimate_chunk(&pcxt->estimator, estsort);
    1458          64 :         shm_toc_estimate_keys(&pcxt->estimator, 3);
    1459             :     }
    1460             : 
    1461             :     /*
    1462             :      * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
    1463             :      * and PARALLEL_KEY_BUFFER_USAGE.
    1464             :      *
    1465             :      * If there are no extensions loaded that care, we could skip this.  We
    1466             :      * have no way of knowing whether anyone's looking at pgWalUsage or
    1467             :      * pgBufferUsage, so do it unconditionally.
    1468             :      */
    1469         148 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1470             :                            mul_size(sizeof(WalUsage), pcxt->nworkers));
    1471         148 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1472         148 :     shm_toc_estimate_chunk(&pcxt->estimator,
    1473             :                            mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1474         148 :     shm_toc_estimate_keys(&pcxt->estimator, 1);
    1475             : 
    1476             :     /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    1477         148 :     if (debug_query_string)
    1478             :     {
    1479         148 :         querylen = strlen(debug_query_string);
    1480         148 :         shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
    1481         148 :         shm_toc_estimate_keys(&pcxt->estimator, 1);
    1482             :     }
    1483             :     else
    1484           0 :         querylen = 0;           /* keep compiler quiet */
    1485             : 
    1486             :     /* Everyone's had a chance to ask for space, so now create the DSM */
    1487         148 :     InitializeParallelDSM(pcxt);
    1488             : 
    1489             :     /* If no DSM segment was available, back out (do serial build) */
    1490         148 :     if (pcxt->seg == NULL)
    1491             :     {
    1492           0 :         if (IsMVCCSnapshot(snapshot))
    1493           0 :             UnregisterSnapshot(snapshot);
    1494           0 :         DestroyParallelContext(pcxt);
    1495           0 :         ExitParallelMode();
    1496           0 :         return;
    1497             :     }
    1498             : 
    1499             :     /* Store shared build state, for which we reserved space */
    1500         148 :     btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    1501             :     /* Initialize immutable state */
    1502         148 :     btshared->heaprelid = RelationGetRelid(btspool->heap);
    1503         148 :     btshared->indexrelid = RelationGetRelid(btspool->index);
    1504         148 :     btshared->isunique = btspool->isunique;
    1505         148 :     btshared->nulls_not_distinct = btspool->nulls_not_distinct;
    1506         148 :     btshared->isconcurrent = isconcurrent;
    1507         148 :     btshared->scantuplesortstates = scantuplesortstates;
    1508         148 :     btshared->queryid = pgstat_get_my_query_id();
    1509         148 :     ConditionVariableInit(&btshared->workersdonecv);
    1510         148 :     SpinLockInit(&btshared->mutex);
    1511             :     /* Initialize mutable state */
    1512         148 :     btshared->nparticipantsdone = 0;
    1513         148 :     btshared->reltuples = 0.0;
    1514         148 :     btshared->havedead = false;
    1515         148 :     btshared->indtuples = 0.0;
    1516         148 :     btshared->brokenhotchain = false;
    1517         148 :     table_parallelscan_initialize(btspool->heap,
    1518             :                                   ParallelTableScanFromBTShared(btshared),
    1519             :                                   snapshot);
    1520             : 
    1521             :     /*
    1522             :      * Store shared tuplesort-private state, for which we reserved space.
    1523             :      * Then, initialize opaque state using tuplesort routine.
    1524             :      */
    1525         148 :     sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1526         148 :     tuplesort_initialize_shared(sharedsort, scantuplesortstates,
    1527             :                                 pcxt->seg);
    1528             : 
    1529         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    1530         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
    1531             : 
    1532             :     /* Unique case requires a second spool, and associated shared state */
    1533         148 :     if (!btspool->isunique)
    1534          84 :         sharedsort2 = NULL;
    1535             :     else
    1536             :     {
    1537             :         /*
    1538             :          * Store additional shared tuplesort-private state, for which we
    1539             :          * reserved space.  Then, initialize opaque state using tuplesort
    1540             :          * routine.
    1541             :          */
    1542          64 :         sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    1543          64 :         tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
    1544             :                                     pcxt->seg);
    1545             : 
    1546          64 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    1547             :     }
    1548             : 
    1549             :     /* Store query string for workers */
    1550         148 :     if (debug_query_string)
    1551             :     {
    1552             :         char       *sharedquery;
    1553             : 
    1554         148 :         sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
    1555         148 :         memcpy(sharedquery, debug_query_string, querylen + 1);
    1556         148 :         shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    1557             :     }
    1558             : 
    1559             :     /*
    1560             :      * Allocate space for each worker's WalUsage and BufferUsage; no need to
    1561             :      * initialize.
    1562             :      */
    1563         148 :     walusage = shm_toc_allocate(pcxt->toc,
    1564         148 :                                 mul_size(sizeof(WalUsage), pcxt->nworkers));
    1565         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    1566         148 :     bufferusage = shm_toc_allocate(pcxt->toc,
    1567         148 :                                    mul_size(sizeof(BufferUsage), pcxt->nworkers));
    1568         148 :     shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
    1569             : 
    1570             :     /* Launch workers, saving status for leader/caller */
    1571         148 :     LaunchParallelWorkers(pcxt);
    1572         148 :     btleader->pcxt = pcxt;
    1573         148 :     btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    1574         148 :     if (leaderparticipates)
    1575         148 :         btleader->nparticipanttuplesorts++;
    1576         148 :     btleader->btshared = btshared;
    1577         148 :     btleader->sharedsort = sharedsort;
    1578         148 :     btleader->sharedsort2 = sharedsort2;
    1579         148 :     btleader->snapshot = snapshot;
    1580         148 :     btleader->walusage = walusage;
    1581         148 :     btleader->bufferusage = bufferusage;
    1582             : 
    1583             :     /* If no workers were successfully launched, back out (do serial build) */
    1584         148 :     if (pcxt->nworkers_launched == 0)
    1585             :     {
    1586           0 :         _bt_end_parallel(btleader);
    1587           0 :         return;
    1588             :     }
    1589             : 
    1590             :     /* Save leader state now that it's clear build will be parallel */
    1591         148 :     buildstate->btleader = btleader;
    1592             : 
    1593             :     /* Join heap scan ourselves */
    1594         148 :     if (leaderparticipates)
    1595         148 :         _bt_leader_participate_as_worker(buildstate);
    1596             : 
    1597             :     /*
    1598             :      * Caller needs to wait for all launched workers when we return.  Make
    1599             :      * sure that the failure-to-start case will not hang forever.
    1600             :      */
    1601         148 :     WaitForParallelWorkersToAttach(pcxt);
    1602             : }
    1603             : 
    1604             : /*
    1605             :  * Shut down workers, destroy parallel context, and end parallel mode.
    1606             :  */
    1607             : static void
    1608         148 : _bt_end_parallel(BTLeader *btleader)
    1609             : {
    1610             :     int         i;
    1611             : 
    1612             :     /* Shutdown worker processes */
    1613         148 :     WaitForParallelWorkersToFinish(btleader->pcxt);
    1614             : 
    1615             :     /*
    1616             :      * Next, accumulate WAL usage.  (This must wait for the workers to finish,
    1617             :      * or we might get incomplete data.)
    1618             :      */
    1619         296 :     for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
    1620         148 :         InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
    1621             : 
    1622             :     /* Free last reference to MVCC snapshot, if one was used */
    1623         148 :     if (IsMVCCSnapshot(btleader->snapshot))
    1624           0 :         UnregisterSnapshot(btleader->snapshot);
    1625         148 :     DestroyParallelContext(btleader->pcxt);
    1626         148 :     ExitParallelMode();
    1627         148 : }
    1628             : 
    1629             : /*
    1630             :  * Returns size of shared memory required to store state for a parallel
    1631             :  * btree index build based on the snapshot its parallel scan will use.
    1632             :  */
    1633             : static Size
    1634         148 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
    1635             : {
    1636             :     /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
    1637         148 :     return add_size(BUFFERALIGN(sizeof(BTShared)),
    1638             :                     table_parallelscan_estimate(heap, snapshot));
    1639             : }
    1640             : 
    1641             : /*
    1642             :  * Within leader, wait for end of heap scan.
    1643             :  *
    1644             :  * When called, parallel heap scan started by _bt_begin_parallel() will
    1645             :  * When called, the parallel heap scan started by _bt_begin_parallel() will
    1646             :  * already be underway within worker processes (when the leader participates
    1647             :  *
    1648             :  * Fills in fields needed for ambuild statistics, and lets caller set
    1649             :  * field indicating that some worker encountered a broken HOT chain.
    1650             :  *
    1651             :  * Returns the total number of heap tuples scanned.
    1652             :  */
    1653             : static double
    1654         148 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
    1655             : {
    1656         148 :     BTShared   *btshared = buildstate->btleader->btshared;
    1657             :     int         nparticipanttuplesorts;
    1658             :     double      reltuples;
    1659             : 
    1660         148 :     nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    1661             :     for (;;)
    1662             :     {
    1663         336 :         SpinLockAcquire(&btshared->mutex);
    1664         336 :         if (btshared->nparticipantsdone == nparticipanttuplesorts)
    1665             :         {
    1666         148 :             buildstate->havedead = btshared->havedead;
    1667         148 :             buildstate->indtuples = btshared->indtuples;
    1668         148 :             *brokenhotchain = btshared->brokenhotchain;
    1669         148 :             reltuples = btshared->reltuples;
    1670         148 :             SpinLockRelease(&btshared->mutex);
    1671         148 :             break;
    1672             :         }
    1673         188 :         SpinLockRelease(&btshared->mutex);
    1674             : 
    1675         188 :         ConditionVariableSleep(&btshared->workersdonecv,
    1676             :                                WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    1677             :     }
    1678             : 
    1679         148 :     ConditionVariableCancelSleep();
    1680             : 
    1681         148 :     return reltuples;
    1682             : }
    1683             : 
    1684             : /*
    1685             :  * Within leader, participate as a parallel worker.
    1686             :  */
    1687             : static void
    1688         148 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
    1689             : {
    1690         148 :     BTLeader   *btleader = buildstate->btleader;
    1691             :     BTSpool    *leaderworker;
    1692             :     BTSpool    *leaderworker2;
    1693             :     int         sortmem;
    1694             : 
    1695             :     /* Allocate memory and initialize private spool */
    1696         148 :     leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
    1697         148 :     leaderworker->heap = buildstate->spool->heap;
    1698         148 :     leaderworker->index = buildstate->spool->index;
    1699         148 :     leaderworker->isunique = buildstate->spool->isunique;
    1700         148 :     leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
    1701             : 
    1702             :     /* Initialize second spool, if required */
    1703         148 :     if (!btleader->btshared->isunique)
    1704          84 :         leaderworker2 = NULL;
    1705             :     else
    1706             :     {
    1707             :         /* Allocate memory for worker's own private secondary spool */
    1708          64 :         leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1709             : 
    1710             :         /* Initialize worker's own secondary spool */
    1711          64 :         leaderworker2->heap = leaderworker->heap;
    1712          64 :         leaderworker2->index = leaderworker->index;
    1713          64 :         leaderworker2->isunique = false;
    1714             :     }
    1715             : 
    1716             :     /*
    1717             :      * Might as well use a reliable figure when doling out
    1718             :      * maintenance_work_mem (when the requested number of workers was not
    1719             :      * launched, this will be somewhat higher than it is for other workers).
    1720             :      */
    1721         148 :     sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
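                     :     /*
                     :      * Example with assumed settings: maintenance_work_mem = 65536kB and
                     :      * nparticipanttuplesorts = 3 (two launched workers plus the leader)
                     :      * gives each participant roughly 21845kB of sort memory.
                     :      */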
    1722             : 
    1723             :     /* Perform work common to all participants */
    1724         148 :     _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
    1725             :                                btleader->sharedsort, btleader->sharedsort2,
    1726             :                                sortmem, true);
    1727             : 
    1728             : #ifdef BTREE_BUILD_STATS
    1729             :     if (log_btree_build_stats)
    1730             :     {
    1731             :         ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
    1732             :         ResetUsage();
    1733             :     }
    1734             : #endif                          /* BTREE_BUILD_STATS */
    1735         148 : }
    1736             : 
    1737             : /*
    1738             :  * Perform work within a launched parallel process.
    1739             :  */
    1740             : void
    1741         148 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
    1742             : {
    1743             :     char       *sharedquery;
    1744             :     BTSpool    *btspool;
    1745             :     BTSpool    *btspool2;
    1746             :     BTShared   *btshared;
    1747             :     Sharedsort *sharedsort;
    1748             :     Sharedsort *sharedsort2;
    1749             :     Relation    heapRel;
    1750             :     Relation    indexRel;
    1751             :     LOCKMODE    heapLockmode;
    1752             :     LOCKMODE    indexLockmode;
    1753             :     WalUsage   *walusage;
    1754             :     BufferUsage *bufferusage;
    1755             :     int         sortmem;
    1756             : 
    1757             : #ifdef BTREE_BUILD_STATS
    1758             :     if (log_btree_build_stats)
    1759             :         ResetUsage();
    1760             : #endif                          /* BTREE_BUILD_STATS */
    1761             : 
    1762             :     /*
    1763             :      * The only possible status flag that can be set for the parallel worker is
    1764             :      * PROC_IN_SAFE_IC.
    1765             :      */
    1766             :     Assert((MyProc->statusFlags == 0) ||
    1767             :            (MyProc->statusFlags == PROC_IN_SAFE_IC));
    1768             : 
    1769             :     /* Set debug_query_string for individual workers first */
    1770         148 :     sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    1771         148 :     debug_query_string = sharedquery;
    1772             : 
    1773             :     /* Report the query string from leader */
    1774         148 :     pgstat_report_activity(STATE_RUNNING, debug_query_string);
    1775             : 
    1776             :     /* Look up nbtree shared state */
    1777         148 :     btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
    1778             : 
    1779             :     /* Open relations using lock modes known to be obtained by index.c */
    1780         148 :     if (!btshared->isconcurrent)
    1781             :     {
    1782         148 :         heapLockmode = ShareLock;
    1783         148 :         indexLockmode = AccessExclusiveLock;
    1784             :     }
    1785             :     else
    1786             :     {
    1787           0 :         heapLockmode = ShareUpdateExclusiveLock;
    1788           0 :         indexLockmode = RowExclusiveLock;
    1789             :     }
    1790             : 
    1791             :     /* Track query ID */
    1792         148 :     pgstat_report_query_id(btshared->queryid, false);
    1793             : 
    1794             :     /* Open relations within worker */
    1795         148 :     heapRel = table_open(btshared->heaprelid, heapLockmode);
    1796         148 :     indexRel = index_open(btshared->indexrelid, indexLockmode);
    1797             : 
    1798             :     /* Initialize worker's own spool */
    1799         148 :     btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    1800         148 :     btspool->heap = heapRel;
    1801         148 :     btspool->index = indexRel;
    1802         148 :     btspool->isunique = btshared->isunique;
    1803         148 :     btspool->nulls_not_distinct = btshared->nulls_not_distinct;
    1804             : 
    1805             :     /* Look up shared state private to tuplesort.c */
    1806         148 :     sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    1807         148 :     tuplesort_attach_shared(sharedsort, seg);
    1808         148 :     if (!btshared->isunique)
    1809             :     {
    1810          84 :         btspool2 = NULL;
    1811          84 :         sharedsort2 = NULL;
    1812             :     }
    1813             :     else
    1814             :     {
    1815             :         /* Allocate memory for worker's own private secondary spool */
    1816          64 :         btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
    1817             : 
    1818             :         /* Initialize worker's own secondary spool */
    1819          64 :         btspool2->heap = btspool->heap;
    1820          64 :         btspool2->index = btspool->index;
    1821          64 :         btspool2->isunique = false;
    1822             :         /* Look up shared state private to tuplesort.c */
    1823          64 :         sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
    1824          64 :         tuplesort_attach_shared(sharedsort2, seg);
    1825             :     }
    1826             : 
    1827             :     /* Prepare to track buffer usage during parallel execution */
    1828         148 :     InstrStartParallelQuery();
    1829             : 
    1830             :     /* Perform sorting of spool, and possibly a spool2 */
    1831         148 :     sortmem = maintenance_work_mem / btshared->scantuplesortstates;
    1832         148 :     _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
    1833             :                                sharedsort2, sortmem, false);
    1834             : 
    1835             :     /* Report WAL/buffer usage during parallel execution */
    1836         148 :     bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    1837         148 :     walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    1838         148 :     InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
    1839         148 :                           &walusage[ParallelWorkerNumber]);
    1840             : 
    1841             : #ifdef BTREE_BUILD_STATS
    1842             :     if (log_btree_build_stats)
    1843             :     {
    1844             :         ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
    1845             :         ResetUsage();
    1846             :     }
    1847             : #endif                          /* BTREE_BUILD_STATS */
    1848             : 
    1849         148 :     index_close(indexRel, indexLockmode);
    1850         148 :     table_close(heapRel, heapLockmode);
    1851         148 : }
    1852             : 
    1853             : /*
    1854             :  * Perform a worker's portion of a parallel sort.
    1855             :  *
    1856             :  * This generates a tuplesort for the passed btspool, and a second tuplesort
    1857             :  * state if a second btspool is needed (i.e. for unique index builds).  All
    1858             :  * other spool fields should already be set when this is called.
    1859             :  *
    1860             :  * sortmem is the amount of working memory to use within each worker,
    1861             :  * expressed in KBs.
    1862             :  *
    1863             :  * When this returns, workers are done, and need only release resources.
    1864             :  */
    1865             : static void
    1866         296 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
    1867             :                            BTShared *btshared, Sharedsort *sharedsort,
    1868             :                            Sharedsort *sharedsort2, int sortmem, bool progress)
    1869             : {
    1870             :     SortCoordinate coordinate;
    1871             :     BTBuildState buildstate;
    1872             :     TableScanDesc scan;
    1873             :     double      reltuples;
    1874             :     IndexInfo  *indexInfo;
    1875             : 
    1876             :     /* Initialize local tuplesort coordination state */
    1877         296 :     coordinate = palloc0(sizeof(SortCoordinateData));
    1878         296 :     coordinate->isWorker = true;
    1879         296 :     coordinate->nParticipants = -1;
    1880         296 :     coordinate->sharedsort = sharedsort;
    1881             : 
    1882             :     /* Begin "partial" tuplesort */
    1883         592 :     btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
    1884             :                                                      btspool->index,
    1885         296 :                                                      btspool->isunique,
    1886         296 :                                                      btspool->nulls_not_distinct,
    1887             :                                                      sortmem, coordinate,
    1888             :                                                      TUPLESORT_NONE);
    1889             : 
    1890             :     /*
    1891             :      * Just as with serial case, there may be a second spool.  If so, a
    1892             :      * second, dedicated spool2 partial tuplesort is required.
    1893             :      */
    1894         296 :     if (btspool2)
    1895             :     {
    1896             :         SortCoordinate coordinate2;
    1897             : 
    1898             :         /*
    1899             :          * We expect that the second one (for dead tuples) won't get very
    1900             :          * full, so we give it only work_mem (unless sortmem is less for the
    1901             :          * worker).  Worker processes are generally permitted to allocate
    1902             :          * work_mem independently.
    1903             :          */
    1904         128 :         coordinate2 = palloc0(sizeof(SortCoordinateData));
    1905         128 :         coordinate2->isWorker = true;
    1906         128 :         coordinate2->nParticipants = -1;
    1907         128 :         coordinate2->sharedsort = sharedsort2;
    1908         128 :         btspool2->sortstate =
    1909         128 :             tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
    1910             :                                         Min(sortmem, work_mem), coordinate2,
    1911             :                                         false);
    1912             :                                         TUPLESORT_NONE);
    1913             : 
    1914             :     /* Fill in buildstate for _bt_build_callback() */
    1915         296 :     buildstate.isunique = btshared->isunique;
    1916         296 :     buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
    1917         296 :     buildstate.havedead = false;
    1918         296 :     buildstate.heap = btspool->heap;
    1919         296 :     buildstate.spool = btspool;
    1920         296 :     buildstate.spool2 = btspool2;
    1921         296 :     buildstate.indtuples = 0;
    1922         296 :     buildstate.btleader = NULL;
    1923             : 
    1924             :     /* Join parallel scan */
    1925         296 :     indexInfo = BuildIndexInfo(btspool->index);
    1926         296 :     indexInfo->ii_Concurrent = btshared->isconcurrent;
    1927         296 :     scan = table_beginscan_parallel(btspool->heap,
    1928             :                                     ParallelTableScanFromBTShared(btshared));
    1929         296 :     reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
    1930             :                                        true, progress, _bt_build_callback,
    1931             :                                        &buildstate, scan);
    1932             : 
    1933             :     /* Execute this worker's part of the sort */
    1934         296 :     if (progress)
    1935         148 :         pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1936             :                                      PROGRESS_BTREE_PHASE_PERFORMSORT_1);
    1937         296 :     tuplesort_performsort(btspool->sortstate);
    1938         296 :     if (btspool2)
    1939             :     {
    1940         128 :         if (progress)
    1941          64 :             pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
    1942             :                                          PROGRESS_BTREE_PHASE_PERFORMSORT_2);
    1943         128 :         tuplesort_performsort(btspool2->sortstate);
    1944             :     }
    1945             : 
    1946             :     /*
    1947             :      * Done.  Record ambuild statistics, and whether we encountered a broken
    1948             :      * HOT chain.
    1949             :      */
    1950         296 :     SpinLockAcquire(&btshared->mutex);
    1951         296 :     btshared->nparticipantsdone++;
    1952         296 :     btshared->reltuples += reltuples;
    1953         296 :     if (buildstate.havedead)
    1954           0 :         btshared->havedead = true;
    1955         296 :     btshared->indtuples += buildstate.indtuples;
    1956         296 :     if (indexInfo->ii_BrokenHotChain)
    1957           0 :         btshared->brokenhotchain = true;
    1958         296 :     SpinLockRelease(&btshared->mutex);
    1959             : 
    1960             :     /* Notify leader */
    1961         296 :     ConditionVariableSignal(&btshared->workersdonecv);
    1962             : 
    1963             :     /* We can end tuplesorts immediately */
    1964         296 :     tuplesort_end(btspool->sortstate);
    1965         296 :     if (btspool2)
    1966         128 :         tuplesort_end(btspool2->sortstate);
    1967         296 : }

Generated by: LCOV version 1.16