Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsort.c
4 : * Build a btree from sorted input by loading leaf pages sequentially.
5 : *
6 : * NOTES
7 : *
8 : * We use tuplesort.c to sort the given index tuples into order.
9 : * Then we scan the index tuples in order and build the btree pages
10 : * for each level. We load source tuples into leaf-level pages.
11 : * Whenever we fill a page at one level, we add a link to it to its
12 : * parent level (starting a new parent level if necessary). When
13 : * done, we write out each final page on each level, adding it to
14 : * its parent level. When we have only one page on a level, it must be
15 : * the root -- it can be attached to the btree metapage and we are done.
16 : *
17 : * It is not wise to pack the pages entirely full, since then *any*
18 : * insertion would cause a split (and not only of the leaf page; the need
19 : * for a split would cascade right up the tree). The steady-state load
20 : * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 : * pages to the user-controllable fill factor (default 90%) while upper pages
22 : * are always packed to 70%. This gives us reasonable density (there aren't
23 : * many upper pages if the keys are reasonable-size) without risking a lot of
24 : * cascading splits during early insertions.
25 : *
26 : * We use the bulk smgr loading facility to bypass the buffer cache and
27 : * WAL-log the pages efficiently.
28 : *
29 : * This code isn't concerned about the FSM at all. The caller is responsible
30 : * for initializing that.
31 : *
32 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
33 : * Portions Copyright (c) 1994, Regents of the University of California
34 : *
35 : * IDENTIFICATION
36 : * src/backend/access/nbtree/nbtsort.c
37 : *
38 : *-------------------------------------------------------------------------
39 : */
40 :
41 : #include "postgres.h"
42 :
43 : #include "access/nbtree.h"
44 : #include "access/parallel.h"
45 : #include "access/relscan.h"
46 : #include "access/table.h"
47 : #include "access/xact.h"
48 : #include "catalog/index.h"
49 : #include "commands/progress.h"
50 : #include "executor/instrument.h"
51 : #include "miscadmin.h"
52 : #include "pgstat.h"
53 : #include "storage/bulk_write.h"
54 : #include "tcop/tcopprot.h"
55 : #include "utils/rel.h"
56 : #include "utils/sortsupport.h"
57 : #include "utils/tuplesort.h"
58 :
59 :
60 : /* Magic numbers for parallel state sharing */
61 : #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
62 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
63 : #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
64 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
65 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
66 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
67 :
68 : /*
69 : * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
70 : * parallel index builds. This may be useful as a debugging aid.
71 : #undef DISABLE_LEADER_PARTICIPATION
72 : */
73 :
74 : /*
75 : * Status record for spooling/sorting phase. (Note we may have two of
76 : * these due to the special requirements for uniqueness-checking with
77 : * dead tuples.)
78 : */
79 : typedef struct BTSpool
80 : {
81 : Tuplesortstate *sortstate; /* state data for tuplesort.c */
82 : Relation heap;
83 : Relation index;
84 : bool isunique;
85 : bool nulls_not_distinct;
86 : } BTSpool;
87 :
88 : /*
89 : * Status for index builds performed in parallel. This is allocated in a
90 : * dynamic shared memory segment. Note that there is a separate tuplesort TOC
91 : * entry, private to tuplesort.c but allocated by this module on its behalf.
92 : */
93 : typedef struct BTShared
94 : {
95 : /*
96 : * These fields are not modified during the sort. They primarily exist
97 : * for the benefit of worker processes that need to create BTSpool state
98 : * corresponding to that used by the leader.
99 : */
100 : Oid heaprelid;
101 : Oid indexrelid;
102 : bool isunique;
103 : bool nulls_not_distinct;
104 : bool isconcurrent;
105 : int scantuplesortstates;
106 :
107 : /* Query ID, for report in worker processes */
108 : uint64 queryid;
109 :
110 : /*
111 : * workersdonecv is used to monitor the progress of workers. All parallel
112 : * participants must indicate that they are done before leader can use
113 : * mutable state that workers maintain during scan (and before leader can
114 : * proceed to tuplesort_performsort()).
115 : */
116 : ConditionVariable workersdonecv;
117 :
118 : /*
119 : * mutex protects all fields before heapdesc.
120 : *
121 : * These fields contain status information of interest to B-Tree index
122 : * builds that must work just the same when an index is built in parallel.
123 : */
124 : slock_t mutex;
125 :
126 : /*
127 : * Mutable state that is maintained by workers, and reported back to
128 : * leader at end of parallel scan.
129 : *
130 : * nparticipantsdone is number of worker processes finished.
131 : *
132 : * reltuples is the total number of input heap tuples.
133 : *
134 : * havedead indicates if RECENTLY_DEAD tuples were encountered during
135 : * build.
136 : *
137 : * indtuples is the total number of tuples that made it into the index.
138 : *
139 : * brokenhotchain indicates if any worker detected a broken HOT chain
140 : * during build.
141 : */
142 : int nparticipantsdone;
143 : double reltuples;
144 : bool havedead;
145 : double indtuples;
146 : bool brokenhotchain;
147 :
148 : /*
149 : * ParallelTableScanDescData data follows. Can't directly embed here, as
150 : * implementations of the parallel table scan desc interface might need
151 : * stronger alignment.
152 : */
153 : } BTShared;
154 :
155 : /*
156 : * Return pointer to a BTShared's parallel table scan.
157 : *
158 : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
159 : * MAXALIGN.
160 : */
161 : #define ParallelTableScanFromBTShared(shared) \
162 : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
163 :
164 : /*
165 : * Status for leader in parallel index build.
166 : */
167 : typedef struct BTLeader
168 : {
169 : /* parallel context itself */
170 : ParallelContext *pcxt;
171 :
172 : /*
173 : * nparticipanttuplesorts is the exact number of worker processes
174 : * successfully launched, plus one leader process if it participates as a
175 : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
176 : * participating as a worker).
177 : */
178 : int nparticipanttuplesorts;
179 :
180 : /*
181 : * Leader process convenience pointers to shared state (leader avoids TOC
182 : * lookups).
183 : *
184 : * btshared is the shared state for entire build. sharedsort is the
185 : * shared, tuplesort-managed state passed to each process tuplesort.
186 : * sharedsort2 is the corresponding btspool2 shared state, used only when
187 : * building unique indexes. snapshot is the snapshot used by the scan iff
188 : * an MVCC snapshot is required.
189 : */
190 : BTShared *btshared;
191 : Sharedsort *sharedsort;
192 : Sharedsort *sharedsort2;
193 : Snapshot snapshot;
194 : WalUsage *walusage;
195 : BufferUsage *bufferusage;
196 : } BTLeader;
197 :
198 : /*
199 : * Working state for btbuild and its callback.
200 : *
201 : * When parallel CREATE INDEX is used, there is a BTBuildState for each
202 : * participant.
203 : */
204 : typedef struct BTBuildState
205 : {
206 : bool isunique;
207 : bool nulls_not_distinct;
208 : bool havedead;
209 : Relation heap;
210 : BTSpool *spool;
211 :
212 : /*
213 : * spool2 is needed only when the index is a unique index. Dead tuples are
214 : * put into spool2 instead of spool in order to avoid uniqueness check.
215 : */
216 : BTSpool *spool2;
217 : double indtuples;
218 :
219 : /*
220 : * btleader is only present when a parallel index build is performed, and
221 : * only in the leader process. (Actually, only the leader has a
222 : * BTBuildState. Workers have their own spool and spool2, though.)
223 : */
224 : BTLeader *btleader;
225 : } BTBuildState;
226 :
227 : /*
228 : * Status record for a btree page being built. We have one of these
229 : * for each active tree level.
230 : */
231 : typedef struct BTPageState
232 : {
233 : BulkWriteBuffer btps_buf; /* workspace for page building */
234 : BlockNumber btps_blkno; /* block # to write this page at */
235 : IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
236 : OffsetNumber btps_lastoff; /* last item offset loaded */
237 : Size btps_lastextra; /* last item's extra posting list space */
238 : uint32 btps_level; /* tree level (0 = leaf) */
239 : Size btps_full; /* "full" if less than this much free space */
240 : struct BTPageState *btps_next; /* link to parent level, if any */
241 : } BTPageState;
242 :
243 : /*
244 : * Overall status record for index writing phase.
245 : */
246 : typedef struct BTWriteState
247 : {
248 : Relation heap;
249 : Relation index;
250 : BulkWriteState *bulkstate;
251 : BTScanInsert inskey; /* generic insertion scankey */
252 : BlockNumber btws_pages_alloced; /* # pages allocated */
253 : } BTWriteState;
254 :
255 :
256 : static double _bt_spools_heapscan(Relation heap, Relation index,
257 : BTBuildState *buildstate, IndexInfo *indexInfo);
258 : static void _bt_spooldestroy(BTSpool *btspool);
259 : static void _bt_spool(BTSpool *btspool, ItemPointer self,
260 : Datum *values, bool *isnull);
261 : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
262 : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
263 : bool *isnull, bool tupleIsAlive, void *state);
264 : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
265 : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
266 : static void _bt_slideleft(Page rightmostpage);
267 : static void _bt_sortaddtup(Page page, Size itemsize,
268 : IndexTuple itup, OffsetNumber itup_off,
269 : bool newfirstdataitem);
270 : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
271 : IndexTuple itup, Size truncextra);
272 : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
273 : BTPageState *state,
274 : BTDedupState dstate);
275 : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
276 : static void _bt_load(BTWriteState *wstate,
277 : BTSpool *btspool, BTSpool *btspool2);
278 : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
279 : int request);
280 : static void _bt_end_parallel(BTLeader *btleader);
281 : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
282 : static double _bt_parallel_heapscan(BTBuildState *buildstate,
283 : bool *brokenhotchain);
284 : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
285 : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
286 : BTShared *btshared, Sharedsort *sharedsort,
287 : Sharedsort *sharedsort2, int sortmem,
288 : bool progress);
289 :
290 :
291 : /*
292 : * btbuild() -- build a new btree index.
293 : */
294 : IndexBuildResult *
295 47764 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
296 : {
297 : IndexBuildResult *result;
298 : BTBuildState buildstate;
299 : double reltuples;
300 :
301 : #ifdef BTREE_BUILD_STATS
302 : if (log_btree_build_stats)
303 : ResetUsage();
304 : #endif /* BTREE_BUILD_STATS */
305 :
306 47764 : buildstate.isunique = indexInfo->ii_Unique;
307 47764 : buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
308 47764 : buildstate.havedead = false;
309 47764 : buildstate.heap = heap;
310 47764 : buildstate.spool = NULL;
311 47764 : buildstate.spool2 = NULL;
312 47764 : buildstate.indtuples = 0;
313 47764 : buildstate.btleader = NULL;
314 :
315 : /*
316 : * We expect to be called exactly once for any index relation. If that's
317 : * not the case, big trouble's what we have.
318 : */
319 47764 : if (RelationGetNumberOfBlocks(index) != 0)
320 0 : elog(ERROR, "index \"%s\" already contains data",
321 : RelationGetRelationName(index));
322 :
323 47764 : reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
324 :
325 : /*
326 : * Finish the build by (1) completing the sort of the spool file, (2)
327 : * inserting the sorted tuples into btree pages and (3) building the upper
328 : * levels. Finally, it may also be necessary to end use of parallelism.
329 : */
330 47752 : _bt_leafbuild(buildstate.spool, buildstate.spool2);
331 47668 : _bt_spooldestroy(buildstate.spool);
332 47668 : if (buildstate.spool2)
333 20 : _bt_spooldestroy(buildstate.spool2);
334 47668 : if (buildstate.btleader)
335 148 : _bt_end_parallel(buildstate.btleader);
336 :
337 47668 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
338 :
339 47668 : result->heap_tuples = reltuples;
340 47668 : result->index_tuples = buildstate.indtuples;
341 :
342 : #ifdef BTREE_BUILD_STATS
343 : if (log_btree_build_stats)
344 : {
345 : ShowUsage("BTREE BUILD STATS");
346 : ResetUsage();
347 : }
348 : #endif /* BTREE_BUILD_STATS */
349 :
350 47668 : return result;
351 : }
352 :
353 : /*
354 : * Create and initialize one or two spool structures, and save them in caller's
355 : * buildstate argument. May also fill-in fields within indexInfo used by index
356 : * builds.
357 : *
358 : * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
359 : * routine encapsulates all aspects of managing parallelism. Caller need only
360 : * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
361 : *
362 : * Returns the total number of heap tuples scanned.
363 : */
364 : static double
365 47764 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
366 : IndexInfo *indexInfo)
367 : {
368 47764 : BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
369 47764 : SortCoordinate coordinate = NULL;
370 47764 : double reltuples = 0;
371 :
372 : /*
373 : * We size the sort area as maintenance_work_mem rather than work_mem to
374 : * speed index creation. This should be OK since a single backend can't
375 : * run multiple index creations in parallel (see also: notes on
376 : * parallelism and maintenance_work_mem below).
377 : */
378 47764 : btspool->heap = heap;
379 47764 : btspool->index = index;
380 47764 : btspool->isunique = indexInfo->ii_Unique;
381 47764 : btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
382 :
383 : /* Save as primary spool */
384 47764 : buildstate->spool = btspool;
385 :
386 : /* Report table scan phase started */
387 47764 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
388 : PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
389 :
390 : /* Attempt to launch parallel worker scan when required */
391 47764 : if (indexInfo->ii_ParallelWorkers > 0)
392 148 : _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
393 : indexInfo->ii_ParallelWorkers);
394 :
395 : /*
396 : * If parallel build requested and at least one worker process was
397 : * successfully launched, set up coordination state
398 : */
399 47764 : if (buildstate->btleader)
400 : {
401 148 : coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
402 148 : coordinate->isWorker = false;
403 148 : coordinate->nParticipants =
404 148 : buildstate->btleader->nparticipanttuplesorts;
405 148 : coordinate->sharedsort = buildstate->btleader->sharedsort;
406 : }
407 :
408 : /*
409 : * Begin serial/leader tuplesort.
410 : *
411 : * In cases where parallelism is involved, the leader receives the same
412 : * share of maintenance_work_mem as a serial sort (it is generally treated
413 : * in the same way as a serial sort once we return). Parallel worker
414 : * Tuplesortstates will have received only a fraction of
415 : * maintenance_work_mem, though.
416 : *
417 : * We rely on the lifetime of the Leader Tuplesortstate almost not
418 : * overlapping with any worker Tuplesortstate's lifetime. There may be
419 : * some small overlap, but that's okay because we rely on leader
420 : * Tuplesortstate only allocating a small, fixed amount of memory here.
421 : * When its tuplesort_performsort() is called (by our caller), and
422 : * significant amounts of memory are likely to be used, all workers must
423 : * have already freed almost all memory held by their Tuplesortstates
424 : * (they are about to go away completely, too). The overall effect is
425 : * that maintenance_work_mem always represents an absolute high watermark
426 : * on the amount of memory used by a CREATE INDEX operation, regardless of
427 : * the use of parallelism or any other factor.
428 : */
429 95528 : buildstate->spool->sortstate =
430 47764 : tuplesort_begin_index_btree(heap, index, buildstate->isunique,
431 47764 : buildstate->nulls_not_distinct,
432 : maintenance_work_mem, coordinate,
433 : TUPLESORT_NONE);
434 :
435 : /*
436 : * If building a unique index, put dead tuples in a second spool to keep
437 : * them out of the uniqueness check. We expect that the second spool (for
438 : * dead tuples) won't get very full, so we give it only work_mem.
439 : */
440 47764 : if (indexInfo->ii_Unique)
441 : {
442 38916 : BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
443 38916 : SortCoordinate coordinate2 = NULL;
444 :
445 : /* Initialize secondary spool */
446 38916 : btspool2->heap = heap;
447 38916 : btspool2->index = index;
448 38916 : btspool2->isunique = false;
449 : /* Save as secondary spool */
450 38916 : buildstate->spool2 = btspool2;
451 :
452 38916 : if (buildstate->btleader)
453 : {
454 : /*
455 : * Set up non-private state that is passed to
456 : * tuplesort_begin_index_btree() about the basic high level
457 : * coordination of a parallel sort.
458 : */
459 64 : coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
460 64 : coordinate2->isWorker = false;
461 64 : coordinate2->nParticipants =
462 64 : buildstate->btleader->nparticipanttuplesorts;
463 64 : coordinate2->sharedsort = buildstate->btleader->sharedsort2;
464 : }
465 :
466 : /*
467 : * We expect that the second one (for dead tuples) won't get very
468 : * full, so we give it only work_mem
469 : */
470 38916 : buildstate->spool2->sortstate =
471 38916 : tuplesort_begin_index_btree(heap, index, false, false, work_mem,
472 : coordinate2, TUPLESORT_NONE);
473 : }
474 :
475 : /* Fill spool using either serial or parallel heap scan */
476 47764 : if (!buildstate->btleader)
477 47616 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
478 : _bt_build_callback, buildstate,
479 : NULL);
480 : else
481 148 : reltuples = _bt_parallel_heapscan(buildstate,
482 : &indexInfo->ii_BrokenHotChain);
483 :
484 : /*
485 : * Set the progress target for the next phase. Reset the block number
486 : * values set by table_index_build_scan
487 : */
488 : {
489 47752 : const int progress_index[] = {
490 : PROGRESS_CREATEIDX_TUPLES_TOTAL,
491 : PROGRESS_SCAN_BLOCKS_TOTAL,
492 : PROGRESS_SCAN_BLOCKS_DONE
493 : };
494 47752 : const int64 progress_vals[] = {
495 47752 : buildstate->indtuples,
496 : 0, 0
497 : };
498 :
499 47752 : pgstat_progress_update_multi_param(3, progress_index, progress_vals);
500 : }
501 :
502 : /* okay, all heap tuples are spooled */
503 47752 : if (buildstate->spool2 && !buildstate->havedead)
504 : {
505 : /* spool2 turns out to be unnecessary */
506 38896 : _bt_spooldestroy(buildstate->spool2);
507 38896 : buildstate->spool2 = NULL;
508 : }
509 :
510 47752 : return reltuples;
511 : }
512 :
513 : /*
514 : * clean up a spool structure and its substructures.
515 : */
516 : static void
517 86584 : _bt_spooldestroy(BTSpool *btspool)
518 : {
519 86584 : tuplesort_end(btspool->sortstate);
520 86584 : pfree(btspool);
521 86584 : }
522 :
523 : /*
524 : * spool an index entry into the sort file.
525 : */
526 : static void
527 12495744 : _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
528 : {
529 12495744 : tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
530 : self, values, isnull);
531 12495744 : }
532 :
533 : /*
534 : * given a spool loaded by successive calls to _bt_spool,
535 : * create an entire btree.
536 : */
537 : static void
538 47752 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
539 : {
540 : BTWriteState wstate;
541 :
542 : #ifdef BTREE_BUILD_STATS
543 : if (log_btree_build_stats)
544 : {
545 : ShowUsage("BTREE BUILD (Spool) STATISTICS");
546 : ResetUsage();
547 : }
548 : #endif /* BTREE_BUILD_STATS */
549 :
550 : /* Execute the sort */
551 47752 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
552 : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
553 47752 : tuplesort_performsort(btspool->sortstate);
554 47668 : if (btspool2)
555 : {
556 20 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
557 : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
558 20 : tuplesort_performsort(btspool2->sortstate);
559 : }
560 :
561 47668 : wstate.heap = btspool->heap;
562 47668 : wstate.index = btspool->index;
563 47668 : wstate.inskey = _bt_mkscankey(wstate.index, NULL);
564 : /* _bt_mkscankey() won't set allequalimage without metapage */
565 47668 : wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
566 :
567 : /* reserve the metapage */
568 47668 : wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
569 :
570 47668 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
571 : PROGRESS_BTREE_PHASE_LEAF_LOAD);
572 47668 : _bt_load(&wstate, btspool, btspool2);
573 47668 : }
574 :
575 : /*
576 : * Per-tuple callback for table_index_build_scan
577 : */
578 : static void
579 12495744 : _bt_build_callback(Relation index,
580 : ItemPointer tid,
581 : Datum *values,
582 : bool *isnull,
583 : bool tupleIsAlive,
584 : void *state)
585 : {
586 12495744 : BTBuildState *buildstate = (BTBuildState *) state;
587 :
588 : /*
589 : * insert the index tuple into the appropriate spool file for subsequent
590 : * processing
591 : */
592 12495744 : if (tupleIsAlive || buildstate->spool2 == NULL)
593 12495254 : _bt_spool(buildstate->spool, tid, values, isnull);
594 : else
595 : {
596 : /* dead tuples are put into spool2 */
597 490 : buildstate->havedead = true;
598 490 : _bt_spool(buildstate->spool2, tid, values, isnull);
599 : }
600 :
601 12495744 : buildstate->indtuples += 1;
602 12495744 : }
603 :
604 : /*
605 : * allocate workspace for a new, clean btree page, not linked to any siblings.
606 : */
607 : static BulkWriteBuffer
608 51174 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
609 : {
610 : BulkWriteBuffer buf;
611 : Page page;
612 : BTPageOpaque opaque;
613 :
614 51174 : buf = smgr_bulk_get_buf(wstate->bulkstate);
615 51174 : page = (Page) buf;
616 :
617 : /* Zero the page and set up standard page header info */
618 51174 : _bt_pageinit(page, BLCKSZ);
619 :
620 : /* Initialize BT opaque state */
621 51174 : opaque = BTPageGetOpaque(page);
622 51174 : opaque->btpo_prev = opaque->btpo_next = P_NONE;
623 51174 : opaque->btpo_level = level;
624 51174 : opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
625 51174 : opaque->btpo_cycleid = 0;
626 :
627 : /* Make the P_HIKEY line pointer appear allocated */
628 51174 : ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
629 :
630 51174 : return buf;
631 : }
632 :
633 : /*
634 : * emit a completed btree page, and release the working storage.
635 : */
636 : static void
637 98842 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
638 : {
639 98842 : smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
640 : /* smgr_bulk_write took ownership of 'buf' */
641 98842 : }
642 :
643 : /*
644 : * allocate and initialize a new BTPageState. the returned structure
645 : * is suitable for immediate use by _bt_buildadd.
646 : */
647 : static BTPageState *
648 12030 : _bt_pagestate(BTWriteState *wstate, uint32 level)
649 : {
650 12030 : BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
651 :
652 : /* create initial page for level */
653 12030 : state->btps_buf = _bt_blnewpage(wstate, level);
654 :
655 : /* and assign it a page position */
656 12030 : state->btps_blkno = wstate->btws_pages_alloced++;
657 :
658 12030 : state->btps_lowkey = NULL;
659 : /* initialize lastoff so first item goes into P_FIRSTKEY */
660 12030 : state->btps_lastoff = P_HIKEY;
661 12030 : state->btps_lastextra = 0;
662 12030 : state->btps_level = level;
663 : /* set "full" threshold based on level. See notes at head of file. */
664 12030 : if (level > 0)
665 2528 : state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
666 : else
667 9502 : state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
668 :
669 : /* no parent level, yet */
670 12030 : state->btps_next = NULL;
671 :
672 12030 : return state;
673 : }
674 :
675 : /*
676 : * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
677 : * P_HIKEY, overwriting P_HIKEY).
678 : *
679 : * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
680 : * rightmost page on its level is not supposed to get a high key. Now that
681 : * it's clear that this page is a rightmost page, remove the unneeded empty
682 : * P_HIKEY line pointer space.
683 : */
684 : static void
685 12030 : _bt_slideleft(Page rightmostpage)
686 : {
687 : OffsetNumber off;
688 : OffsetNumber maxoff;
689 : ItemId previi;
690 :
691 12030 : maxoff = PageGetMaxOffsetNumber(rightmostpage);
692 : Assert(maxoff >= P_FIRSTKEY);
693 12030 : previi = PageGetItemId(rightmostpage, P_HIKEY);
694 711012 : for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
695 : {
696 698982 : ItemId thisii = PageGetItemId(rightmostpage, off);
697 :
698 698982 : *previi = *thisii;
699 698982 : previi = thisii;
700 : }
701 12030 : ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
702 12030 : }
703 :
704 : /*
705 : * Add an item to a page being built.
706 : *
707 : * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
708 : * raises an error directly.
709 : *
710 : * Note that our nbtsort.c caller does not know yet if the page will be
711 : * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
712 : * caller. Page that turns out to be the rightmost on its level is fixed by
713 : * calling _bt_slideleft().
714 : */
715 : static void
716 11223860 : _bt_sortaddtup(Page page,
717 : Size itemsize,
718 : IndexTuple itup,
719 : OffsetNumber itup_off,
720 : bool newfirstdataitem)
721 : {
722 : IndexTupleData trunctuple;
723 :
724 11223860 : if (newfirstdataitem)
725 : {
726 2652 : trunctuple = *itup;
727 2652 : trunctuple.t_info = sizeof(IndexTupleData);
728 2652 : BTreeTupleSetNAtts(&trunctuple, 0, false);
729 2652 : itup = &trunctuple;
730 2652 : itemsize = sizeof(IndexTupleData);
731 : }
732 :
733 11223860 : if (PageAddItem(page, (Item) itup, itemsize, itup_off,
734 : false, false) == InvalidOffsetNumber)
735 0 : elog(ERROR, "failed to add item to the index page");
736 11223860 : }
737 :
738 : /*----------
739 : * Add an item to a disk page from the sort output (or add a posting list
740 : * item formed from the sort output).
741 : *
742 : * We must be careful to observe the page layout conventions of nbtsearch.c:
743 : * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
744 : * - on non-leaf pages, the key portion of the first item need not be
745 : * stored, we should store only the link.
746 : *
747 : * A leaf page being built looks like:
748 : *
749 : * +----------------+---------------------------------+
750 : * | PageHeaderData | linp0 linp1 linp2 ... |
751 : * +-----------+----+---------------------------------+
752 : * | ... linpN | |
753 : * +-----------+--------------------------------------+
754 : * | ^ last |
755 : * | |
756 : * +-------------+------------------------------------+
757 : * | | itemN ... |
758 : * +-------------+------------------+-----------------+
759 : * | ... item3 item2 item1 | "special space" |
760 : * +--------------------------------+-----------------+
761 : *
762 : * Contrast this with the diagram in bufpage.h; note the mismatch
763 : * between linps and items. This is because we reserve linp0 as a
764 : * placeholder for the pointer to the "high key" item; when we have
765 : * filled up the page, we will set linp0 to point to itemN and clear
766 : * linpN. On the other hand, if we find this is the last (rightmost)
767 : * page, we leave the items alone and slide the linp array over. If
768 : * the high key is to be truncated, offset 1 is deleted, and we insert
769 : * the truncated high key at offset 1.
770 : *
771 : * 'last' pointer indicates the last offset added to the page.
772 : *
773 : * 'truncextra' is the size of the posting list in itup, if any. This
774 : * information is stashed for the next call here, when we may benefit
775 : * from considering the impact of truncating away the posting list on
776 : * the page before deciding to finish the page off. Posting lists are
777 : * often relatively large, so it is worth going to the trouble of
778 : * accounting for the saving from truncating away the posting list of
779 : * the tuple that becomes the high key (that may be the only way to
780 : * get close to target free space on the page). Note that this is
781 : * only used for the soft fillfactor-wise limit, not the critical hard
782 : * limit.
783 : *----------
784 : */
785 : static void
786 11184716 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
787 : Size truncextra)
788 : {
789 : BulkWriteBuffer nbuf;
790 : Page npage;
791 : BlockNumber nblkno;
792 : OffsetNumber last_off;
793 : Size last_truncextra;
794 : Size pgspc;
795 : Size itupsz;
796 : bool isleaf;
797 :
798 : /*
799 : * This is a handy place to check for cancel interrupts during the btree
800 : * load phase of index creation.
801 : */
802 11184716 : CHECK_FOR_INTERRUPTS();
803 :
804 11184716 : nbuf = state->btps_buf;
805 11184716 : npage = (Page) nbuf;
806 11184716 : nblkno = state->btps_blkno;
807 11184716 : last_off = state->btps_lastoff;
808 11184716 : last_truncextra = state->btps_lastextra;
809 11184716 : state->btps_lastextra = truncextra;
810 :
811 11184716 : pgspc = PageGetFreeSpace(npage);
812 11184716 : itupsz = IndexTupleSize(itup);
813 11184716 : itupsz = MAXALIGN(itupsz);
814 : /* Leaf case has slightly different rules due to suffix truncation */
815 11184716 : isleaf = (state->btps_level == 0);
816 :
817 : /*
818 : * Check whether the new item can fit on a btree page on current level at
819 : * all.
820 : *
821 : * Every newly built index will treat heap TID as part of the keyspace,
822 : * which imposes the requirement that new high keys must occasionally have
823 : * a heap TID appended within _bt_truncate(). That may leave a new pivot
824 : * tuple one or two MAXALIGN() quantums larger than the original
825 : * firstright tuple it's derived from. v4 deals with the problem by
826 : * decreasing the limit on the size of tuples inserted on the leaf level
827 : * by the same small amount. Enforce the new v4+ limit on the leaf level,
828 : * and the old limit on internal levels, since pivot tuples may need to
829 : * make use of the reserved space. This should never fail on internal
830 : * pages.
831 : */
832 11184716 : if (unlikely(itupsz > BTMaxItemSize(npage)))
833 264 : _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
834 : itup);
835 :
836 : /*
837 : * Check to see if current page will fit new item, with space left over to
838 : * append a heap TID during suffix truncation when page is a leaf page.
839 : *
840 : * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
841 : * high key with heap TID when finishing off a leaf page, since we rely on
842 : * _bt_check_third_page() rejecting oversized non-pivot tuples. On
843 : * internal pages we can always fit 3 pivot tuples with larger internal
844 : * page tuple limit (includes page high key).
845 : *
846 : * Most of the time, a page is only "full" in the sense that the soft
847 : * fillfactor-wise limit has been exceeded. However, we must always leave
848 : * at least two items plus a high key on each page before starting a new
849 : * page. Disregard fillfactor and insert on "full" current page if we
850 : * don't have the minimum number of items yet. (Note that we deliberately
851 : * assume that suffix truncation neither enlarges nor shrinks new high key
852 : * when applying soft limit, except when last tuple has a posting list.)
853 : */
854 : Assert(last_truncextra == 0 || isleaf);
855 11184716 : if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
856 11183562 : (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
857 : {
858 : /*
859 : * Finish off the page and write it out.
860 : */
861 39144 : BulkWriteBuffer obuf = nbuf;
862 39144 : Page opage = npage;
863 39144 : BlockNumber oblkno = nblkno;
864 : ItemId ii;
865 : ItemId hii;
866 : IndexTuple oitup;
867 :
868 : /* Create new page of same level */
869 39144 : nbuf = _bt_blnewpage(wstate, state->btps_level);
870 39144 : npage = (Page) nbuf;
871 :
872 : /* and assign it a page position */
873 39144 : nblkno = wstate->btws_pages_alloced++;
874 :
875 : /*
876 : * We copy the last item on the page into the new page, and then
877 : * rearrange the old page so that the 'last item' becomes its high key
878 : * rather than a true data item. There had better be at least two
879 : * items on the page already, else the page would be empty of useful
880 : * data.
881 : */
882 : Assert(last_off > P_FIRSTKEY);
883 39144 : ii = PageGetItemId(opage, last_off);
884 39144 : oitup = (IndexTuple) PageGetItem(opage, ii);
885 39144 : _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
886 39144 : !isleaf);
887 :
888 : /*
889 : * Move 'last' into the high key position on opage. _bt_blnewpage()
890 : * allocated empty space for a line pointer when opage was first
891 : * created, so this is a matter of rearranging already-allocated space
892 : * on page, and initializing high key line pointer. (Actually, leaf
893 : * pages must also swap oitup with a truncated version of oitup, which
894 : * is sometimes larger than oitup, though never by more than the space
895 : * needed to append a heap TID.)
896 : */
897 39144 : hii = PageGetItemId(opage, P_HIKEY);
898 39144 : *hii = *ii;
899 39144 : ItemIdSetUnused(ii); /* redundant */
900 39144 : ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
901 :
902 39144 : if (isleaf)
903 : {
904 : IndexTuple lastleft;
905 : IndexTuple truncated;
906 :
907 : /*
908 : * Truncate away any unneeded attributes from high key on leaf
909 : * level. This is only done at the leaf level because downlinks
910 : * in internal pages are either negative infinity items, or get
911 : * their contents from copying from one level down. See also:
912 : * _bt_split().
913 : *
914 : * We don't try to bias our choice of split point to make it more
915 : * likely that _bt_truncate() can truncate away more attributes,
916 : * whereas the split point used within _bt_split() is chosen much
917 : * more delicately. Even still, the lastleft and firstright
918 : * tuples passed to _bt_truncate() here are at least not fully
919 : * equal to each other when deduplication is used, unless there is
920 : * a large group of duplicates (also, unique index builds usually
921 : * have few or no spool2 duplicates). When the split point is
922 : * between two unequal tuples, _bt_truncate() will avoid including
923 : * a heap TID in the new high key, which is the most important
924 : * benefit of suffix truncation.
925 : *
926 : * Overwrite the old item with new truncated high key directly.
927 : * oitup is already located at the physical beginning of tuple
928 : * space, so this should directly reuse the existing tuple space.
929 : */
930 39020 : ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
931 39020 : lastleft = (IndexTuple) PageGetItem(opage, ii);
932 :
933 : Assert(IndexTupleSize(oitup) > last_truncextra);
934 39020 : truncated = _bt_truncate(wstate->index, lastleft, oitup,
935 : wstate->inskey);
936 39020 : if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
937 39020 : IndexTupleSize(truncated)))
938 0 : elog(ERROR, "failed to add high key to the index page");
939 39020 : pfree(truncated);
940 :
941 : /* oitup should continue to point to the page's high key */
942 39020 : hii = PageGetItemId(opage, P_HIKEY);
943 39020 : oitup = (IndexTuple) PageGetItem(opage, hii);
944 : }
945 :
946 : /*
947 : * Link the old page into its parent, using its low key. If we don't
948 : * have a parent, we have to create one; this adds a new btree level.
949 : */
950 39144 : if (state->btps_next == NULL)
951 2528 : state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
952 :
953 : Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
954 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
955 : BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
956 : P_LEFTMOST(BTPageGetOpaque(opage)));
957 : Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
958 : !P_LEFTMOST(BTPageGetOpaque(opage)));
959 39144 : BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
960 39144 : _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
961 39144 : pfree(state->btps_lowkey);
962 :
963 : /*
964 : * Save a copy of the high key from the old page. It is also the low
965 : * key for the new page.
966 : */
967 39144 : state->btps_lowkey = CopyIndexTuple(oitup);
968 :
969 : /*
970 : * Set the sibling links for both pages.
971 : */
972 : {
973 39144 : BTPageOpaque oopaque = BTPageGetOpaque(opage);
974 39144 : BTPageOpaque nopaque = BTPageGetOpaque(npage);
975 :
976 39144 : oopaque->btpo_next = nblkno;
977 39144 : nopaque->btpo_prev = oblkno;
978 39144 : nopaque->btpo_next = P_NONE; /* redundant */
979 : }
980 :
981 : /*
982 : * Write out the old page. _bt_blwritepage takes ownership of the
983 : * 'opage' buffer.
984 : */
985 39144 : _bt_blwritepage(wstate, obuf, oblkno);
986 :
987 : /*
988 : * Reset last_off to point to new page
989 : */
990 39144 : last_off = P_FIRSTKEY;
991 : }
992 :
993 : /*
994 : * By here, either original page is still the current page, or a new page
995 : * was created that became the current page. Either way, the current page
996 : * definitely has space for new item.
997 : *
998 : * If the new item is the first for its page, it must also be the first
999 : * item on its entire level. On later same-level pages, a low key for a
1000 : * page will be copied from the prior page in the code above. Generate a
1001 : * minus infinity low key here instead.
1002 : */
1003 11184716 : if (last_off == P_HIKEY)
1004 : {
1005 : Assert(state->btps_lowkey == NULL);
1006 12030 : state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1007 12030 : state->btps_lowkey->t_info = sizeof(IndexTupleData);
1008 12030 : BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1009 : }
1010 :
1011 : /*
1012 : * Add the new item into the current page.
1013 : */
1014 11184716 : last_off = OffsetNumberNext(last_off);
1015 11184716 : _bt_sortaddtup(npage, itupsz, itup, last_off,
1016 11184716 : !isleaf && last_off == P_FIRSTKEY);
1017 :
1018 11184716 : state->btps_buf = nbuf;
1019 11184716 : state->btps_blkno = nblkno;
1020 11184716 : state->btps_lastoff = last_off;
1021 11184716 : }
1022 :
1023 : /*
1024 : * Finalize pending posting list tuple, and add it to the index. Final tuple
1025 : * is based on saved base tuple, and saved list of heap TIDs.
1026 : *
1027 : * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1028 : * using _bt_buildadd().
1029 : */
1030 : static void
1031 4712644 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
1032 : BTDedupState dstate)
1033 : {
1034 : Assert(dstate->nitems > 0);
1035 :
1036 4712644 : if (dstate->nitems == 1)
1037 4673742 : _bt_buildadd(wstate, state, dstate->base, 0);
1038 : else
1039 : {
1040 : IndexTuple postingtuple;
1041 : Size truncextra;
1042 :
1043 : /* form a tuple with a posting list */
1044 38902 : postingtuple = _bt_form_posting(dstate->base,
1045 : dstate->htids,
1046 : dstate->nhtids);
1047 : /* Calculate posting list overhead */
1048 77804 : truncextra = IndexTupleSize(postingtuple) -
1049 38902 : BTreeTupleGetPostingOffset(postingtuple);
1050 :
1051 38902 : _bt_buildadd(wstate, state, postingtuple, truncextra);
1052 38902 : pfree(postingtuple);
1053 : }
1054 :
1055 4712644 : dstate->nmaxitems = 0;
1056 4712644 : dstate->nhtids = 0;
1057 4712644 : dstate->nitems = 0;
1058 4712644 : dstate->phystupsize = 0;
1059 4712644 : }
1060 :
1061 : /*
1062 : * Finish writing out the completed btree.
1063 : */
1064 : static void
1065 47668 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1066 : {
1067 : BTPageState *s;
1068 47668 : BlockNumber rootblkno = P_NONE;
1069 47668 : uint32 rootlevel = 0;
1070 : BulkWriteBuffer metabuf;
1071 :
1072 : /*
1073 : * Each iteration of this loop completes one more level of the tree.
1074 : */
1075 59698 : for (s = state; s != NULL; s = s->btps_next)
1076 : {
1077 : BlockNumber blkno;
1078 : BTPageOpaque opaque;
1079 :
1080 12030 : blkno = s->btps_blkno;
1081 12030 : opaque = BTPageGetOpaque((Page) s->btps_buf);
1082 :
1083 : /*
1084 : * We have to link the last page on this level to somewhere.
1085 : *
1086 : * If we're at the top, it's the root, so attach it to the metapage.
1087 : * Otherwise, add an entry for it to its parent using its low key.
1088 : * This may cause the last page of the parent level to split, but
1089 : * that's not a problem -- we haven't gotten to it yet.
1090 : */
1091 12030 : if (s->btps_next == NULL)
1092 : {
1093 9502 : opaque->btpo_flags |= BTP_ROOT;
1094 9502 : rootblkno = blkno;
1095 9502 : rootlevel = s->btps_level;
1096 : }
1097 : else
1098 : {
1099 : Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1100 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1101 : BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1102 : P_LEFTMOST(opaque));
1103 : Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1104 : !P_LEFTMOST(opaque));
1105 2528 : BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1106 2528 : _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1107 2528 : pfree(s->btps_lowkey);
1108 2528 : s->btps_lowkey = NULL;
1109 : }
1110 :
1111 : /*
1112 : * This is the rightmost page, so the ItemId array needs to be slid
1113 : * back one slot. Then we can dump out the page.
1114 : */
1115 12030 : _bt_slideleft((Page) s->btps_buf);
1116 12030 : _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
1117 12030 : s->btps_buf = NULL; /* writepage took ownership of the buffer */
1118 : }
1119 :
1120 : /*
1121 : * As the last step in the process, construct the metapage and make it
1122 : * point to the new root (unless we had no data at all, in which case it's
1123 : * set to point to "P_NONE"). This changes the index to the "valid" state
1124 : * by filling in a valid magic number in the metapage.
1125 : */
1126 47668 : metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1127 47668 : _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
1128 47668 : wstate->inskey->allequalimage);
1129 47668 : _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
1130 47668 : }
1131 :
1132 : /*
1133 : * Read tuples in correct sort order from tuplesort, and load them into
1134 : * btree leaves.
1135 : */
1136 : static void
1137 47668 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1138 : {
1139 47668 : BTPageState *state = NULL;
1140 47668 : bool merge = (btspool2 != NULL);
1141 : IndexTuple itup,
1142 47668 : itup2 = NULL;
1143 : bool load1;
1144 47668 : TupleDesc tupdes = RelationGetDescr(wstate->index);
1145 : int i,
1146 47668 : keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1147 : SortSupport sortKeys;
1148 47668 : int64 tuples_done = 0;
1149 : bool deduplicate;
1150 :
1151 47668 : wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
1152 :
1153 56254 : deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1154 8586 : BTGetDeduplicateItems(wstate->index);
1155 :
1156 47668 : if (merge)
1157 : {
1158 : /*
1159 : * Another BTSpool for dead tuples exists. Now we have to merge
1160 : * btspool and btspool2.
1161 : */
1162 :
1163 : /* the preparation of merge */
1164 20 : itup = tuplesort_getindextuple(btspool->sortstate, true);
1165 20 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1166 :
1167 : /* Prepare SortSupport data for each column */
1168 20 : sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1169 :
1170 42 : for (i = 0; i < keysz; i++)
1171 : {
1172 22 : SortSupport sortKey = sortKeys + i;
1173 22 : ScanKey scanKey = wstate->inskey->scankeys + i;
1174 : int16 strategy;
1175 :
1176 22 : sortKey->ssup_cxt = CurrentMemoryContext;
1177 22 : sortKey->ssup_collation = scanKey->sk_collation;
1178 22 : sortKey->ssup_nulls_first =
1179 22 : (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1180 22 : sortKey->ssup_attno = scanKey->sk_attno;
1181 : /* Abbreviation is not supported here */
1182 22 : sortKey->abbreviate = false;
1183 :
1184 : Assert(sortKey->ssup_attno != 0);
1185 :
1186 22 : strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1187 : BTGreaterStrategyNumber : BTLessStrategyNumber;
1188 :
1189 22 : PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1190 : }
1191 :
1192 : for (;;)
1193 : {
1194 3298 : load1 = true; /* load BTSpool next ? */
1195 3298 : if (itup2 == NULL)
1196 : {
1197 154 : if (itup == NULL)
1198 20 : break;
1199 : }
1200 3144 : else if (itup != NULL)
1201 : {
1202 2952 : int32 compare = 0;
1203 :
1204 3188 : for (i = 1; i <= keysz; i++)
1205 : {
1206 : SortSupport entry;
1207 : Datum attrDatum1,
1208 : attrDatum2;
1209 : bool isNull1,
1210 : isNull2;
1211 :
1212 3004 : entry = sortKeys + i - 1;
1213 3004 : attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1214 3004 : attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1215 :
1216 3004 : compare = ApplySortComparator(attrDatum1, isNull1,
1217 : attrDatum2, isNull2,
1218 : entry);
1219 3004 : if (compare > 0)
1220 : {
1221 276 : load1 = false;
1222 2768 : break;
1223 : }
1224 2728 : else if (compare < 0)
1225 2492 : break;
1226 : }
1227 :
1228 : /*
1229 : * If key values are equal, we sort on ItemPointer. This is
1230 : * required for btree indexes, since heap TID is treated as an
1231 : * implicit last key attribute in order to ensure that all
1232 : * keys in the index are physically unique.
1233 : */
1234 2952 : if (compare == 0)
1235 : {
1236 184 : compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1237 : Assert(compare != 0);
1238 184 : if (compare > 0)
1239 22 : load1 = false;
1240 : }
1241 : }
1242 : else
1243 192 : load1 = false;
1244 :
1245 : /* When we see first tuple, create first index page */
1246 3278 : if (state == NULL)
1247 20 : state = _bt_pagestate(wstate, 0);
1248 :
1249 3278 : if (load1)
1250 : {
1251 2788 : _bt_buildadd(wstate, state, itup, 0);
1252 2788 : itup = tuplesort_getindextuple(btspool->sortstate, true);
1253 : }
1254 : else
1255 : {
1256 490 : _bt_buildadd(wstate, state, itup2, 0);
1257 490 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1258 : }
1259 :
1260 : /* Report progress */
1261 3278 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1262 : ++tuples_done);
1263 : }
1264 20 : pfree(sortKeys);
1265 : }
1266 47648 : else if (deduplicate)
1267 : {
1268 : /* merge is unnecessary, deduplicate into posting lists */
1269 : BTDedupState dstate;
1270 :
1271 8586 : dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1272 8586 : dstate->deduplicate = true; /* unused */
1273 8586 : dstate->nmaxitems = 0; /* unused */
1274 8586 : dstate->maxpostingsize = 0; /* set later */
1275 : /* Metadata about base tuple of current pending posting list */
1276 8586 : dstate->base = NULL;
1277 8586 : dstate->baseoff = InvalidOffsetNumber; /* unused */
1278 8586 : dstate->basetupsize = 0;
1279 : /* Metadata about current pending posting list TIDs */
1280 8586 : dstate->htids = NULL;
1281 8586 : dstate->nhtids = 0;
1282 8586 : dstate->nitems = 0;
1283 8586 : dstate->phystupsize = 0; /* unused */
1284 8586 : dstate->nintervals = 0; /* unused */
1285 :
1286 6073552 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
1287 : true)) != NULL)
1288 : {
1289 : /* When we see first tuple, create first index page */
1290 6064966 : if (state == NULL)
1291 : {
1292 2332 : state = _bt_pagestate(wstate, 0);
1293 :
1294 : /*
1295 : * Limit size of posting list tuples to 1/10 space we want to
1296 : * leave behind on the page, plus space for final item's line
1297 : * pointer. This is equal to the space that we'd like to
1298 : * leave behind on each leaf page when fillfactor is 90,
1299 : * allowing us to get close to fillfactor% space utilization
1300 : * when there happen to be a great many duplicates. (This
1301 : * makes higher leaf fillfactor settings ineffective when
1302 : * building indexes that have many duplicates, but packing
1303 : * leaf pages full with few very large tuples doesn't seem
1304 : * like a useful goal.)
1305 : */
1306 2332 : dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1307 : sizeof(ItemIdData);
1308 : Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) &&
1309 : dstate->maxpostingsize <= INDEX_SIZE_MASK);
1310 2332 : dstate->htids = palloc(dstate->maxpostingsize);
1311 :
1312 : /* start new pending posting list with itup copy */
1313 2332 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1314 : InvalidOffsetNumber);
1315 : }
1316 6062634 : else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1317 1360238 : itup) > keysz &&
1318 1360238 : _bt_dedup_save_htid(dstate, itup))
1319 : {
1320 : /*
1321 : * Tuple is equal to base tuple of pending posting list. Heap
1322 : * TID from itup has been saved in state.
1323 : */
1324 : }
1325 : else
1326 : {
1327 : /*
1328 : * Tuple is not equal to pending posting list tuple, or
1329 : * _bt_dedup_save_htid() opted to not merge current item into
1330 : * pending posting list.
1331 : */
1332 4710312 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1333 4710312 : pfree(dstate->base);
1334 :
1335 : /* start new pending posting list with itup copy */
1336 4710312 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1337 : InvalidOffsetNumber);
1338 : }
1339 :
1340 : /* Report progress */
1341 6064966 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1342 : ++tuples_done);
1343 : }
1344 :
1345 8586 : if (state)
1346 : {
1347 : /*
1348 : * Handle the last item (there must be a last item when the
1349 : * tuplesort returned one or more tuples)
1350 : */
1351 2332 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1352 2332 : pfree(dstate->base);
1353 2332 : pfree(dstate->htids);
1354 : }
1355 :
1356 8586 : pfree(dstate);
1357 : }
1358 : else
1359 : {
1360 : /* merging and deduplication are both unnecessary */
1361 6466184 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
1362 : true)) != NULL)
1363 : {
1364 : /* When we see first tuple, create first index page */
1365 6427122 : if (state == NULL)
1366 7150 : state = _bt_pagestate(wstate, 0);
1367 :
1368 6427122 : _bt_buildadd(wstate, state, itup, 0);
1369 :
1370 : /* Report progress */
1371 6427122 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1372 : ++tuples_done);
1373 : }
1374 : }
1375 :
1376 : /* Close down final pages and write the metapage */
1377 47668 : _bt_uppershutdown(wstate, state);
1378 47668 : smgr_bulk_finish(wstate->bulkstate);
1379 47668 : }
1380 :
1381 : /*
1382 : * Create parallel context, and launch workers for leader.
1383 : *
1384 : * buildstate argument should be initialized (with the exception of the
1385 : * tuplesort state in spools, which may later be created based on shared
1386 : * state initially set up here).
1387 : *
1388 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1389 : *
1390 : * request is the target number of parallel worker processes to launch.
1391 : *
1392 : * Sets buildstate's BTLeader, which caller must use to shut down parallel
1393 : * mode by passing it to _bt_end_parallel() at the very end of its index
1394 : * build. If not even a single worker process can be launched, this is
1395 : * never set, and caller should proceed with a serial index build.
1396 : */
1397 : static void
1398 148 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1399 : {
1400 : ParallelContext *pcxt;
1401 : int scantuplesortstates;
1402 : Snapshot snapshot;
1403 : Size estbtshared;
1404 : Size estsort;
1405 : BTShared *btshared;
1406 : Sharedsort *sharedsort;
1407 : Sharedsort *sharedsort2;
1408 148 : BTSpool *btspool = buildstate->spool;
1409 148 : BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1410 : WalUsage *walusage;
1411 : BufferUsage *bufferusage;
1412 148 : bool leaderparticipates = true;
1413 : int querylen;
1414 :
1415 : #ifdef DISABLE_LEADER_PARTICIPATION
1416 : leaderparticipates = false;
1417 : #endif
1418 :
1419 : /*
1420 : * Enter parallel mode, and create context for parallel build of btree
1421 : * index
1422 : */
1423 148 : EnterParallelMode();
1424 : Assert(request > 0);
1425 148 : pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1426 : request);
1427 :
1428 148 : scantuplesortstates = leaderparticipates ? request + 1 : request;
1429 :
1430 : /*
1431 : * Prepare for scan of the base relation. In a normal index build, we use
1432 : * SnapshotAny because we must retrieve all tuples and do our own time
1433 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1434 : * concurrent build, we take a regular MVCC snapshot and index whatever's
1435 : * live according to that.
1436 : */
1437 148 : if (!isconcurrent)
1438 148 : snapshot = SnapshotAny;
1439 : else
1440 0 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
1441 :
1442 : /*
1443 : * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1444 : * PARALLEL_KEY_TUPLESORT tuplesort workspace
1445 : */
1446 148 : estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1447 148 : shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1448 148 : estsort = tuplesort_estimate_shared(scantuplesortstates);
1449 148 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1450 :
1451 : /*
1452 : * Unique case requires a second spool, and so we may have to account for
1453 : * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1454 : */
1455 148 : if (!btspool->isunique)
1456 84 : shm_toc_estimate_keys(&pcxt->estimator, 2);
1457 : else
1458 : {
1459 64 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1460 64 : shm_toc_estimate_keys(&pcxt->estimator, 3);
1461 : }
1462 :
1463 : /*
1464 : * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1465 : * and PARALLEL_KEY_BUFFER_USAGE.
1466 : *
1467 : * If there are no extensions loaded that care, we could skip this. We
1468 : * have no way of knowing whether anyone's looking at pgWalUsage or
1469 : * pgBufferUsage, so do it unconditionally.
1470 : */
1471 148 : shm_toc_estimate_chunk(&pcxt->estimator,
1472 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1473 148 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1474 148 : shm_toc_estimate_chunk(&pcxt->estimator,
1475 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1476 148 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1477 :
1478 : /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1479 148 : if (debug_query_string)
1480 : {
1481 148 : querylen = strlen(debug_query_string);
1482 148 : shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1483 148 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1484 : }
1485 : else
1486 0 : querylen = 0; /* keep compiler quiet */
1487 :
1488 : /* Everyone's had a chance to ask for space, so now create the DSM */
1489 148 : InitializeParallelDSM(pcxt);
1490 :
1491 : /* If no DSM segment was available, back out (do serial build) */
1492 148 : if (pcxt->seg == NULL)
1493 : {
1494 0 : if (IsMVCCSnapshot(snapshot))
1495 0 : UnregisterSnapshot(snapshot);
1496 0 : DestroyParallelContext(pcxt);
1497 0 : ExitParallelMode();
1498 0 : return;
1499 : }
1500 :
1501 : /* Store shared build state, for which we reserved space */
1502 148 : btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1503 : /* Initialize immutable state */
1504 148 : btshared->heaprelid = RelationGetRelid(btspool->heap);
1505 148 : btshared->indexrelid = RelationGetRelid(btspool->index);
1506 148 : btshared->isunique = btspool->isunique;
1507 148 : btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1508 148 : btshared->isconcurrent = isconcurrent;
1509 148 : btshared->scantuplesortstates = scantuplesortstates;
1510 148 : btshared->queryid = pgstat_get_my_query_id();
1511 148 : ConditionVariableInit(&btshared->workersdonecv);
1512 148 : SpinLockInit(&btshared->mutex);
1513 : /* Initialize mutable state */
1514 148 : btshared->nparticipantsdone = 0;
1515 148 : btshared->reltuples = 0.0;
1516 148 : btshared->havedead = false;
1517 148 : btshared->indtuples = 0.0;
1518 148 : btshared->brokenhotchain = false;
1519 148 : table_parallelscan_initialize(btspool->heap,
1520 : ParallelTableScanFromBTShared(btshared),
1521 : snapshot);
1522 :
1523 : /*
1524 : * Store shared tuplesort-private state, for which we reserved space.
1525 : * Then, initialize opaque state using tuplesort routine.
1526 : */
1527 148 : sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1528 148 : tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1529 : pcxt->seg);
1530 :
1531 148 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1532 148 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1533 :
1534 : /* Unique case requires a second spool, and associated shared state */
1535 148 : if (!btspool->isunique)
1536 84 : sharedsort2 = NULL;
1537 : else
1538 : {
1539 : /*
1540 : * Store additional shared tuplesort-private state, for which we
1541 : * reserved space. Then, initialize opaque state using tuplesort
1542 : * routine.
1543 : */
1544 64 : sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1545 64 : tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1546 : pcxt->seg);
1547 :
1548 64 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1549 : }
1550 :
1551 : /* Store query string for workers */
1552 148 : if (debug_query_string)
1553 : {
1554 : char *sharedquery;
1555 :
1556 148 : sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1557 148 : memcpy(sharedquery, debug_query_string, querylen + 1);
1558 148 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1559 : }
1560 :
1561 : /*
1562 : * Allocate space for each worker's WalUsage and BufferUsage; no need to
1563 : * initialize.
1564 : */
1565 148 : walusage = shm_toc_allocate(pcxt->toc,
1566 148 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1567 148 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1568 148 : bufferusage = shm_toc_allocate(pcxt->toc,
1569 148 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1570 148 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1571 :
1572 : /* Launch workers, saving status for leader/caller */
1573 148 : LaunchParallelWorkers(pcxt);
1574 148 : btleader->pcxt = pcxt;
1575 148 : btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1576 148 : if (leaderparticipates)
1577 148 : btleader->nparticipanttuplesorts++;
1578 148 : btleader->btshared = btshared;
1579 148 : btleader->sharedsort = sharedsort;
1580 148 : btleader->sharedsort2 = sharedsort2;
1581 148 : btleader->snapshot = snapshot;
1582 148 : btleader->walusage = walusage;
1583 148 : btleader->bufferusage = bufferusage;
1584 :
1585 : /* If no workers were successfully launched, back out (do serial build) */
1586 148 : if (pcxt->nworkers_launched == 0)
1587 : {
1588 0 : _bt_end_parallel(btleader);
1589 0 : return;
1590 : }
1591 :
1592 : /* Save leader state now that it's clear build will be parallel */
1593 148 : buildstate->btleader = btleader;
1594 :
1595 : /* Join heap scan ourselves */
1596 148 : if (leaderparticipates)
1597 148 : _bt_leader_participate_as_worker(buildstate);
1598 :
1599 : /*
1600 : * Caller needs to wait for all launched workers when we return. Make
1601 : * sure that the failure-to-start case will not hang forever.
1602 : */
1603 148 : WaitForParallelWorkersToAttach(pcxt);
1604 : }
1605 :
1606 : /*
1607 : * Shut down workers, destroy parallel context, and end parallel mode.
1608 : */
1609 : static void
1610 148 : _bt_end_parallel(BTLeader *btleader)
1611 : {
1612 : int i;
1613 :
1614 : /* Shutdown worker processes */
1615 148 : WaitForParallelWorkersToFinish(btleader->pcxt);
1616 :
1617 : /*
1618 : * Next, accumulate WAL usage. (This must wait for the workers to finish,
1619 : * or we might get incomplete data.)
1620 : */
1621 296 : for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1622 148 : InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1623 :
1624 : /* Free last reference to MVCC snapshot, if one was used */
1625 148 : if (IsMVCCSnapshot(btleader->snapshot))
1626 0 : UnregisterSnapshot(btleader->snapshot);
1627 148 : DestroyParallelContext(btleader->pcxt);
1628 148 : ExitParallelMode();
1629 148 : }
1630 :
1631 : /*
1632 : * Returns size of shared memory required to store state for a parallel
1633 : * btree index build based on the snapshot its parallel scan will use.
1634 : */
1635 : static Size
1636 148 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1637 : {
1638 : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1639 148 : return add_size(BUFFERALIGN(sizeof(BTShared)),
1640 : table_parallelscan_estimate(heap, snapshot));
1641 : }
1642 :
1643 : /*
1644 : * Within leader, wait for end of heap scan.
1645 : *
1646 : * When called, parallel heap scan started by _bt_begin_parallel() will
1647 : * already be underway within worker processes (when leader participates
1648 : * as a worker, we should end up here just as workers are finishing).
1649 : *
1650 : * Fills in fields needed for ambuild statistics, and lets caller set
1651 : * field indicating that some worker encountered a broken HOT chain.
1652 : *
1653 : * Returns the total number of heap tuples scanned.
1654 : */
1655 : static double
1656 148 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1657 : {
1658 148 : BTShared *btshared = buildstate->btleader->btshared;
1659 : int nparticipanttuplesorts;
1660 : double reltuples;
1661 :
1662 148 : nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1663 : for (;;)
1664 : {
1665 396 : SpinLockAcquire(&btshared->mutex);
1666 396 : if (btshared->nparticipantsdone == nparticipanttuplesorts)
1667 : {
1668 148 : buildstate->havedead = btshared->havedead;
1669 148 : buildstate->indtuples = btshared->indtuples;
1670 148 : *brokenhotchain = btshared->brokenhotchain;
1671 148 : reltuples = btshared->reltuples;
1672 148 : SpinLockRelease(&btshared->mutex);
1673 148 : break;
1674 : }
1675 248 : SpinLockRelease(&btshared->mutex);
1676 :
1677 248 : ConditionVariableSleep(&btshared->workersdonecv,
1678 : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1679 : }
1680 :
1681 148 : ConditionVariableCancelSleep();
1682 :
1683 148 : return reltuples;
1684 : }
1685 :
1686 : /*
1687 : * Within leader, participate as a parallel worker.
1688 : */
1689 : static void
1690 148 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
1691 : {
1692 148 : BTLeader *btleader = buildstate->btleader;
1693 : BTSpool *leaderworker;
1694 : BTSpool *leaderworker2;
1695 : int sortmem;
1696 :
1697 : /* Allocate memory and initialize private spool */
1698 148 : leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1699 148 : leaderworker->heap = buildstate->spool->heap;
1700 148 : leaderworker->index = buildstate->spool->index;
1701 148 : leaderworker->isunique = buildstate->spool->isunique;
1702 148 : leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1703 :
1704 : /* Initialize second spool, if required */
1705 148 : if (!btleader->btshared->isunique)
1706 84 : leaderworker2 = NULL;
1707 : else
1708 : {
1709 : /* Allocate memory for worker's own private secondary spool */
1710 64 : leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1711 :
1712 : /* Initialize worker's own secondary spool */
1713 64 : leaderworker2->heap = leaderworker->heap;
1714 64 : leaderworker2->index = leaderworker->index;
1715 64 : leaderworker2->isunique = false;
1716 : }
1717 :
1718 : /*
1719 : * Might as well use reliable figure when doling out maintenance_work_mem
1720 : * (when requested number of workers were not launched, this will be
1721 : * somewhat higher than it is for other workers).
1722 : */
1723 148 : sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1724 :
1725 : /* Perform work common to all participants */
1726 148 : _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1727 : btleader->sharedsort, btleader->sharedsort2,
1728 : sortmem, true);
1729 :
1730 : #ifdef BTREE_BUILD_STATS
1731 : if (log_btree_build_stats)
1732 : {
1733 : ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1734 : ResetUsage();
1735 : }
1736 : #endif /* BTREE_BUILD_STATS */
1737 148 : }
1738 :
1739 : /*
1740 : * Perform work within a launched parallel process.
1741 : */
1742 : void
1743 148 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1744 : {
1745 : char *sharedquery;
1746 : BTSpool *btspool;
1747 : BTSpool *btspool2;
1748 : BTShared *btshared;
1749 : Sharedsort *sharedsort;
1750 : Sharedsort *sharedsort2;
1751 : Relation heapRel;
1752 : Relation indexRel;
1753 : LOCKMODE heapLockmode;
1754 : LOCKMODE indexLockmode;
1755 : WalUsage *walusage;
1756 : BufferUsage *bufferusage;
1757 : int sortmem;
1758 :
1759 : #ifdef BTREE_BUILD_STATS
1760 : if (log_btree_build_stats)
1761 : ResetUsage();
1762 : #endif /* BTREE_BUILD_STATS */
1763 :
1764 : /*
1765 : * The only possible status flag that can be set to the parallel worker is
1766 : * PROC_IN_SAFE_IC.
1767 : */
1768 : Assert((MyProc->statusFlags == 0) ||
1769 : (MyProc->statusFlags == PROC_IN_SAFE_IC));
1770 :
1771 : /* Set debug_query_string for individual workers first */
1772 148 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1773 148 : debug_query_string = sharedquery;
1774 :
1775 : /* Report the query string from leader */
1776 148 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
1777 :
1778 : /* Look up nbtree shared state */
1779 148 : btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1780 :
1781 : /* Open relations using lock modes known to be obtained by index.c */
1782 148 : if (!btshared->isconcurrent)
1783 : {
1784 148 : heapLockmode = ShareLock;
1785 148 : indexLockmode = AccessExclusiveLock;
1786 : }
1787 : else
1788 : {
1789 0 : heapLockmode = ShareUpdateExclusiveLock;
1790 0 : indexLockmode = RowExclusiveLock;
1791 : }
1792 :
1793 : /* Track query ID */
1794 148 : pgstat_report_query_id(btshared->queryid, false);
1795 :
1796 : /* Open relations within worker */
1797 148 : heapRel = table_open(btshared->heaprelid, heapLockmode);
1798 148 : indexRel = index_open(btshared->indexrelid, indexLockmode);
1799 :
1800 : /* Initialize worker's own spool */
1801 148 : btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1802 148 : btspool->heap = heapRel;
1803 148 : btspool->index = indexRel;
1804 148 : btspool->isunique = btshared->isunique;
1805 148 : btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1806 :
1807 : /* Look up shared state private to tuplesort.c */
1808 148 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1809 148 : tuplesort_attach_shared(sharedsort, seg);
1810 148 : if (!btshared->isunique)
1811 : {
1812 84 : btspool2 = NULL;
1813 84 : sharedsort2 = NULL;
1814 : }
1815 : else
1816 : {
1817 : /* Allocate memory for worker's own private secondary spool */
1818 64 : btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1819 :
1820 : /* Initialize worker's own secondary spool */
1821 64 : btspool2->heap = btspool->heap;
1822 64 : btspool2->index = btspool->index;
1823 64 : btspool2->isunique = false;
1824 : /* Look up shared state private to tuplesort.c */
1825 64 : sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1826 64 : tuplesort_attach_shared(sharedsort2, seg);
1827 : }
1828 :
1829 : /* Prepare to track buffer usage during parallel execution */
1830 148 : InstrStartParallelQuery();
1831 :
1832 : /* Perform sorting of spool, and possibly a spool2 */
1833 148 : sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1834 148 : _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1835 : sharedsort2, sortmem, false);
1836 :
1837 : /* Report WAL/buffer usage during parallel execution */
1838 148 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1839 148 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1840 148 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1841 148 : &walusage[ParallelWorkerNumber]);
1842 :
1843 : #ifdef BTREE_BUILD_STATS
1844 : if (log_btree_build_stats)
1845 : {
1846 : ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1847 : ResetUsage();
1848 : }
1849 : #endif /* BTREE_BUILD_STATS */
1850 :
1851 148 : index_close(indexRel, indexLockmode);
1852 148 : table_close(heapRel, heapLockmode);
1853 148 : }
1854 :
1855 : /*
1856 : * Perform a worker's portion of a parallel sort.
1857 : *
1858 : * This generates a tuplesort for passed btspool, and a second tuplesort
1859 : * state if a second btspool is need (i.e. for unique index builds). All
1860 : * other spool fields should already be set when this is called.
1861 : *
1862 : * sortmem is the amount of working memory to use within each worker,
1863 : * expressed in KBs.
1864 : *
1865 : * When this returns, workers are done, and need only release resources.
1866 : */
1867 : static void
1868 296 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1869 : BTShared *btshared, Sharedsort *sharedsort,
1870 : Sharedsort *sharedsort2, int sortmem, bool progress)
1871 : {
1872 : SortCoordinate coordinate;
1873 : BTBuildState buildstate;
1874 : TableScanDesc scan;
1875 : double reltuples;
1876 : IndexInfo *indexInfo;
1877 :
1878 : /* Initialize local tuplesort coordination state */
1879 296 : coordinate = palloc0(sizeof(SortCoordinateData));
1880 296 : coordinate->isWorker = true;
1881 296 : coordinate->nParticipants = -1;
1882 296 : coordinate->sharedsort = sharedsort;
1883 :
1884 : /* Begin "partial" tuplesort */
1885 592 : btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1886 : btspool->index,
1887 296 : btspool->isunique,
1888 296 : btspool->nulls_not_distinct,
1889 : sortmem, coordinate,
1890 : TUPLESORT_NONE);
1891 :
1892 : /*
1893 : * Just as with serial case, there may be a second spool. If so, a
1894 : * second, dedicated spool2 partial tuplesort is required.
1895 : */
1896 296 : if (btspool2)
1897 : {
1898 : SortCoordinate coordinate2;
1899 :
1900 : /*
1901 : * We expect that the second one (for dead tuples) won't get very
1902 : * full, so we give it only work_mem (unless sortmem is less for
1903 : * worker). Worker processes are generally permitted to allocate
1904 : * work_mem independently.
1905 : */
1906 128 : coordinate2 = palloc0(sizeof(SortCoordinateData));
1907 128 : coordinate2->isWorker = true;
1908 128 : coordinate2->nParticipants = -1;
1909 128 : coordinate2->sharedsort = sharedsort2;
1910 128 : btspool2->sortstate =
1911 128 : tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1912 : Min(sortmem, work_mem), coordinate2,
1913 : false);
1914 : }
1915 :
1916 : /* Fill in buildstate for _bt_build_callback() */
1917 296 : buildstate.isunique = btshared->isunique;
1918 296 : buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1919 296 : buildstate.havedead = false;
1920 296 : buildstate.heap = btspool->heap;
1921 296 : buildstate.spool = btspool;
1922 296 : buildstate.spool2 = btspool2;
1923 296 : buildstate.indtuples = 0;
1924 296 : buildstate.btleader = NULL;
1925 :
1926 : /* Join parallel scan */
1927 296 : indexInfo = BuildIndexInfo(btspool->index);
1928 296 : indexInfo->ii_Concurrent = btshared->isconcurrent;
1929 296 : scan = table_beginscan_parallel(btspool->heap,
1930 : ParallelTableScanFromBTShared(btshared));
1931 296 : reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1932 : true, progress, _bt_build_callback,
1933 : &buildstate, scan);
1934 :
1935 : /* Execute this worker's part of the sort */
1936 296 : if (progress)
1937 148 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1938 : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
1939 296 : tuplesort_performsort(btspool->sortstate);
1940 296 : if (btspool2)
1941 : {
1942 128 : if (progress)
1943 64 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1944 : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
1945 128 : tuplesort_performsort(btspool2->sortstate);
1946 : }
1947 :
1948 : /*
1949 : * Done. Record ambuild statistics, and whether we encountered a broken
1950 : * HOT chain.
1951 : */
1952 296 : SpinLockAcquire(&btshared->mutex);
1953 296 : btshared->nparticipantsdone++;
1954 296 : btshared->reltuples += reltuples;
1955 296 : if (buildstate.havedead)
1956 0 : btshared->havedead = true;
1957 296 : btshared->indtuples += buildstate.indtuples;
1958 296 : if (indexInfo->ii_BrokenHotChain)
1959 0 : btshared->brokenhotchain = true;
1960 296 : SpinLockRelease(&btshared->mutex);
1961 :
1962 : /* Notify leader */
1963 296 : ConditionVariableSignal(&btshared->workersdonecv);
1964 :
1965 : /* We can end tuplesorts immediately */
1966 296 : tuplesort_end(btspool->sortstate);
1967 296 : if (btspool2)
1968 128 : tuplesort_end(btspool2->sortstate);
1969 296 : }
|