Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsort.c
4 : * Build a btree from sorted input by loading leaf pages sequentially.
5 : *
6 : * NOTES
7 : *
8 : * We use tuplesort.c to sort the given index tuples into order.
9 : * Then we scan the index tuples in order and build the btree pages
10 : * for each level. We load source tuples into leaf-level pages.
11 : * Whenever we fill a page at one level, we link it into its
12 : * parent level (starting a new parent level if necessary). When
13 : * done, we write out each final page on each level, adding it to
14 : * its parent level. When we have only one page on a level, it must be
15 : * the root -- it can be attached to the btree metapage and we are done.
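 *
 * For example (an illustrative sketch, not part of the original
 * comments): after loading four leaf pages, the parent level holds
 * their four downlinks on a single page; that page is the only one on
 * its level, so it becomes the root:
 *
 *                     [ level 1: root ]
 *                    /      |     |    \
 *             [leaf0]->[leaf1]->[leaf2]->[leaf3]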
16 : *
17 : * It is not wise to pack the pages entirely full, since then *any*
18 : * insertion would cause a split (and not only of the leaf page; the need
19 : * for a split would cascade right up the tree). The steady-state load
20 : * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 : * pages to the user-controllable fill factor (default 90%) while upper pages
22 : * are always packed to 70%. This gives us reasonable density (there aren't
23 : * many upper pages if the keys are reasonable-size) without risking a lot of
24 : * cascading splits during early insertions.
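 *
 * Worked example (assuming the default BLCKSZ of 8192): with the
 * default leaf fillfactor of 90, a leaf page counts as "full" once
 * less than 8192 * (100 - 90) / 100 = 819 bytes of free space remain,
 * while an upper page counts as "full" once less than
 * 8192 * (100 - 70) / 100 = 2457 bytes remain (see _bt_pagestate()).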
25 : *
26 : * We use the bulk smgr loading facility to bypass the buffer cache and
27 : * WAL-log the pages efficiently.
28 : *
29 : * This code isn't concerned about the FSM at all. The caller is responsible
30 : * for initializing that.
31 : *
32 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
33 : * Portions Copyright (c) 1994, Regents of the University of California
34 : *
35 : * IDENTIFICATION
36 : * src/backend/access/nbtree/nbtsort.c
37 : *
38 : *-------------------------------------------------------------------------
39 : */
40 :
41 : #include "postgres.h"
42 :
43 : #include "access/nbtree.h"
44 : #include "access/parallel.h"
45 : #include "access/relscan.h"
46 : #include "access/table.h"
47 : #include "access/xact.h"
48 : #include "access/xloginsert.h"
49 : #include "catalog/index.h"
50 : #include "commands/progress.h"
51 : #include "executor/instrument.h"
52 : #include "miscadmin.h"
53 : #include "pgstat.h"
54 : #include "storage/bulk_write.h"
55 : #include "tcop/tcopprot.h" /* pgrminclude ignore */
56 : #include "utils/rel.h"
57 : #include "utils/sortsupport.h"
58 : #include "utils/tuplesort.h"
59 :
60 :
61 : /* Magic numbers for parallel state sharing */
62 : #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
63 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
64 : #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
65 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
66 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
67 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
68 :
69 : /*
70 : * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
71 : * parallel index builds. This may be useful as a debugging aid.
72 : #undef DISABLE_LEADER_PARTICIPATION
73 : */
74 :
75 : /*
76 : * Status record for spooling/sorting phase. (Note we may have two of
77 : * these due to the special requirements for uniqueness-checking with
78 : * dead tuples.)
79 : */
80 : typedef struct BTSpool
81 : {
82 : Tuplesortstate *sortstate; /* state data for tuplesort.c */
83 : Relation heap;
84 : Relation index;
85 : bool isunique;
86 : bool nulls_not_distinct;
87 : } BTSpool;
88 :
89 : /*
90 : * Status for index builds performed in parallel. This is allocated in a
91 : * dynamic shared memory segment. Note that there is a separate tuplesort TOC
92 : * entry, private to tuplesort.c but allocated by this module on its behalf.
93 : */
94 : typedef struct BTShared
95 : {
96 : /*
97 : * These fields are not modified during the sort. They primarily exist
98 : * for the benefit of worker processes that need to create BTSpool state
99 : * corresponding to that used by the leader.
100 : */
101 : Oid heaprelid;
102 : Oid indexrelid;
103 : bool isunique;
104 : bool nulls_not_distinct;
105 : bool isconcurrent;
106 : int scantuplesortstates;
107 :
108 : /*
109 : * workersdonecv is used to monitor the progress of workers. All parallel
110 : * participants must indicate that they are done before leader can use
111 : * mutable state that workers maintain during scan (and before leader can
112 : * proceed to tuplesort_performsort()).
113 : */
114 : ConditionVariable workersdonecv;
115 :
116 : /*
117 : * mutex protects all fields before heapdesc.
118 : *
119 : * These fields contain status information of interest to B-Tree index
120 : * builds that must work just the same when an index is built in parallel.
121 : */
122 : slock_t mutex;
123 :
124 : /*
125 : * Mutable state that is maintained by workers, and reported back to
126 : * leader at end of parallel scan.
127 : *
128 : * nparticipantsdone is number of worker processes finished.
129 : *
130 : * reltuples is the total number of input heap tuples.
131 : *
132 : * havedead indicates if RECENTLY_DEAD tuples were encountered during
133 : * build.
134 : *
135 : * indtuples is the total number of tuples that made it into the index.
136 : *
137 : * brokenhotchain indicates if any worker detected a broken HOT chain
138 : * during build.
139 : */
140 : int nparticipantsdone;
141 : double reltuples;
142 : bool havedead;
143 : double indtuples;
144 : bool brokenhotchain;
145 :
146 : /*
147 : * ParallelTableScanDescData data follows. Can't directly embed here, as
148 : * implementations of the parallel table scan desc interface might need
149 : * stronger alignment.
150 : */
151 : } BTShared;
152 :
153 : /*
154 : * Return pointer to a BTShared's parallel table scan.
155 : *
156 : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
157 : * MAXALIGN.
158 : */
159 : #define ParallelTableScanFromBTShared(shared) \
160 : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
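
/*
 * Minimal usage sketch (illustrative; the worker-side code that does
 * this lies outside this section): the parallel scan descriptor is
 * laid out immediately after the BTShared struct in the same DSM
 * chunk, so a participant starts its share of the heap scan with
 * something like
 *
 *     scan = table_beginscan_parallel(btspool->heap,
 *                                     ParallelTableScanFromBTShared(btshared));
 */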
161 :
162 : /*
163 : * Status for leader in parallel index build.
164 : */
165 : typedef struct BTLeader
166 : {
167 : /* parallel context itself */
168 : ParallelContext *pcxt;
169 :
170 : /*
171 : * nparticipanttuplesorts is the exact number of worker processes
172 : * successfully launched, plus one leader process if it participates as a
173 : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
174 : * participating as a worker).
175 : */
176 : int nparticipanttuplesorts;
177 :
178 : /*
179 : * Leader process convenience pointers to shared state (leader avoids TOC
180 : * lookups).
181 : *
182 : * btshared is the shared state for entire build. sharedsort is the
183 : * shared, tuplesort-managed state passed to each process tuplesort.
184 : * sharedsort2 is the corresponding btspool2 shared state, used only when
185 : * building unique indexes. snapshot is the snapshot used by the scan iff
186 : * an MVCC snapshot is required.
187 : */
188 : BTShared *btshared;
189 : Sharedsort *sharedsort;
190 : Sharedsort *sharedsort2;
191 : Snapshot snapshot;
192 : WalUsage *walusage;
193 : BufferUsage *bufferusage;
194 : } BTLeader;
195 :
196 : /*
197 : * Working state for btbuild and its callback.
198 : *
199 : * When parallel CREATE INDEX is used, only the leader process has a
200 : * BTBuildState; workers have their own spool and spool2 (see below).
201 : */
202 : typedef struct BTBuildState
203 : {
204 : bool isunique;
205 : bool nulls_not_distinct;
206 : bool havedead;
207 : Relation heap;
208 : BTSpool *spool;
209 :
210 : /*
211 : * spool2 is needed only when the index is a unique index. Dead tuples are
212 : * put into spool2 instead of spool in order to avoid the uniqueness check.
213 : */
214 : BTSpool *spool2;
215 : double indtuples;
216 :
217 : /*
218 : * btleader is only present when a parallel index build is performed, and
219 : * only in the leader process. (As noted above, only the leader has a
220 : * BTBuildState. Workers have their own spool and spool2, though.)
221 : */
222 : BTLeader *btleader;
223 : } BTBuildState;
224 :
225 : /*
226 : * Status record for a btree page being built. We have one of these
227 : * for each active tree level.
228 : */
229 : typedef struct BTPageState
230 : {
231 : BulkWriteBuffer btps_buf; /* workspace for page building */
232 : BlockNumber btps_blkno; /* block # to write this page at */
233 : IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
234 : OffsetNumber btps_lastoff; /* last item offset loaded */
235 : Size btps_lastextra; /* last item's extra posting list space */
236 : uint32 btps_level; /* tree level (0 = leaf) */
237 : Size btps_full; /* "full" if less than this much free space */
238 : struct BTPageState *btps_next; /* link to parent level, if any */
239 : } BTPageState;
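
/*
 * Illustrative sketch: the per-level states form a chain from the leaf
 * level upward via btps_next, e.g.
 *
 *     level 0 (leaf) -> level 1 -> level 2 -> NULL
 *
 * with new levels created lazily by _bt_buildadd() via _bt_pagestate().
 */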
240 :
241 : /*
242 : * Overall status record for index writing phase.
243 : */
244 : typedef struct BTWriteState
245 : {
246 : Relation heap;
247 : Relation index;
248 : BulkWriteState *bulkstate;
249 : BTScanInsert inskey; /* generic insertion scankey */
250 : BlockNumber btws_pages_alloced; /* # pages allocated */
251 : } BTWriteState;
252 :
253 :
254 : static double _bt_spools_heapscan(Relation heap, Relation index,
255 : BTBuildState *buildstate, IndexInfo *indexInfo);
256 : static void _bt_spooldestroy(BTSpool *btspool);
257 : static void _bt_spool(BTSpool *btspool, ItemPointer self,
258 : Datum *values, bool *isnull);
259 : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
260 : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
261 : bool *isnull, bool tupleIsAlive, void *state);
262 : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
263 : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
264 : static void _bt_slideleft(Page rightmostpage);
265 : static void _bt_sortaddtup(Page page, Size itemsize,
266 : IndexTuple itup, OffsetNumber itup_off,
267 : bool newfirstdataitem);
268 : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
269 : IndexTuple itup, Size truncextra);
270 : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
271 : BTPageState *state,
272 : BTDedupState dstate);
273 : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
274 : static void _bt_load(BTWriteState *wstate,
275 : BTSpool *btspool, BTSpool *btspool2);
276 : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
277 : int request);
278 : static void _bt_end_parallel(BTLeader *btleader);
279 : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
280 : static double _bt_parallel_heapscan(BTBuildState *buildstate,
281 : bool *brokenhotchain);
282 : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
283 : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
284 : BTShared *btshared, Sharedsort *sharedsort,
285 : Sharedsort *sharedsort2, int sortmem,
286 : bool progress);
287 :
288 :
289 : /*
290 : * btbuild() -- build a new btree index.
291 : */
292 : IndexBuildResult *
293 45682 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
294 : {
295 : IndexBuildResult *result;
296 : BTBuildState buildstate;
297 : double reltuples;
298 :
299 : #ifdef BTREE_BUILD_STATS
300 : if (log_btree_build_stats)
301 : ResetUsage();
302 : #endif /* BTREE_BUILD_STATS */
303 :
304 45682 : buildstate.isunique = indexInfo->ii_Unique;
305 45682 : buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
306 45682 : buildstate.havedead = false;
307 45682 : buildstate.heap = heap;
308 45682 : buildstate.spool = NULL;
309 45682 : buildstate.spool2 = NULL;
310 45682 : buildstate.indtuples = 0;
311 45682 : buildstate.btleader = NULL;
312 :
313 : /*
314 : * We expect to be called exactly once for any index relation. If that's
315 : * not the case, big trouble's what we have.
316 : */
317 45682 : if (RelationGetNumberOfBlocks(index) != 0)
318 0 : elog(ERROR, "index \"%s\" already contains data",
319 : RelationGetRelationName(index));
320 :
321 45682 : reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
322 :
323 : /*
324 : * Finish the build by (1) completing the sort of the spool file, (2)
325 : * inserting the sorted tuples into btree pages and (3) building the upper
326 : * levels. Finally, it may also be necessary to end use of parallelism.
327 : */
328 45670 : _bt_leafbuild(buildstate.spool, buildstate.spool2);
329 45586 : _bt_spooldestroy(buildstate.spool);
330 45586 : if (buildstate.spool2)
331 18 : _bt_spooldestroy(buildstate.spool2);
332 45586 : if (buildstate.btleader)
333 142 : _bt_end_parallel(buildstate.btleader);
334 :
335 45586 : result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
336 :
337 45586 : result->heap_tuples = reltuples;
338 45586 : result->index_tuples = buildstate.indtuples;
339 :
340 : #ifdef BTREE_BUILD_STATS
341 : if (log_btree_build_stats)
342 : {
343 : ShowUsage("BTREE BUILD STATS");
344 : ResetUsage();
345 : }
346 : #endif /* BTREE_BUILD_STATS */
347 :
348 45586 : return result;
349 : }
350 :
351 : /*
352 : * Create and initialize one or two spool structures, and save them in caller's
353 : * buildstate argument. May also fill in fields within indexInfo used by index
354 : * builds.
355 : *
356 : * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
357 : * routine encapsulates all aspects of managing parallelism. Caller need only
358 : * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
359 : *
360 : * Returns the total number of heap tuples scanned.
361 : */
362 : static double
363 45682 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
364 : IndexInfo *indexInfo)
365 : {
366 45682 : BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
367 45682 : SortCoordinate coordinate = NULL;
368 45682 : double reltuples = 0;
369 :
370 : /*
371 : * We size the sort area as maintenance_work_mem rather than work_mem to
372 : * speed index creation. This should be OK since a single backend can't
373 : * run multiple index creations in parallel (see also: notes on
374 : * parallelism and maintenance_work_mem below).
375 : */
376 45682 : btspool->heap = heap;
377 45682 : btspool->index = index;
378 45682 : btspool->isunique = indexInfo->ii_Unique;
379 45682 : btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
380 :
381 : /* Save as primary spool */
382 45682 : buildstate->spool = btspool;
383 :
384 : /* Report table scan phase started */
385 45682 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
386 : PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
387 :
388 : /* Attempt to launch parallel worker scan when required */
389 45682 : if (indexInfo->ii_ParallelWorkers > 0)
390 142 : _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
391 : indexInfo->ii_ParallelWorkers);
392 :
393 : /*
394 : * If parallel build requested and at least one worker process was
395 : * successfully launched, set up coordination state
396 : */
397 45682 : if (buildstate->btleader)
398 : {
399 142 : coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
400 142 : coordinate->isWorker = false;
401 142 : coordinate->nParticipants =
402 142 : buildstate->btleader->nparticipanttuplesorts;
403 142 : coordinate->sharedsort = buildstate->btleader->sharedsort;
404 : }
405 :
406 : /*
407 : * Begin serial/leader tuplesort.
408 : *
409 : * In cases where parallelism is involved, the leader receives the same
410 : * share of maintenance_work_mem as a serial sort (it is generally treated
411 : * in the same way as a serial sort once we return). Parallel worker
412 : * Tuplesortstates will have received only a fraction of
413 : * maintenance_work_mem, though.
414 : *
415 : * We rely on the lifetime of the Leader Tuplesortstate almost not
416 : * overlapping with any worker Tuplesortstate's lifetime. There may be
417 : * some small overlap, but that's okay because we rely on leader
418 : * Tuplesortstate only allocating a small, fixed amount of memory here.
419 : * When its tuplesort_performsort() is called (by our caller), and
420 : * significant amounts of memory are likely to be used, all workers must
421 : * have already freed almost all memory held by their Tuplesortstates
422 : * (they are about to go away completely, too). The overall effect is
423 : * that maintenance_work_mem always represents an absolute high watermark
424 : * on the amount of memory used by a CREATE INDEX operation, regardless of
425 : * the use of parallelism or any other factor.
426 : */
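/*
 * Worked example (illustrative settings, not taken from the code):
 * with maintenance_work_mem = 64MB, two launched workers, and a
 * participating leader, each of the three participant scan/sort
 * states sorts within roughly 64MB / 3, while the leader tuplesort
 * begun here is sized at the full 64MB.
 */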
427 91364 : buildstate->spool->sortstate =
428 45682 : tuplesort_begin_index_btree(heap, index, buildstate->isunique,
429 45682 : buildstate->nulls_not_distinct,
430 : maintenance_work_mem, coordinate,
431 : TUPLESORT_NONE);
432 :
433 : /*
434 : * If building a unique index, put dead tuples in a second spool to keep
435 : * them out of the uniqueness check. We expect that the second spool (for
436 : * dead tuples) won't get very full, so we give it only work_mem.
437 : */
438 45682 : if (indexInfo->ii_Unique)
439 : {
440 37066 : BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
441 37066 : SortCoordinate coordinate2 = NULL;
442 :
443 : /* Initialize secondary spool */
444 37066 : btspool2->heap = heap;
445 37066 : btspool2->index = index;
446 37066 : btspool2->isunique = false;
447 : /* Save as secondary spool */
448 37066 : buildstate->spool2 = btspool2;
449 :
450 37066 : if (buildstate->btleader)
451 : {
452 : /*
453 : * Set up non-private state that is passed to
454 : * tuplesort_begin_index_btree() about the basic high level
455 : * coordination of a parallel sort.
456 : */
457 64 : coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
458 64 : coordinate2->isWorker = false;
459 64 : coordinate2->nParticipants =
460 64 : buildstate->btleader->nparticipanttuplesorts;
461 64 : coordinate2->sharedsort = buildstate->btleader->sharedsort2;
462 : }
463 :
464 : /*
465 : * We expect that the second one (for dead tuples) won't get very
466 : * full, so we give it only work_mem
467 : */
468 37066 : buildstate->spool2->sortstate =
469 37066 : tuplesort_begin_index_btree(heap, index, false, false, work_mem,
470 : coordinate2, TUPLESORT_NONE);
471 : }
472 :
473 : /* Fill spool using either serial or parallel heap scan */
474 45682 : if (!buildstate->btleader)
475 45540 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
476 : _bt_build_callback, (void *) buildstate,
477 : NULL);
478 : else
479 142 : reltuples = _bt_parallel_heapscan(buildstate,
480 : &indexInfo->ii_BrokenHotChain);
481 :
482 : /*
483 : * Set the progress target for the next phase. Reset the block number
484 : * values set by table_index_build_scan.
485 : */
486 : {
487 45670 : const int progress_index[] = {
488 : PROGRESS_CREATEIDX_TUPLES_TOTAL,
489 : PROGRESS_SCAN_BLOCKS_TOTAL,
490 : PROGRESS_SCAN_BLOCKS_DONE
491 : };
492 45670 : const int64 progress_vals[] = {
493 45670 : buildstate->indtuples,
494 : 0, 0
495 : };
496 :
497 45670 : pgstat_progress_update_multi_param(3, progress_index, progress_vals);
498 : }
499 :
500 : /* okay, all heap tuples are spooled */
501 45670 : if (buildstate->spool2 && !buildstate->havedead)
502 : {
503 : /* spool2 turns out to be unnecessary */
504 37048 : _bt_spooldestroy(buildstate->spool2);
505 37048 : buildstate->spool2 = NULL;
506 : }
507 :
508 45670 : return reltuples;
509 : }
510 :
511 : /*
512 : * clean up a spool structure and its substructures.
513 : */
514 : static void
515 82652 : _bt_spooldestroy(BTSpool *btspool)
516 : {
517 82652 : tuplesort_end(btspool->sortstate);
518 82652 : pfree(btspool);
519 82652 : }
520 :
521 : /*
522 : * spool an index entry into the sort file.
523 : */
524 : static void
525 11947808 : _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
526 : {
527 11947808 : tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
528 : self, values, isnull);
529 11947808 : }
530 :
531 : /*
532 : * given a spool loaded by successive calls to _bt_spool,
533 : * create an entire btree.
534 : */
535 : static void
536 45670 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
537 : {
538 : BTWriteState wstate;
539 :
540 : #ifdef BTREE_BUILD_STATS
541 : if (log_btree_build_stats)
542 : {
543 : ShowUsage("BTREE BUILD (Spool) STATISTICS");
544 : ResetUsage();
545 : }
546 : #endif /* BTREE_BUILD_STATS */
547 :
548 : /* Execute the sort */
549 45670 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
550 : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
551 45670 : tuplesort_performsort(btspool->sortstate);
552 45586 : if (btspool2)
553 : {
554 18 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
555 : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
556 18 : tuplesort_performsort(btspool2->sortstate);
557 : }
558 :
559 45586 : wstate.heap = btspool->heap;
560 45586 : wstate.index = btspool->index;
561 45586 : wstate.inskey = _bt_mkscankey(wstate.index, NULL);
562 : /* _bt_mkscankey() won't set allequalimage without metapage */
563 45586 : wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
564 :
565 : /* reserve the metapage */
566 45586 : wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
567 :
568 45586 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
569 : PROGRESS_BTREE_PHASE_LEAF_LOAD);
570 45586 : _bt_load(&wstate, btspool, btspool2);
571 45586 : }
572 :
573 : /*
574 : * Per-tuple callback for table_index_build_scan
575 : */
576 : static void
577 11947808 : _bt_build_callback(Relation index,
578 : ItemPointer tid,
579 : Datum *values,
580 : bool *isnull,
581 : bool tupleIsAlive,
582 : void *state)
583 : {
584 11947808 : BTBuildState *buildstate = (BTBuildState *) state;
585 :
586 : /*
587 : * insert the index tuple into the appropriate spool file for subsequent
588 : * processing
589 : */
590 11947808 : if (tupleIsAlive || buildstate->spool2 == NULL)
591 11947352 : _bt_spool(buildstate->spool, tid, values, isnull);
592 : else
593 : {
594 : /* dead tuples are put into spool2 */
595 456 : buildstate->havedead = true;
596 456 : _bt_spool(buildstate->spool2, tid, values, isnull);
597 : }
598 :
599 11947808 : buildstate->indtuples += 1;
600 11947808 : }
601 :
602 : /*
603 : * allocate workspace for a new, clean btree page, not linked to any siblings.
604 : */
605 : static BulkWriteBuffer
606 48712 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
607 : {
608 : BulkWriteBuffer buf;
609 : Page page;
610 : BTPageOpaque opaque;
611 :
612 48712 : buf = smgr_bulk_get_buf(wstate->bulkstate);
613 48712 : page = (Page) buf;
614 :
615 : /* Zero the page and set up standard page header info */
616 48712 : _bt_pageinit(page, BLCKSZ);
617 :
618 : /* Initialize BT opaque state */
619 48712 : opaque = BTPageGetOpaque(page);
620 48712 : opaque->btpo_prev = opaque->btpo_next = P_NONE;
621 48712 : opaque->btpo_level = level;
622 48712 : opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
623 48712 : opaque->btpo_cycleid = 0;
624 :
625 : /* Make the P_HIKEY line pointer appear allocated */
626 48712 : ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
627 :
628 48712 : return buf;
629 : }
630 :
631 : /*
632 : * emit a completed btree page, and release the working storage.
633 : */
634 : static void
635 94298 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
636 : {
637 94298 : smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
638 : /* smgr_bulk_write took ownership of 'buf' */
639 94298 : }
640 :
641 : /*
642 : * allocate and initialize a new BTPageState. the returned structure
643 : * is suitable for immediate use by _bt_buildadd.
644 : */
645 : static BTPageState *
646 11222 : _bt_pagestate(BTWriteState *wstate, uint32 level)
647 : {
648 11222 : BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
649 :
650 : /* create initial page for level */
651 11222 : state->btps_buf = _bt_blnewpage(wstate, level);
652 :
653 : /* and assign it a page position */
654 11222 : state->btps_blkno = wstate->btws_pages_alloced++;
655 :
656 11222 : state->btps_lowkey = NULL;
657 : /* initialize lastoff so first item goes into P_FIRSTKEY */
658 11222 : state->btps_lastoff = P_HIKEY;
659 11222 : state->btps_lastextra = 0;
660 11222 : state->btps_level = level;
661 : /* set "full" threshold based on level. See notes at head of file. */
662 11222 : if (level > 0)
663 2266 : state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
664 : else
665 8956 : state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
666 :
667 : /* no parent level, yet */
668 11222 : state->btps_next = NULL;
669 :
670 11222 : return state;
671 : }
672 :
673 : /*
674 : * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
675 : * P_HIKEY, overwriting P_HIKEY).
676 : *
677 : * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
678 : * rightmost page on its level is not supposed to get a high key. Now that
679 : * it's clear that this page is a rightmost page, remove the unneeded empty
680 : * P_HIKEY line pointer space.
681 : */
682 : static void
683 11222 : _bt_slideleft(Page rightmostpage)
684 : {
685 : OffsetNumber off;
686 : OffsetNumber maxoff;
687 : ItemId previi;
688 :
689 11222 : maxoff = PageGetMaxOffsetNumber(rightmostpage);
690 : Assert(maxoff >= P_FIRSTKEY);
691 11222 : previi = PageGetItemId(rightmostpage, P_HIKEY);
692 650326 : for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
693 : {
694 639104 : ItemId thisii = PageGetItemId(rightmostpage, off);
695 :
696 639104 : *previi = *thisii;
697 639104 : previi = thisii;
698 : }
699 11222 : ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
700 11222 : }
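
/*
 * Before/after sketch (illustrative): with N data items on the
 * rightmost page,
 *
 *     before: linp0 (unused P_HIKEY slot), linp1..linpN -> item1..itemN
 *     after:  linp0..linp(N-1) -> item1..itemN, with the trailing slot
 *             released by shrinking pd_lower
 */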
701 :
702 : /*
703 : * Add an item to a page being built.
704 : *
705 : * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
706 : * raises an error directly.
707 : *
708 : * Note that our nbtsort.c caller does not know yet if the page will be
709 : * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
710 : * caller. A page that turns out to be the rightmost on its level is fixed by
711 : * calling _bt_slideleft().
712 : */
713 : static void
714 10721530 : _bt_sortaddtup(Page page,
715 : Size itemsize,
716 : IndexTuple itup,
717 : OffsetNumber itup_off,
718 : bool newfirstdataitem)
719 : {
720 : IndexTupleData trunctuple;
721 :
722 10721530 : if (newfirstdataitem)
723 : {
724 2390 : trunctuple = *itup;
725 2390 : trunctuple.t_info = sizeof(IndexTupleData);
726 2390 : BTreeTupleSetNAtts(&trunctuple, 0, false);
727 2390 : itup = &trunctuple;
728 2390 : itemsize = sizeof(IndexTupleData);
729 : }
730 :
731 10721530 : if (PageAddItem(page, (Item) itup, itemsize, itup_off,
732 : false, false) == InvalidOffsetNumber)
733 0 : elog(ERROR, "failed to add item to the index page");
734 10721530 : }
735 :
736 : /*----------
737 : * Add an item to a disk page from the sort output (or add a posting list
738 : * item formed from the sort output).
739 : *
740 : * We must be careful to observe the page layout conventions of nbtsearch.c:
741 : * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
742 : * - on non-leaf pages, the key portion of the first item need not be
743 : * stored, we should store only the link.
744 : *
745 : * A leaf page being built looks like:
746 : *
747 : * +----------------+---------------------------------+
748 : * | PageHeaderData | linp0 linp1 linp2 ...           |
749 : * +-----------+----+---------------------------------+
750 : * | ... linpN |                                      |
751 : * +-----------+--------------------------------------+
752 : * |     ^ last                                       |
753 : * |                                                  |
754 : * +-------------+------------------------------------+
755 : * |             | itemN ...                          |
756 : * +-------------+------------------+-----------------+
757 : * |          ... item3 item2 item1 | "special space" |
758 : * +--------------------------------+-----------------+
759 : *
760 : * Contrast this with the diagram in bufpage.h; note the mismatch
761 : * between linps and items. This is because we reserve linp0 as a
762 : * placeholder for the pointer to the "high key" item; when we have
763 : * filled up the page, we will set linp0 to point to itemN and clear
764 : * linpN. On the other hand, if we find this is the last (rightmost)
765 : * page, we leave the items alone and slide the linp array over. If
766 : * the high key is to be truncated, offset 1 is deleted, and we insert
767 : * the truncated high key at offset 1.
768 : *
769 : * 'last' pointer indicates the last offset added to the page.
770 : *
771 : * 'truncextra' is the size of the posting list in itup, if any. This
772 : * information is stashed for the next call here, when we may benefit
773 : * from considering the impact of truncating away the posting list on
774 : * the page before deciding to finish the page off. Posting lists are
775 : * often relatively large, so it is worth going to the trouble of
776 : * accounting for the saving from truncating away the posting list of
777 : * the tuple that becomes the high key (that may be the only way to
778 : * get close to target free space on the page). Note that this is
779 : * only used for the soft fillfactor-wise limit, not the critical hard
780 : * limit.
781 : *----------
782 : */
783 : static void
784 10684040 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
785 : Size truncextra)
786 : {
787 : BulkWriteBuffer nbuf;
788 : Page npage;
789 : BlockNumber nblkno;
790 : OffsetNumber last_off;
791 : Size last_truncextra;
792 : Size pgspc;
793 : Size itupsz;
794 : bool isleaf;
795 :
796 : /*
797 : * This is a handy place to check for cancel interrupts during the btree
798 : * load phase of index creation.
799 : */
800 10684040 : CHECK_FOR_INTERRUPTS();
801 :
802 10684040 : nbuf = state->btps_buf;
803 10684040 : npage = (Page) nbuf;
804 10684040 : nblkno = state->btps_blkno;
805 10684040 : last_off = state->btps_lastoff;
806 10684040 : last_truncextra = state->btps_lastextra;
807 10684040 : state->btps_lastextra = truncextra;
808 :
809 10684040 : pgspc = PageGetFreeSpace(npage);
810 10684040 : itupsz = IndexTupleSize(itup);
811 10684040 : itupsz = MAXALIGN(itupsz);
812 : /* Leaf case has slightly different rules due to suffix truncation */
813 10684040 : isleaf = (state->btps_level == 0);
814 :
815 : /*
816 : * Check whether the new item can fit on a btree page on current level at
817 : * all.
818 : *
819 : * Every newly built index will treat heap TID as part of the keyspace,
820 : * which imposes the requirement that new high keys must occasionally have
821 : * a heap TID appended within _bt_truncate(). That may leave a new pivot
822 : * tuple one or two MAXALIGN() quantums larger than the original
823 : * firstright tuple it's derived from. v4 deals with the problem by
824 : * decreasing the limit on the size of tuples inserted on the leaf level
825 : * by the same small amount. Enforce the new v4+ limit on the leaf level,
826 : * and the old limit on internal levels, since pivot tuples may need to
827 : * make use of the reserved space. This should never fail on internal
828 : * pages.
829 : */
830 10684040 : if (unlikely(itupsz > BTMaxItemSize(npage)))
831 264 : _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
832 : itup);
833 :
834 : /*
835 : * Check to see if current page will fit new item, with space left over to
836 : * append a heap TID during suffix truncation when page is a leaf page.
837 : *
838 : * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
839 : * high key with heap TID when finishing off a leaf page, since we rely on
840 : * _bt_check_third_page() rejecting oversized non-pivot tuples. On
841 : * internal pages we can always fit 3 pivot tuples with larger internal
842 : * page tuple limit (includes page high key).
843 : *
844 : * Most of the time, a page is only "full" in the sense that the soft
845 : * fillfactor-wise limit has been exceeded. However, we must always leave
846 : * at least two items plus a high key on each page before starting a new
847 : * page. Disregard fillfactor and insert on "full" current page if we
848 : * don't have the minimum number of items yet. (Note that we deliberately
849 : * assume that suffix truncation neither enlarges nor shrinks new high key
850 : * when applying soft limit, except when last tuple has a posting list.)
851 : */
852 : Assert(last_truncextra == 0 || isleaf);
853 10684040 : if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
854 10682892 : (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
855 : {
856 : /*
857 : * Finish off the page and write it out.
858 : */
859 37490 : BulkWriteBuffer obuf = nbuf;
860 37490 : Page opage = npage;
861 37490 : BlockNumber oblkno = nblkno;
862 : ItemId ii;
863 : ItemId hii;
864 : IndexTuple oitup;
865 :
866 : /* Create new page of same level */
867 37490 : nbuf = _bt_blnewpage(wstate, state->btps_level);
868 37490 : npage = (Page) nbuf;
869 :
870 : /* and assign it a page position */
871 37490 : nblkno = wstate->btws_pages_alloced++;
872 :
873 : /*
874 : * We copy the last item on the page into the new page, and then
875 : * rearrange the old page so that the 'last item' becomes its high key
876 : * rather than a true data item. There had better be at least two
877 : * items on the page already, else the page would be empty of useful
878 : * data.
879 : */
880 : Assert(last_off > P_FIRSTKEY);
881 37490 : ii = PageGetItemId(opage, last_off);
882 37490 : oitup = (IndexTuple) PageGetItem(opage, ii);
883 37490 : _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
884 37490 : !isleaf);
885 :
886 : /*
887 : * Move 'last' into the high key position on opage. _bt_blnewpage()
888 : * allocated empty space for a line pointer when opage was first
889 : * created, so this is a matter of rearranging already-allocated space
890 : * on page, and initializing high key line pointer. (Actually, leaf
891 : * pages must also swap oitup with a truncated version of oitup, which
892 : * is sometimes larger than oitup, though never by more than the space
893 : * needed to append a heap TID.)
894 : */
895 37490 : hii = PageGetItemId(opage, P_HIKEY);
896 37490 : *hii = *ii;
897 37490 : ItemIdSetUnused(ii); /* redundant */
898 37490 : ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
899 :
900 37490 : if (isleaf)
901 : {
902 : IndexTuple lastleft;
903 : IndexTuple truncated;
904 :
905 : /*
906 : * Truncate away any unneeded attributes from high key on leaf
907 : * level. This is only done at the leaf level because downlinks
908 : * in internal pages are either negative infinity items, or get
909 : * their contents from copying from one level down. See also:
910 : * _bt_split().
911 : *
912 : * We don't try to bias our choice of split point to make it more
913 : * likely that _bt_truncate() can truncate away more attributes,
914 : * whereas the split point used within _bt_split() is chosen much
915 : * more delicately. Even still, the lastleft and firstright
916 : * tuples passed to _bt_truncate() here are at least not fully
917 : * equal to each other when deduplication is used, unless there is
918 : * a large group of duplicates (also, unique index builds usually
919 : * have few or no spool2 duplicates). When the split point is
920 : * between two unequal tuples, _bt_truncate() will avoid including
921 : * a heap TID in the new high key, which is the most important
922 : * benefit of suffix truncation.
923 : *
924 : * Overwrite the old item with new truncated high key directly.
925 : * oitup is already located at the physical beginning of tuple
926 : * space, so this should directly reuse the existing tuple space.
927 : */
928 37366 : ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
929 37366 : lastleft = (IndexTuple) PageGetItem(opage, ii);
930 :
931 : Assert(IndexTupleSize(oitup) > last_truncextra);
932 37366 : truncated = _bt_truncate(wstate->index, lastleft, oitup,
933 : wstate->inskey);
934 37366 : if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
935 37366 : IndexTupleSize(truncated)))
936 0 : elog(ERROR, "failed to add high key to the index page");
937 37366 : pfree(truncated);
938 :
939 : /* oitup should continue to point to the page's high key */
940 37366 : hii = PageGetItemId(opage, P_HIKEY);
941 37366 : oitup = (IndexTuple) PageGetItem(opage, hii);
942 : }
943 :
944 : /*
945 : * Link the old page into its parent, using its low key. If we don't
946 : * have a parent, we have to create one; this adds a new btree level.
947 : */
948 37490 : if (state->btps_next == NULL)
949 2266 : state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
950 :
951 : Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
952 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
953 : BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
954 : P_LEFTMOST(BTPageGetOpaque(opage)));
955 : Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
956 : !P_LEFTMOST(BTPageGetOpaque(opage)));
957 37490 : BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
958 37490 : _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
959 37490 : pfree(state->btps_lowkey);
960 :
961 : /*
962 : * Save a copy of the high key from the old page. It is also the low
963 : * key for the new page.
964 : */
965 37490 : state->btps_lowkey = CopyIndexTuple(oitup);
966 :
967 : /*
968 : * Set the sibling links for both pages.
969 : */
970 : {
971 37490 : BTPageOpaque oopaque = BTPageGetOpaque(opage);
972 37490 : BTPageOpaque nopaque = BTPageGetOpaque(npage);
973 :
974 37490 : oopaque->btpo_next = nblkno;
975 37490 : nopaque->btpo_prev = oblkno;
976 37490 : nopaque->btpo_next = P_NONE; /* redundant */
977 : }
978 :
979 : /*
980 : * Write out the old page. _bt_blwritepage takes ownership of the
981 : * 'opage' buffer.
982 : */
983 37490 : _bt_blwritepage(wstate, obuf, oblkno);
984 :
985 : /*
986 : * Reset last_off to point to new page
987 : */
988 37490 : last_off = P_FIRSTKEY;
989 : }
990 :
991 : /*
992 : * By here, either original page is still the current page, or a new page
993 : * was created that became the current page. Either way, the current page
994 : * definitely has space for the new item.
995 : *
996 : * If the new item is the first for its page, it must also be the first
997 : * item on its entire level. On later same-level pages, a low key for a
998 : * page will be copied from the prior page in the code above. Generate a
999 : * minus infinity low key here instead.
1000 : */
1001 10684040 : if (last_off == P_HIKEY)
1002 : {
1003 : Assert(state->btps_lowkey == NULL);
1004 11222 : state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1005 11222 : state->btps_lowkey->t_info = sizeof(IndexTupleData);
1006 11222 : BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1007 : }
1008 :
1009 : /*
1010 : * Add the new item into the current page.
1011 : */
1012 10684040 : last_off = OffsetNumberNext(last_off);
1013 10684040 : _bt_sortaddtup(npage, itupsz, itup, last_off,
1014 10684040 : !isleaf && last_off == P_FIRSTKEY);
1015 :
1016 10684040 : state->btps_buf = nbuf;
1017 10684040 : state->btps_blkno = nblkno;
1018 10684040 : state->btps_lastoff = last_off;
1019 10684040 : }
1020 :
1021 : /*
1022 : * Finalize pending posting list tuple, and add it to the index. Final tuple
1023 : * is based on saved base tuple, and saved list of heap TIDs.
1024 : *
1025 : * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1026 : * using _bt_buildadd().
1027 : */
1028 : static void
1029 4676382 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
1030 : BTDedupState dstate)
1031 : {
1032 : Assert(dstate->nitems > 0);
1033 :
1034 4676382 : if (dstate->nitems == 1)
1035 4638660 : _bt_buildadd(wstate, state, dstate->base, 0);
1036 : else
1037 : {
1038 : IndexTuple postingtuple;
1039 : Size truncextra;
1040 :
1041 : /* form a tuple with a posting list */
1042 37722 : postingtuple = _bt_form_posting(dstate->base,
1043 : dstate->htids,
1044 : dstate->nhtids);
1045 : /* Calculate posting list overhead */
1046 75444 : truncextra = IndexTupleSize(postingtuple) -
1047 37722 : BTreeTupleGetPostingOffset(postingtuple);
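/*
 * truncextra is thus the size of just the posting list portion of the
 * tuple: the space that could be given back if this tuple is later
 * truncated into a high key.
 */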
1048 :
1049 37722 : _bt_buildadd(wstate, state, postingtuple, truncextra);
1050 37722 : pfree(postingtuple);
1051 : }
1052 :
1053 4676382 : dstate->nmaxitems = 0;
1054 4676382 : dstate->nhtids = 0;
1055 4676382 : dstate->nitems = 0;
1056 4676382 : dstate->phystupsize = 0;
1057 4676382 : }
1058 :
1059 : /*
1060 : * Finish writing out the completed btree.
1061 : */
1062 : static void
1063 45586 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1064 : {
1065 : BTPageState *s;
1066 45586 : BlockNumber rootblkno = P_NONE;
1067 45586 : uint32 rootlevel = 0;
1068 : BulkWriteBuffer metabuf;
1069 :
1070 : /*
1071 : * Each iteration of this loop completes one more level of the tree.
1072 : */
1073 56808 : for (s = state; s != NULL; s = s->btps_next)
1074 : {
1075 : BlockNumber blkno;
1076 : BTPageOpaque opaque;
1077 :
1078 11222 : blkno = s->btps_blkno;
1079 11222 : opaque = BTPageGetOpaque((Page) s->btps_buf);
1080 :
1081 : /*
1082 : * We have to link the last page on this level to somewhere.
1083 : *
1084 : * If we're at the top, it's the root, so attach it to the metapage.
1085 : * Otherwise, add an entry for it to its parent using its low key.
1086 : * This may cause the last page of the parent level to split, but
1087 : * that's not a problem -- we haven't gotten to it yet.
1088 : */
1089 11222 : if (s->btps_next == NULL)
1090 : {
1091 8956 : opaque->btpo_flags |= BTP_ROOT;
1092 8956 : rootblkno = blkno;
1093 8956 : rootlevel = s->btps_level;
1094 : }
1095 : else
1096 : {
1097 : Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1098 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1099 : BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1100 : P_LEFTMOST(opaque));
1101 : Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1102 : !P_LEFTMOST(opaque));
1103 2266 : BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1104 2266 : _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1105 2266 : pfree(s->btps_lowkey);
1106 2266 : s->btps_lowkey = NULL;
1107 : }
1108 :
1109 : /*
1110 : * This is the rightmost page, so the ItemId array needs to be slid
1111 : * back one slot. Then we can dump out the page.
1112 : */
1113 11222 : _bt_slideleft((Page) s->btps_buf);
1114 11222 : _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
1115 11222 : s->btps_buf = NULL; /* writepage took ownership of the buffer */
1116 : }
1117 :
1118 : /*
1119 : * As the last step in the process, construct the metapage and make it
1120 : * point to the new root (unless we had no data at all, in which case it's
1121 : * set to point to "P_NONE"). This changes the index to the "valid" state
1122 : * by filling in a valid magic number in the metapage.
1123 : */
1124 45586 : metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1125 45586 : _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
1126 45586 : wstate->inskey->allequalimage);
1127 45586 : _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
1128 45586 : }
1129 :
1130 : /*
1131 : * Read tuples in correct sort order from tuplesort, and load them into
1132 : * btree leaves.
1133 : */
1134 : static void
1135 45586 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1136 : {
1137 45586 : BTPageState *state = NULL;
1138 45586 : bool merge = (btspool2 != NULL);
1139 : IndexTuple itup,
1140 45586 : itup2 = NULL;
1141 : bool load1;
1142 45586 : TupleDesc tupdes = RelationGetDescr(wstate->index);
1143 : int i,
1144 45586 : keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1145 : SortSupport sortKeys;
1146 45586 : int64 tuples_done = 0;
1147 : bool deduplicate;
1148 :
1149 45586 : wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
1150 :
1151 53956 : deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1152 8370 : BTGetDeduplicateItems(wstate->index);
1153 :
1154 45586 : if (merge)
1155 : {
1156 : /*
1157 : * Another BTSpool for dead tuples exists. Now we have to merge
1158 : * btspool and btspool2.
1159 : */
1160 :
1161 : /* the preparation of merge */
1162 18 : itup = tuplesort_getindextuple(btspool->sortstate, true);
1163 18 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1164 :
1165 : /* Prepare SortSupport data for each column */
1166 18 : sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1167 :
1168 38 : for (i = 0; i < keysz; i++)
1169 : {
1170 20 : SortSupport sortKey = sortKeys + i;
1171 20 : ScanKey scanKey = wstate->inskey->scankeys + i;
1172 : int16 strategy;
1173 :
1174 20 : sortKey->ssup_cxt = CurrentMemoryContext;
1175 20 : sortKey->ssup_collation = scanKey->sk_collation;
1176 20 : sortKey->ssup_nulls_first =
1177 20 : (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1178 20 : sortKey->ssup_attno = scanKey->sk_attno;
1179 : /* Abbreviation is not supported here */
1180 20 : sortKey->abbreviate = false;
1181 :
1182 : Assert(sortKey->ssup_attno != 0);
1183 :
1184 20 : strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1185 : BTGreaterStrategyNumber : BTLessStrategyNumber;
1186 :
1187 20 : PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1188 : }
1189 :
1190 : for (;;)
1191 : {
1192 3270 : load1 = true; /* load BTSpool next ? */
1193 3270 : if (itup2 == NULL)
1194 : {
1195 150 : if (itup == NULL)
1196 18 : break;
1197 : }
1198 3120 : else if (itup != NULL)
1199 : {
1200 2950 : int32 compare = 0;
1201 :
1202 3178 : for (i = 1; i <= keysz; i++)
1203 : {
1204 : SortSupport entry;
1205 : Datum attrDatum1,
1206 : attrDatum2;
1207 : bool isNull1,
1208 : isNull2;
1209 :
1210 2998 : entry = sortKeys + i - 1;
1211 2998 : attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1212 2998 : attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1213 :
1214 2998 : compare = ApplySortComparator(attrDatum1, isNull1,
1215 : attrDatum2, isNull2,
1216 : entry);
1217 2998 : if (compare > 0)
1218 : {
1219 256 : load1 = false;
1220 2770 : break;
1221 : }
1222 2742 : else if (compare < 0)
1223 2514 : break;
1224 : }
1225 :
1226 : /*
1227 : * If key values are equal, we sort on ItemPointer. This is
1228 : * required for btree indexes, since heap TID is treated as an
1229 : * implicit last key attribute in order to ensure that all
1230 : * keys in the index are physically unique.
1231 : */
1232 2950 : if (compare == 0)
1233 : {
1234 180 : compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1235 : Assert(compare != 0);
1236 180 : if (compare > 0)
1237 30 : load1 = false;
1238 : }
1239 : }
1240 : else
1241 170 : load1 = false;
1242 :
1243 : /* When we see first tuple, create first index page */
1244 3252 : if (state == NULL)
1245 18 : state = _bt_pagestate(wstate, 0);
1246 :
1247 3252 : if (load1)
1248 : {
1249 2796 : _bt_buildadd(wstate, state, itup, 0);
1250 2796 : itup = tuplesort_getindextuple(btspool->sortstate, true);
1251 : }
1252 : else
1253 : {
1254 456 : _bt_buildadd(wstate, state, itup2, 0);
1255 456 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1256 : }
1257 :
1258 : /* Report progress */
1259 3252 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1260 : ++tuples_done);
1261 : }
1262 18 : pfree(sortKeys);
1263 : }
1264 45568 : else if (deduplicate)
1265 : {
1266 : /* merge is unnecessary, deduplicate into posting lists */
1267 : BTDedupState dstate;
1268 :
1269 8370 : dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1270 8370 : dstate->deduplicate = true; /* unused */
1271 8370 : dstate->nmaxitems = 0; /* unused */
1272 8370 : dstate->maxpostingsize = 0; /* set later */
1273 : /* Metadata about base tuple of current pending posting list */
1274 8370 : dstate->base = NULL;
1275 8370 : dstate->baseoff = InvalidOffsetNumber; /* unused */
1276 8370 : dstate->basetupsize = 0;
1277 : /* Metadata about current pending posting list TIDs */
1278 8370 : dstate->htids = NULL;
1279 8370 : dstate->nhtids = 0;
1280 8370 : dstate->nitems = 0;
1281 8370 : dstate->phystupsize = 0; /* unused */
1282 8370 : dstate->nintervals = 0; /* unused */
1283 :
1284 5987898 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
1285 : true)) != NULL)
1286 : {
1287 : /* When we see first tuple, create first index page */
1288 5979528 : if (state == NULL)
1289 : {
1290 2296 : state = _bt_pagestate(wstate, 0);
1291 :
1292 : /*
1293 : * Limit size of posting list tuples to 1/10 space we want to
1294 : * leave behind on the page, plus space for final item's line
1295 : * pointer. This is equal to the space that we'd like to
1296 : * leave behind on each leaf page when fillfactor is 90,
1297 : * allowing us to get close to fillfactor% space utilization
1298 : * when there happen to be a great many duplicates. (This
1299 : * makes higher leaf fillfactor settings ineffective when
1300 : * building indexes that have many duplicates, but packing
1301 : * leaf pages full with few very large tuples doesn't seem
1302 : * like a useful goal.)
1303 : */
1304 2296 : dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1305 : sizeof(ItemIdData);
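/*
 * With the default BLCKSZ of 8192, MAXALIGN of 8, and 4-byte line
 * pointers (assumptions about a typical build), this comes out to
 * MAXALIGN_DOWN(819) - 4 = 812 bytes.
 */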
1306 : Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) &&
1307 : dstate->maxpostingsize <= INDEX_SIZE_MASK);
1308 2296 : dstate->htids = palloc(dstate->maxpostingsize);
1309 :
1310 : /* start new pending posting list with itup copy */
1311 2296 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1312 : InvalidOffsetNumber);
1313 : }
1314 5977232 : else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1315 1310846 : itup) > keysz &&
1316 1310846 : _bt_dedup_save_htid(dstate, itup))
1317 : {
1318 : /*
1319 : * Tuple is equal to base tuple of pending posting list. Heap
1320 : * TID from itup has been saved in state.
1321 : */
1322 : }
1323 : else
1324 : {
1325 : /*
1326 : * Tuple is not equal to pending posting list tuple, or
1327 : * _bt_dedup_save_htid() opted to not merge current item into
1328 : * pending posting list.
1329 : */
1330 4674086 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1331 4674086 : pfree(dstate->base);
1332 :
1333 : /* start new pending posting list with itup copy */
1334 4674086 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1335 : InvalidOffsetNumber);
1336 : }
1337 :
1338 : /* Report progress */
1339 5979528 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1340 : ++tuples_done);
1341 : }
1342 :
1343 8370 : if (state)
1344 : {
1345 : /*
1346 : * Handle the last item (there must be a last item when the
1347 : * tuplesort returned one or more tuples)
1348 : */
1349 2296 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1350 2296 : pfree(dstate->base);
1351 2296 : pfree(dstate->htids);
1352 : }
1353 :
1354 8370 : pfree(dstate);
1355 : }
1356 : else
1357 : {
1358 : /* merging and deduplication are both unnecessary */
1359 6001848 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
1360 : true)) != NULL)
1361 : {
1362 : /* When we see first tuple, create first index page */
1363 5964650 : if (state == NULL)
1364 6642 : state = _bt_pagestate(wstate, 0);
1365 :
1366 5964650 : _bt_buildadd(wstate, state, itup, 0);
1367 :
1368 : /* Report progress */
1369 5964650 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1370 : ++tuples_done);
1371 : }
1372 : }
1373 :
1374 : /* Close down final pages and write the metapage */
1375 45586 : _bt_uppershutdown(wstate, state);
1376 45586 : smgr_bulk_finish(wstate->bulkstate);
1377 45586 : }
1378 :
1379 : /*
1380 : * Create parallel context, and launch workers for leader.
1381 : *
1382 : * buildstate argument should be initialized (with the exception of the
1383 : * tuplesort state in spools, which may later be created based on shared
1384 : * state initially set up here).
1385 : *
1386 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1387 : *
1388 : * request is the target number of parallel worker processes to launch.
1389 : *
1390 : * Sets buildstate's BTLeader, which caller must use to shut down parallel
1391 : * mode by passing it to _bt_end_parallel() at the very end of its index
1392 : * build. If not even a single worker process can be launched, this is
1393 : * never set, and caller should proceed with a serial index build.
1394 : */
1395 : static void
1396 142 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1397 : {
1398 : ParallelContext *pcxt;
1399 : int scantuplesortstates;
1400 : Snapshot snapshot;
1401 : Size estbtshared;
1402 : Size estsort;
1403 : BTShared *btshared;
1404 : Sharedsort *sharedsort;
1405 : Sharedsort *sharedsort2;
1406 142 : BTSpool *btspool = buildstate->spool;
1407 142 : BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1408 : WalUsage *walusage;
1409 : BufferUsage *bufferusage;
1410 142 : bool leaderparticipates = true;
1411 : int querylen;
1412 :
1413 : #ifdef DISABLE_LEADER_PARTICIPATION
1414 : leaderparticipates = false;
1415 : #endif
1416 :
1417 : /*
1418 : * Enter parallel mode, and create context for parallel build of btree
1419 : * index
1420 : */
1421 142 : EnterParallelMode();
1422 : Assert(request > 0);
1423 142 : pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1424 : request);
1425 :
1426 142 : scantuplesortstates = leaderparticipates ? request + 1 : request;
1427 :
1428 : /*
1429 : * Prepare for scan of the base relation. In a normal index build, we use
1430 : * SnapshotAny because we must retrieve all tuples and do our own time
1431 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1432 : * concurrent build, we take a regular MVCC snapshot and index whatever's
1433 : * live according to that.
1434 : */
1435 142 : if (!isconcurrent)
1436 142 : snapshot = SnapshotAny;
1437 : else
1438 0 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
1439 :
1440 : /*
1441 : * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1442 : * PARALLEL_KEY_TUPLESORT tuplesort workspace
1443 : */
1444 142 : estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1445 142 : shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1446 142 : estsort = tuplesort_estimate_shared(scantuplesortstates);
1447 142 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1448 :
1449 : /*
1450 : * Unique case requires a second spool, and so we may have to account for
1451 : * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1452 : */
1453 142 : if (!btspool->isunique)
1454 78 : shm_toc_estimate_keys(&pcxt->estimator, 2);
1455 : else
1456 : {
1457 64 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1458 64 : shm_toc_estimate_keys(&pcxt->estimator, 3);
1459 : }
1460 :
1461 : /*
1462 : * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1463 : * and PARALLEL_KEY_BUFFER_USAGE.
1464 : *
1465 : * If there are no extensions loaded that care, we could skip this. We
1466 : * have no way of knowing whether anyone's looking at pgWalUsage or
1467 : * pgBufferUsage, so do it unconditionally.
1468 : */
1469 142 : shm_toc_estimate_chunk(&pcxt->estimator,
1470 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1471 142 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1472 142 : shm_toc_estimate_chunk(&pcxt->estimator,
1473 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1474 142 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1475 :
1476 : /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1477 142 : if (debug_query_string)
1478 : {
1479 142 : querylen = strlen(debug_query_string);
1480 142 : shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1481 142 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1482 : }
1483 : else
1484 0 : querylen = 0; /* keep compiler quiet */
1485 :
1486 : /* Everyone's had a chance to ask for space, so now create the DSM */
1487 142 : InitializeParallelDSM(pcxt);
1488 :
1489 : /* If no DSM segment was available, back out (do serial build) */
1490 142 : if (pcxt->seg == NULL)
1491 : {
1492 0 : if (IsMVCCSnapshot(snapshot))
1493 0 : UnregisterSnapshot(snapshot);
1494 0 : DestroyParallelContext(pcxt);
1495 0 : ExitParallelMode();
1496 0 : return;
1497 : }
1498 :
1499 : /* Store shared build state, for which we reserved space */
1500 142 : btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1501 : /* Initialize immutable state */
1502 142 : btshared->heaprelid = RelationGetRelid(btspool->heap);
1503 142 : btshared->indexrelid = RelationGetRelid(btspool->index);
1504 142 : btshared->isunique = btspool->isunique;
1505 142 : btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1506 142 : btshared->isconcurrent = isconcurrent;
1507 142 : btshared->scantuplesortstates = scantuplesortstates;
1508 142 : ConditionVariableInit(&btshared->workersdonecv);
1509 142 : SpinLockInit(&btshared->mutex);
1510 : /* Initialize mutable state */
1511 142 : btshared->nparticipantsdone = 0;
1512 142 : btshared->reltuples = 0.0;
1513 142 : btshared->havedead = false;
1514 142 : btshared->indtuples = 0.0;
1515 142 : btshared->brokenhotchain = false;
1516 142 : table_parallelscan_initialize(btspool->heap,
1517 : ParallelTableScanFromBTShared(btshared),
1518 : snapshot);
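 /*
 * Note that the parallel scan descriptor is not a separate TOC chunk: it
 * occupies the variable-size tail of the BTShared chunk, which is why
 * _bt_parallel_estimate_shared() adds table_parallelscan_estimate() to a
 * BUFFERALIGN'd sizeof(BTShared). ParallelTableScanFromBTShared simply
 * points past the fixed-size fields.
 */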
1519 :
1520 : /*
1521 : * Store shared tuplesort-private state, for which we reserved space.
1522 : * Then, initialize opaque state using tuplesort routine.
1523 : */
1524 142 : sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1525 142 : tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1526 : pcxt->seg);
1527 :
1528 142 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1529 142 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1530 :
1531 : /* Unique case requires a second spool, and associated shared state */
1532 142 : if (!btspool->isunique)
1533 78 : sharedsort2 = NULL;
1534 : else
1535 : {
1536 : /*
1537 : * Store additional shared tuplesort-private state, for which we
1538 : * reserved space. Then, initialize opaque state using tuplesort
1539 : * routine.
1540 : */
1541 64 : sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1542 64 : tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1543 : pcxt->seg);
1544 :
1545 64 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1546 : }
1547 :
1548 : /* Store query string for workers */
1549 142 : if (debug_query_string)
1550 : {
1551 : char *sharedquery;
1552 :
1553 142 : sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1554 142 : memcpy(sharedquery, debug_query_string, querylen + 1);
1555 142 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1556 : }
1557 :
1558 : /*
1559 : * Allocate space for each worker's WalUsage and BufferUsage; no need to
1560 : * initialize.
1561 : */
1562 142 : walusage = shm_toc_allocate(pcxt->toc,
1563 142 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1564 142 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1565 142 : bufferusage = shm_toc_allocate(pcxt->toc,
1566 142 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1567 142 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1568 :
1569 : /* Launch workers, saving status for leader/caller */
1570 142 : LaunchParallelWorkers(pcxt);
1571 142 : btleader->pcxt = pcxt;
1572 142 : btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1573 142 : if (leaderparticipates)
1574 142 : btleader->nparticipanttuplesorts++;
1575 142 : btleader->btshared = btshared;
1576 142 : btleader->sharedsort = sharedsort;
1577 142 : btleader->sharedsort2 = sharedsort2;
1578 142 : btleader->snapshot = snapshot;
1579 142 : btleader->walusage = walusage;
1580 142 : btleader->bufferusage = bufferusage;
1581 :
1582 : /* If no workers were successfully launched, back out (do serial build) */
1583 142 : if (pcxt->nworkers_launched == 0)
1584 : {
1585 0 : _bt_end_parallel(btleader);
1586 0 : return;
1587 : }
1588 :
1589 : /* Save leader state now that it's clear build will be parallel */
1590 142 : buildstate->btleader = btleader;
1591 :
1592 : /* Join heap scan ourselves */
1593 142 : if (leaderparticipates)
1594 142 : _bt_leader_participate_as_worker(buildstate);
1595 :
1596 : /*
1597 : * Caller needs to wait for all launched workers when we return. Make
1598 : * sure that the failure-to-start case will not hang forever.
1599 : */
1600 142 : WaitForParallelWorkersToAttach(pcxt);
1601 : }
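The shared-memory setup above follows the standard parallel-context protocol: every chunk must be estimated before the DSM segment exists, then allocated from the segment and published under its key. Below is a minimal sketch of that lifecycle, using a hypothetical key and payload rather than anything in this file:

#define PARALLEL_KEY_EXAMPLE	UINT64CONST(0xA00000000000000F) /* hypothetical */

static void
example_toc_lifecycle(ParallelContext *pcxt, Size payloadsize)
{
	char	   *payload;

	/* 1. Reserve space and one key slot before the segment is created. */
	shm_toc_estimate_chunk(&pcxt->estimator, payloadsize);
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* 2. Create the DSM segment, sized from the estimator. */
	InitializeParallelDSM(pcxt);

	/* 3. Carve out the reserved chunk and publish it under the key. */
	payload = (char *) shm_toc_allocate(pcxt->toc, payloadsize);
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXAMPLE, payload);
}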
1602 :
1603 : /*
1604 : * Shut down workers, destroy parallel context, and end parallel mode.
1605 : */
1606 : static void
1607 142 : _bt_end_parallel(BTLeader *btleader)
1608 : {
1609 : int i;
1610 :
1611 : /* Shutdown worker processes */
1612 142 : WaitForParallelWorkersToFinish(btleader->pcxt);
1613 :
1614 : /*
 1615 : * Next, accumulate WAL and buffer usage. (This must wait for the
 1616 : * workers to finish, or we might get incomplete data.)
1617 : */
1618 284 : for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1619 142 : InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1620 :
1621 : /* Free last reference to MVCC snapshot, if one was used */
1622 142 : if (IsMVCCSnapshot(btleader->snapshot))
1623 0 : UnregisterSnapshot(btleader->snapshot);
1624 142 : DestroyParallelContext(btleader->pcxt);
1625 142 : ExitParallelMode();
1626 142 : }
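The accumulation loop depends on an ordering invariant: each worker fills its walusage/bufferusage slot inside InstrEndParallelQuery() before exiting (see _bt_parallel_build_main() below), so the leader may read those slots only after WaitForParallelWorkersToFinish() returns. Schematically:

/* worker side, in _bt_parallel_build_main(): */
InstrStartParallelQuery();
/* ... parallel scan and sort ... */
InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
					  &walusage[ParallelWorkerNumber]);

/* leader side, strictly after the workers have finished: */
WaitForParallelWorkersToFinish(btleader->pcxt);
for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
	InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);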
1627 :
1628 : /*
1629 : * Returns size of shared memory required to store state for a parallel
1630 : * btree index build based on the snapshot its parallel scan will use.
1631 : */
1632 : static Size
1633 142 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1634 : {
 1635 : /* cf. shm_toc_allocate as to why BUFFERALIGN is used */
1636 142 : return add_size(BUFFERALIGN(sizeof(BTShared)),
1637 : table_parallelscan_estimate(heap, snapshot));
1638 : }
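add_size() is overflow-checked addition, and BUFFERALIGN() rounds its argument up to the buffer alignment boundary so that the parallel scan descriptor stored just after the fixed-size struct is suitably aligned. Assuming the usual ALIGNOF_BUFFER of 32, the rounding is (len + 31) & ~31, so for instance:

/* BUFFERALIGN(100) == 128; BUFFERALIGN(128) == 128 (already aligned) */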
1639 :
1640 : /*
1641 : * Within leader, wait for end of heap scan.
1642 : *
 1643 : * When called, the parallel heap scan started by _bt_begin_parallel()
 1644 : * will already be underway within worker processes (when the leader
 1645 : * participates as a worker, we should end up here just as workers finish).
 1646 : *
 1647 : * Fills in fields needed for ambuild statistics, and lets the caller set
 1648 : * the field indicating that some worker encountered a broken HOT chain.
1649 : *
1650 : * Returns the total number of heap tuples scanned.
1651 : */
1652 : static double
1653 142 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1654 : {
1655 142 : BTShared *btshared = buildstate->btleader->btshared;
1656 : int nparticipanttuplesorts;
1657 : double reltuples;
1658 :
1659 142 : nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1660 : for (;;)
1661 : {
1662 366 : SpinLockAcquire(&btshared->mutex);
1663 366 : if (btshared->nparticipantsdone == nparticipanttuplesorts)
1664 : {
1665 142 : buildstate->havedead = btshared->havedead;
1666 142 : buildstate->indtuples = btshared->indtuples;
1667 142 : *brokenhotchain = btshared->brokenhotchain;
1668 142 : reltuples = btshared->reltuples;
1669 142 : SpinLockRelease(&btshared->mutex);
1670 142 : break;
1671 : }
1672 224 : SpinLockRelease(&btshared->mutex);
1673 :
1674 224 : ConditionVariableSleep(&btshared->workersdonecv,
1675 : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1676 : }
1677 :
1678 142 : ConditionVariableCancelSleep();
1679 :
1680 142 : return reltuples;
1681 : }
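The wait loop above is the canonical condition-variable idiom: re-test the predicate under the spinlock on every wakeup (a sleeper can be signaled before the condition it cares about holds), copy out any results while still holding the lock, and cancel the sleep once the loop exits. Stripped of the btree specifics (schematic names only):

for (;;)
{
	bool		done;

	SpinLockAcquire(&shared->mutex);
	done = (shared->nparticipantsdone == nparticipants);
	if (done)
	{
		/* copy results out while the spinlock is still held */
	}
	SpinLockRelease(&shared->mutex);
	if (done)
		break;
	ConditionVariableSleep(&shared->workersdonecv, wait_event_info);
}
ConditionVariableCancelSleep();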
1682 :
1683 : /*
1684 : * Within leader, participate as a parallel worker.
1685 : */
1686 : static void
1687 142 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
1688 : {
1689 142 : BTLeader *btleader = buildstate->btleader;
1690 : BTSpool *leaderworker;
1691 : BTSpool *leaderworker2;
1692 : int sortmem;
1693 :
1694 : /* Allocate memory and initialize private spool */
1695 142 : leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1696 142 : leaderworker->heap = buildstate->spool->heap;
1697 142 : leaderworker->index = buildstate->spool->index;
1698 142 : leaderworker->isunique = buildstate->spool->isunique;
1699 142 : leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1700 :
1701 : /* Initialize second spool, if required */
1702 142 : if (!btleader->btshared->isunique)
1703 78 : leaderworker2 = NULL;
1704 : else
1705 : {
1706 : /* Allocate memory for worker's own private secondary spool */
1707 64 : leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1708 :
1709 : /* Initialize worker's own secondary spool */
1710 64 : leaderworker2->heap = leaderworker->heap;
1711 64 : leaderworker2->index = leaderworker->index;
1712 64 : leaderworker2->isunique = false;
1713 : }
1714 :
1715 : /*
 1716 : * Might as well use a reliable figure when doling out
 1717 : * maintenance_work_mem (when the requested number of workers was not
 1718 : * launched, this will be somewhat higher than it is for other workers).
1719 : */
1720 142 : sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1721 :
1722 : /* Perform work common to all participants */
1723 142 : _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1724 : btleader->sharedsort, btleader->sharedsort2,
1725 : sortmem, true);
1726 :
1727 : #ifdef BTREE_BUILD_STATS
1728 : if (log_btree_build_stats)
1729 : {
1730 : ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1731 : ResetUsage();
1732 : }
1733 : #endif /* BTREE_BUILD_STATS */
1734 142 : }
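To make the comment above concrete: with maintenance_work_mem = 65536 kB, two workers requested, and a participating leader, scantuplesortstates is 3, so each worker's sortmem is 65536 / 3 = 21845 kB. If only one of the two workers actually launched, the workers still divide by the pre-launch figure of 3, while the leader divides by nparticipanttuplesorts = 2 and gets 32768 kB -- the "somewhat higher" share the comment refers to.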
1735 :
1736 : /*
1737 : * Perform work within a launched parallel process.
1738 : */
1739 : void
1740 142 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1741 : {
1742 : char *sharedquery;
1743 : BTSpool *btspool;
1744 : BTSpool *btspool2;
1745 : BTShared *btshared;
1746 : Sharedsort *sharedsort;
1747 : Sharedsort *sharedsort2;
1748 : Relation heapRel;
1749 : Relation indexRel;
1750 : LOCKMODE heapLockmode;
1751 : LOCKMODE indexLockmode;
1752 : WalUsage *walusage;
1753 : BufferUsage *bufferusage;
1754 : int sortmem;
1755 :
1756 : #ifdef BTREE_BUILD_STATS
1757 : if (log_btree_build_stats)
1758 : ResetUsage();
1759 : #endif /* BTREE_BUILD_STATS */
1760 :
1761 : /*
 1762 : * The only status flag that can be set for the parallel worker is
1763 : * PROC_IN_SAFE_IC.
1764 : */
1765 : Assert((MyProc->statusFlags == 0) ||
1766 : (MyProc->statusFlags == PROC_IN_SAFE_IC));
1767 :
1768 : /* Set debug_query_string for individual workers first */
1769 142 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1770 142 : debug_query_string = sharedquery;
1771 :
1772 : /* Report the query string from leader */
1773 142 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
1774 :
1775 : /* Look up nbtree shared state */
1776 142 : btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1777 :
1778 : /* Open relations using lock modes known to be obtained by index.c */
1779 142 : if (!btshared->isconcurrent)
1780 : {
1781 142 : heapLockmode = ShareLock;
1782 142 : indexLockmode = AccessExclusiveLock;
1783 : }
1784 : else
1785 : {
1786 0 : heapLockmode = ShareUpdateExclusiveLock;
1787 0 : indexLockmode = RowExclusiveLock;
1788 : }
1789 :
1790 : /* Open relations within worker */
1791 142 : heapRel = table_open(btshared->heaprelid, heapLockmode);
1792 142 : indexRel = index_open(btshared->indexrelid, indexLockmode);
1793 :
1794 : /* Initialize worker's own spool */
1795 142 : btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1796 142 : btspool->heap = heapRel;
1797 142 : btspool->index = indexRel;
1798 142 : btspool->isunique = btshared->isunique;
1799 142 : btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1800 :
1801 : /* Look up shared state private to tuplesort.c */
1802 142 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1803 142 : tuplesort_attach_shared(sharedsort, seg);
1804 142 : if (!btshared->isunique)
1805 : {
1806 78 : btspool2 = NULL;
1807 78 : sharedsort2 = NULL;
1808 : }
1809 : else
1810 : {
1811 : /* Allocate memory for worker's own private secondary spool */
1812 64 : btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1813 :
1814 : /* Initialize worker's own secondary spool */
1815 64 : btspool2->heap = btspool->heap;
1816 64 : btspool2->index = btspool->index;
1817 64 : btspool2->isunique = false;
1818 : /* Look up shared state private to tuplesort.c */
1819 64 : sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1820 64 : tuplesort_attach_shared(sharedsort2, seg);
1821 : }
1822 :
1823 : /* Prepare to track buffer usage during parallel execution */
1824 142 : InstrStartParallelQuery();
1825 :
1826 : /* Perform sorting of spool, and possibly a spool2 */
1827 142 : sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1828 142 : _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1829 : sharedsort2, sortmem, false);
1830 :
1831 : /* Report WAL/buffer usage during parallel execution */
1832 142 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1833 142 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1834 142 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1835 142 : &walusage[ParallelWorkerNumber]);
1836 :
1837 : #ifdef BTREE_BUILD_STATS
1838 : if (log_btree_build_stats)
1839 : {
1840 : ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1841 : ResetUsage();
1842 : }
1843 : #endif /* BTREE_BUILD_STATS */
1844 :
1845 142 : index_close(indexRel, indexLockmode);
1846 142 : table_close(heapRel, heapLockmode);
1847 142 : }
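Note the asymmetry in the shm_toc_lookup() calls above: the third argument is noError. The query text is fetched with true, because that chunk is absent whenever the leader had no debug_query_string, while the mandatory chunks use false and raise an error if missing:

sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);	/* NULL if absent */
btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);	/* ERROR if absent */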
1848 :
1849 : /*
1850 : * Perform a worker's portion of a parallel sort.
1851 : *
 1852 : * This generates a tuplesort for the passed btspool, and a second tuplesort
 1853 : * state if a second btspool is needed (i.e. for unique index builds). All
1854 : * other spool fields should already be set when this is called.
1855 : *
1856 : * sortmem is the amount of working memory to use within each worker,
1857 : * expressed in KBs.
1858 : *
1859 : * When this returns, workers are done, and need only release resources.
1860 : */
1861 : static void
1862 284 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1863 : BTShared *btshared, Sharedsort *sharedsort,
1864 : Sharedsort *sharedsort2, int sortmem, bool progress)
1865 : {
1866 : SortCoordinate coordinate;
1867 : BTBuildState buildstate;
1868 : TableScanDesc scan;
1869 : double reltuples;
1870 : IndexInfo *indexInfo;
1871 :
1872 : /* Initialize local tuplesort coordination state */
1873 284 : coordinate = palloc0(sizeof(SortCoordinateData));
1874 284 : coordinate->isWorker = true;
1875 284 : coordinate->nParticipants = -1;
1876 284 : coordinate->sharedsort = sharedsort;
1877 :
1878 : /* Begin "partial" tuplesort */
1879 568 : btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1880 : btspool->index,
1881 284 : btspool->isunique,
1882 284 : btspool->nulls_not_distinct,
1883 : sortmem, coordinate,
1884 : TUPLESORT_NONE);
1885 :
1886 : /*
 1887 : * Just as in the serial case, there may be a second spool. If so, a
 1888 : * second, dedicated partial tuplesort is required for spool2.
1889 : */
1890 284 : if (btspool2)
1891 : {
1892 : SortCoordinate coordinate2;
1893 :
1894 : /*
1895 : * We expect that the second one (for dead tuples) won't get very
 1896 : * full, so we give it only work_mem (unless sortmem is less for the
 1897 : * worker). Worker processes are generally permitted to allocate
1898 : * work_mem independently.
1899 : */
1900 128 : coordinate2 = palloc0(sizeof(SortCoordinateData));
1901 128 : coordinate2->isWorker = true;
1902 128 : coordinate2->nParticipants = -1;
1903 128 : coordinate2->sharedsort = sharedsort2;
1904 128 : btspool2->sortstate =
1905 128 : tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1906 : Min(sortmem, work_mem), coordinate2,
 1907 : TUPLESORT_NONE);
1908 : }
1909 :
1910 : /* Fill in buildstate for _bt_build_callback() */
1911 284 : buildstate.isunique = btshared->isunique;
1912 284 : buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1913 284 : buildstate.havedead = false;
1914 284 : buildstate.heap = btspool->heap;
1915 284 : buildstate.spool = btspool;
1916 284 : buildstate.spool2 = btspool2;
1917 284 : buildstate.indtuples = 0;
1918 284 : buildstate.btleader = NULL;
1919 :
1920 : /* Join parallel scan */
1921 284 : indexInfo = BuildIndexInfo(btspool->index);
1922 284 : indexInfo->ii_Concurrent = btshared->isconcurrent;
1923 284 : scan = table_beginscan_parallel(btspool->heap,
1924 : ParallelTableScanFromBTShared(btshared));
1925 284 : reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1926 : true, progress, _bt_build_callback,
1927 : (void *) &buildstate, scan);
1928 :
1929 : /* Execute this worker's part of the sort */
1930 284 : if (progress)
1931 142 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1932 : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
1933 284 : tuplesort_performsort(btspool->sortstate);
1934 284 : if (btspool2)
1935 : {
1936 128 : if (progress)
1937 64 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1938 : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
1939 128 : tuplesort_performsort(btspool2->sortstate);
1940 : }
1941 :
1942 : /*
1943 : * Done. Record ambuild statistics, and whether we encountered a broken
1944 : * HOT chain.
1945 : */
1946 284 : SpinLockAcquire(&btshared->mutex);
1947 284 : btshared->nparticipantsdone++;
1948 284 : btshared->reltuples += reltuples;
1949 284 : if (buildstate.havedead)
1950 0 : btshared->havedead = true;
1951 284 : btshared->indtuples += buildstate.indtuples;
1952 284 : if (indexInfo->ii_BrokenHotChain)
1953 0 : btshared->brokenhotchain = true;
1954 284 : SpinLockRelease(&btshared->mutex);
1955 :
1956 : /* Notify leader */
1957 284 : ConditionVariableSignal(&btshared->workersdonecv);
1958 :
1959 : /* We can end tuplesorts immediately */
1960 284 : tuplesort_end(btspool->sortstate);
1961 284 : if (btspool2)
1962 128 : tuplesort_end(btspool2->sortstate);
1963 284 : }
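Every participant, the leader included, runs this function with a coordinate marked isWorker = true and nParticipants = -1: each one contributes sorted runs to the shared tuplesort state. The complementary leader-side coordinate (isWorker = false, nParticipants set to the actual participant count), which is set up earlier in this file when the leader's own merging sortstate is created, is what drives the final merge. In outline:

/* participant ("worker") coordinate: contribute one partial sort */
coordinate->isWorker = true;
coordinate->nParticipants = -1;
coordinate->sharedsort = sharedsort;

/* leader coordinate (used with the leader's own sortstate): merge the runs */
coordinate->isWorker = false;
coordinate->nParticipants = nparticipanttuplesorts;
coordinate->sharedsort = sharedsort;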