Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsort.c
4 : * Build a btree from sorted input by loading leaf pages sequentially.
5 : *
6 : * NOTES
7 : *
8 : * We use tuplesort.c to sort the given index tuples into order.
9 : * Then we scan the index tuples in order and build the btree pages
10 : * for each level. We load source tuples into leaf-level pages.
11 : * Whenever we fill a page at one level, we add a link to it to its
12 : * parent level (starting a new parent level if necessary). When
13 : * done, we write out each final page on each level, adding it to
14 : * its parent level. When we have only one page on a level, it must be
15 : * the root -- it can be attached to the btree metapage and we are done.
16 : *
17 : * It is not wise to pack the pages entirely full, since then *any*
18 : * insertion would cause a split (and not only of the leaf page; the need
19 : * for a split would cascade right up the tree). The steady-state load
20 : * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 : * pages to the user-controllable fill factor (default 90%) while upper pages
22 : * are always packed to 70%. This gives us reasonable density (there aren't
23 : * many upper pages if the keys are reasonable-size) without risking a lot of
24 : * cascading splits during early insertions.
25 : *
26 : * We use the bulk smgr loading facility to bypass the buffer cache and
27 : * WAL-log the pages efficiently.
28 : *
29 : * This code isn't concerned about the FSM at all. The caller is responsible
30 : * for initializing that.
31 : *
32 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
33 : * Portions Copyright (c) 1994, Regents of the University of California
34 : *
35 : * IDENTIFICATION
36 : * src/backend/access/nbtree/nbtsort.c
37 : *
38 : *-------------------------------------------------------------------------
39 : */
40 :
41 : #include "postgres.h"
42 :
43 : #include "access/nbtree.h"
44 : #include "access/parallel.h"
45 : #include "access/relscan.h"
46 : #include "access/table.h"
47 : #include "access/tableam.h"
48 : #include "access/xact.h"
49 : #include "catalog/index.h"
50 : #include "commands/progress.h"
51 : #include "executor/instrument.h"
52 : #include "miscadmin.h"
53 : #include "pgstat.h"
54 : #include "storage/bulk_write.h"
55 : #include "storage/condition_variable.h"
56 : #include "storage/proc.h"
57 : #include "tcop/tcopprot.h"
58 : #include "utils/rel.h"
59 : #include "utils/sortsupport.h"
60 : #include "utils/tuplesort.h"
61 : #include "utils/wait_event.h"
62 :
63 :
64 : /* Magic numbers for parallel state sharing */
65 : #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
66 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
67 : #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
68 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
69 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
70 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
71 :
72 : /*
73 : * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
74 : * parallel index builds. This may be useful as a debugging aid.
75 : */
76 : /* #define DISABLE_LEADER_PARTICIPATION */
77 :
78 : /*
79 : * Status record for spooling/sorting phase. (Note we may have two of
80 : * these due to the special requirements for uniqueness-checking with
81 : * dead tuples.)
82 : */
 83 : typedef struct BTSpool
 84 : {
 85 : Tuplesortstate *sortstate; /* state data for tuplesort.c */
 86 : Relation heap; /* heap relation being scanned */
 87 : Relation index; /* index relation being built */
 88 : bool isunique; /* building a unique index? */
 89 : bool nulls_not_distinct; /* value of indexInfo->ii_NullsNotDistinct */
 90 : } BTSpool;
91 :
92 : /*
93 : * Status for index builds performed in parallel. This is allocated in a
94 : * dynamic shared memory segment. Note that there is a separate tuplesort TOC
95 : * entry, private to tuplesort.c but allocated by this module on its behalf.
96 : */
 97 : typedef struct BTShared
 98 : {
 99 : /*
 100 : * These fields are not modified during the sort. They primarily exist
 101 : * for the benefit of worker processes that need to create BTSpool state
 102 : * corresponding to that used by the leader.
 103 : */
 104 : Oid heaprelid; /* OID of heap relation to scan */
 105 : Oid indexrelid; /* OID of index relation to build */
 106 : bool isunique; /* building a unique index? */
 107 : bool nulls_not_distinct; /* value of indexInfo->ii_NullsNotDistinct */
 108 : bool isconcurrent; /* value of indexInfo->ii_Concurrent */
 109 : int scantuplesortstates; /* number of participant tuplesorts */
 110 :
 111 : /* Query ID, for report in worker processes */
 112 : int64 queryid;
 113 :
 114 : /*
 115 : * workersdonecv is used to monitor the progress of workers. All parallel
 116 : * participants must indicate that they are done before leader can use
 117 : * mutable state that workers maintain during scan (and before leader can
 118 : * proceed to tuplesort_performsort()).
 119 : */
 120 : ConditionVariable workersdonecv;
 121 :
 122 : /*
 123 : * mutex protects all fields before heapdesc.
 124 : *
 125 : * These fields contain status information of interest to B-Tree index
 126 : * builds that must work just the same when an index is built in parallel.
 127 : */
 128 : slock_t mutex;
 129 :
 130 : /*
 131 : * Mutable state that is maintained by workers, and reported back to
 132 : * leader at end of parallel scan.
 133 : *
 134 : * nparticipantsdone is number of worker processes finished.
 135 : *
 136 : * reltuples is the total number of input heap tuples.
 137 : *
 138 : * havedead indicates if RECENTLY_DEAD tuples were encountered during
 139 : * build.
 140 : *
 141 : * indtuples is the total number of tuples that made it into the index.
 142 : *
 143 : * brokenhotchain indicates if any worker detected a broken HOT chain
 144 : * during build.
 145 : */
 146 : int nparticipantsdone;
 147 : double reltuples;
 148 : bool havedead;
 149 : double indtuples;
 150 : bool brokenhotchain;
 151 :
 152 : /*
 153 : * ParallelTableScanDescData data follows. Can't directly embed here, as
 154 : * implementations of the parallel table scan desc interface might need
 155 : * stronger alignment.
 156 : */
 157 : } BTShared;
158 :
159 : /*
160 : * Return pointer to a BTShared's parallel table scan.
161 : *
162 : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
163 : * MAXALIGN.
164 : */
165 : #define ParallelTableScanFromBTShared(shared) \
166 : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
167 :
168 : /*
169 : * Status for leader in parallel index build.
170 : */
 171 : typedef struct BTLeader
 172 : {
 173 : /* parallel context itself */
 174 : ParallelContext *pcxt;
 175 :
 176 : /*
 177 : * nparticipanttuplesorts is the exact number of worker processes
 178 : * successfully launched, plus one leader process if it participates as a
 179 : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
 180 : * participating as a worker).
 181 : */
 182 : int nparticipanttuplesorts;
 183 :
 184 : /*
 185 : * Leader process convenience pointers to shared state (leader avoids TOC
 186 : * lookups).
 187 : *
 188 : * btshared is the shared state for entire build. sharedsort is the
 189 : * shared, tuplesort-managed state passed to each process tuplesort.
 190 : * sharedsort2 is the corresponding btspool2 shared state, used only when
 191 : * building unique indexes. snapshot is the snapshot used by the scan iff
 192 : * an MVCC snapshot is required.
 193 : */
 194 : BTShared *btshared;
 195 : Sharedsort *sharedsort;
 196 : Sharedsort *sharedsort2;
 197 : Snapshot snapshot;
 198 : WalUsage *walusage; /* shared WAL usage instrumentation (PARALLEL_KEY_WAL_USAGE) */
 199 : BufferUsage *bufferusage; /* shared buffer usage instrumentation (PARALLEL_KEY_BUFFER_USAGE) */
 200 : } BTLeader;
201 :
202 : /*
203 : * Working state for btbuild and its callback.
204 : *
205 : * When parallel CREATE INDEX is used, there is a BTBuildState for each
206 : * participant.
207 : */
 208 : typedef struct BTBuildState
 209 : {
 210 : bool isunique; /* building a unique index? */
 211 : bool nulls_not_distinct; /* value of indexInfo->ii_NullsNotDistinct */
 212 : bool havedead; /* true if any dead tuples were spooled */
 213 : Relation heap; /* heap relation being indexed */
 214 : BTSpool *spool; /* spool for live index tuples */
 215 :
 216 : /*
 217 : * spool2 is needed only when the index is a unique index. Dead tuples are
 218 : * put into spool2 instead of spool in order to avoid uniqueness check.
 219 : */
 220 : BTSpool *spool2;
 221 : double indtuples; /* # tuples spooled into the index so far */
 222 :
 223 : /*
 224 : * btleader is only present when a parallel index build is performed, and
 225 : * only in the leader process. (Actually, only the leader has a
 226 : * BTBuildState. Workers have their own spool and spool2, though.)
 227 : */
 228 : BTLeader *btleader;
 229 : } BTBuildState;
230 :
231 : /*
232 : * Status record for a btree page being built. We have one of these
233 : * for each active tree level.
234 : */
 235 : typedef struct BTPageState
 236 : {
 237 : BulkWriteBuffer btps_buf; /* workspace for page building; ownership passes
 * to smgr_bulk_write() when page is finished */
 238 : BlockNumber btps_blkno; /* block # to write this page at */
 239 : IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
 240 : OffsetNumber btps_lastoff; /* last item offset loaded */
 241 : Size btps_lastextra; /* last item's extra posting list space */
 242 : uint32 btps_level; /* tree level (0 = leaf) */
 243 : Size btps_full; /* "full" if less than this much free space */
 244 : struct BTPageState *btps_next; /* link to parent level, if any */
 245 : } BTPageState;
246 :
247 : /*
248 : * Overall status record for index writing phase.
249 : */
 250 : typedef struct BTWriteState
 251 : {
 252 : Relation heap; /* source heap relation */
 253 : Relation index; /* index relation being written */
 254 : BulkWriteState *bulkstate; /* smgr bulk-write state (bypasses buffer cache) */
 255 : BTScanInsert inskey; /* generic insertion scankey */
 256 : BlockNumber btws_pages_alloced; /* # pages allocated */
 257 : } BTWriteState;
258 :
259 :
260 : static double _bt_spools_heapscan(Relation heap, Relation index,
261 : BTBuildState *buildstate, IndexInfo *indexInfo);
262 : static void _bt_spooldestroy(BTSpool *btspool);
263 : static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
264 : const Datum *values, const bool *isnull);
265 : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
266 : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
267 : bool *isnull, bool tupleIsAlive, void *state);
268 : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
269 : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
270 : static void _bt_slideleft(Page rightmostpage);
271 : static void _bt_sortaddtup(Page page, Size itemsize,
272 : const IndexTupleData *itup, OffsetNumber itup_off,
273 : bool newfirstdataitem);
274 : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
275 : IndexTuple itup, Size truncextra);
276 : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
277 : BTPageState *state,
278 : BTDedupState dstate);
279 : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
280 : static void _bt_load(BTWriteState *wstate,
281 : BTSpool *btspool, BTSpool *btspool2);
282 : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
283 : int request);
284 : static void _bt_end_parallel(BTLeader *btleader);
285 : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
286 : static double _bt_parallel_heapscan(BTBuildState *buildstate,
287 : bool *brokenhotchain);
288 : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
289 : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
290 : BTShared *btshared, Sharedsort *sharedsort,
291 : Sharedsort *sharedsort2, int sortmem,
292 : bool progress);
293 :
294 :
295 : /*
296 : * btbuild() -- build a new btree index.
297 : */
 298 : IndexBuildResult *
 299 30934 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 300 : {
 301 : IndexBuildResult *result;
 302 : BTBuildState buildstate;
 303 : double reltuples;
 304 :
 305 : #ifdef BTREE_BUILD_STATS
 306 : if (log_btree_build_stats)
 307 : ResetUsage();
 308 : #endif /* BTREE_BUILD_STATS */
 309 :
 : /* Set up the leader's per-build working state */
 310 30934 : buildstate.isunique = indexInfo->ii_Unique;
 311 30934 : buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
 312 30934 : buildstate.havedead = false;
 313 30934 : buildstate.heap = heap;
 314 30934 : buildstate.spool = NULL;
 315 30934 : buildstate.spool2 = NULL;
 316 30934 : buildstate.indtuples = 0;
 317 30934 : buildstate.btleader = NULL;
 318 :
 319 : /*
 320 : * We expect to be called exactly once for any index relation. If that's
 321 : * not the case, big trouble's what we have.
 322 : */
 323 30934 : if (RelationGetNumberOfBlocks(index) != 0)
 324 0 : elog(ERROR, "index \"%s\" already contains data",
 325 : RelationGetRelationName(index));
 326 :
 : /* Scan the heap (serially or in parallel), filling the spool(s) */
 327 30934 : reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
 328 :
 329 : /*
 330 : * Finish the build by (1) completing the sort of the spool file, (2)
 331 : * inserting the sorted tuples into btree pages and (3) building the upper
 332 : * levels. Finally, it may also be necessary to end use of parallelism.
 333 : */
 334 30926 : _bt_leafbuild(buildstate.spool, buildstate.spool2);
 335 30870 : _bt_spooldestroy(buildstate.spool);
 336 30870 : if (buildstate.spool2)
 337 10 : _bt_spooldestroy(buildstate.spool2);
 338 30870 : if (buildstate.btleader)
 339 108 : _bt_end_parallel(buildstate.btleader);
 340 :
 341 30870 : result = palloc_object(IndexBuildResult);
 342 :
 343 30870 : result->heap_tuples = reltuples;
 344 30870 : result->index_tuples = buildstate.indtuples;
 345 :
 346 : #ifdef BTREE_BUILD_STATS
 347 : if (log_btree_build_stats)
 348 : {
 349 : ShowUsage("BTREE BUILD STATS");
 350 : ResetUsage();
 351 : }
 352 : #endif /* BTREE_BUILD_STATS */
 353 :
 354 30870 : return result;
 355 : }
356 :
357 : /*
358 : * Create and initialize one or two spool structures, and save them in caller's
359 : * buildstate argument. May also fill-in fields within indexInfo used by index
360 : * builds.
361 : *
362 : * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
363 : * routine encapsulates all aspects of managing parallelism. Caller need only
364 : * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
365 : *
366 : * Returns the total number of heap tuples scanned.
367 : */
 368 : static double
 369 30934 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
 370 : IndexInfo *indexInfo)
 371 : {
 372 30934 : BTSpool *btspool = palloc0_object(BTSpool);
 373 30934 : SortCoordinate coordinate = NULL;
 374 30934 : double reltuples = 0;
 375 :
 376 : /*
 377 : * We size the sort area as maintenance_work_mem rather than work_mem to
 378 : * speed index creation. This should be OK since a single backend can't
 379 : * run multiple index creations in parallel (see also: notes on
 380 : * parallelism and maintenance_work_mem below).
 381 : */
 : /* Initialize primary spool */
 382 30934 : btspool->heap = heap;
 383 30934 : btspool->index = index;
 384 30934 : btspool->isunique = indexInfo->ii_Unique;
 385 30934 : btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
 386 :
 387 : /* Save as primary spool */
 388 30934 : buildstate->spool = btspool;
 389 :
 390 : /* Report table scan phase started */
 391 30934 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
 392 : PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
 393 :
 394 : /* Attempt to launch parallel worker scan when required */
 395 30934 : if (indexInfo->ii_ParallelWorkers > 0)
 396 108 : _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
 397 : indexInfo->ii_ParallelWorkers);
 398 :
 399 : /*
 400 : * If parallel build requested and at least one worker process was
 401 : * successfully launched, set up coordination state
 402 : */
 403 30934 : if (buildstate->btleader)
 404 : {
 405 108 : coordinate = palloc0_object(SortCoordinateData);
 406 108 : coordinate->isWorker = false;
 407 108 : coordinate->nParticipants =
 408 108 : buildstate->btleader->nparticipanttuplesorts;
 409 108 : coordinate->sharedsort = buildstate->btleader->sharedsort;
 410 : }
 411 :
 412 : /*
 413 : * Begin serial/leader tuplesort.
 414 : *
 415 : * In cases where parallelism is involved, the leader receives the same
 416 : * share of maintenance_work_mem as a serial sort (it is generally treated
 417 : * in the same way as a serial sort once we return). Parallel worker
 418 : * Tuplesortstates will have received only a fraction of
 419 : * maintenance_work_mem, though.
 420 : *
 421 : * We rely on the lifetime of the Leader Tuplesortstate almost not
 422 : * overlapping with any worker Tuplesortstate's lifetime. There may be
 423 : * some small overlap, but that's okay because we rely on leader
 424 : * Tuplesortstate only allocating a small, fixed amount of memory here.
 425 : * When its tuplesort_performsort() is called (by our caller), and
 426 : * significant amounts of memory are likely to be used, all workers must
 427 : * have already freed almost all memory held by their Tuplesortstates
 428 : * (they are about to go away completely, too). The overall effect is
 429 : * that maintenance_work_mem always represents an absolute high watermark
 430 : * on the amount of memory used by a CREATE INDEX operation, regardless of
 431 : * the use of parallelism or any other factor.
 432 : */
 433 61868 : buildstate->spool->sortstate =
 434 30934 : tuplesort_begin_index_btree(heap, index, buildstate->isunique,
 435 30934 : buildstate->nulls_not_distinct,
 436 : maintenance_work_mem, coordinate,
 437 : TUPLESORT_NONE);
 438 :
 439 : /*
 440 : * If building a unique index, put dead tuples in a second spool to keep
 441 : * them out of the uniqueness check. We expect that the second spool (for
 442 : * dead tuples) won't get very full, so we give it only work_mem.
 443 : */
 444 30934 : if (indexInfo->ii_Unique)
 445 : {
 446 24719 : BTSpool *btspool2 = palloc0_object(BTSpool);
 447 24719 : SortCoordinate coordinate2 = NULL;
 448 :
 449 : /* Initialize secondary spool */
 450 24719 : btspool2->heap = heap;
 451 24719 : btspool2->index = index;
 452 24719 : btspool2->isunique = false;
 453 : /* Save as secondary spool */
 454 24719 : buildstate->spool2 = btspool2;
 455 :
 456 24719 : if (buildstate->btleader)
 457 : {
 458 : /*
 459 : * Set up non-private state that is passed to
 460 : * tuplesort_begin_index_btree() about the basic high level
 461 : * coordination of a parallel sort.
 462 : */
 463 48 : coordinate2 = palloc0_object(SortCoordinateData);
 464 48 : coordinate2->isWorker = false;
 465 48 : coordinate2->nParticipants =
 466 48 : buildstate->btleader->nparticipanttuplesorts;
 467 48 : coordinate2->sharedsort = buildstate->btleader->sharedsort2;
 468 : }
 469 :
 470 : /*
 471 : * We expect that the second one (for dead tuples) won't get very
 472 : * full, so we give it only work_mem
 473 : */
 474 24719 : buildstate->spool2->sortstate =
 475 24719 : tuplesort_begin_index_btree(heap, index, false, false, work_mem,
 476 : coordinate2, TUPLESORT_NONE);
 477 : }
 478 :
 479 : /* Fill spool using either serial or parallel heap scan */
 480 30934 : if (!buildstate->btleader)
 481 30826 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
 482 : _bt_build_callback, buildstate,
 483 : NULL);
 484 : else
 485 108 : reltuples = _bt_parallel_heapscan(buildstate,
 486 : &indexInfo->ii_BrokenHotChain);
 487 :
 488 : /*
 489 : * Set the progress target for the next phase. Reset the block number
 490 : * values set by table_index_build_scan
 491 : */
 492 : {
 493 30926 : const int progress_index[] = {
 494 : PROGRESS_CREATEIDX_TUPLES_TOTAL,
 495 : PROGRESS_SCAN_BLOCKS_TOTAL,
 496 : PROGRESS_SCAN_BLOCKS_DONE
 497 : };
 498 30926 : const int64 progress_vals[] = {
 499 30926 : buildstate->indtuples,
 500 : 0, 0
 501 : };
 502 :
 503 30926 : pgstat_progress_update_multi_param(3, progress_index, progress_vals);
 504 : }
 505 :
 506 : /* okay, all heap tuples are spooled */
 507 30926 : if (buildstate->spool2 && !buildstate->havedead)
 508 : {
 509 : /* spool2 turns out to be unnecessary */
 510 24709 : _bt_spooldestroy(buildstate->spool2);
 511 24709 : buildstate->spool2 = NULL;
 512 : }
 513 :
 514 30926 : return reltuples;
 515 : }
516 :
517 : /*
518 : * clean up a spool structure and its substructures.
519 : */
 520 : static void
 521 55589 : _bt_spooldestroy(BTSpool *btspool)
 522 : {
 : /* releases the tuplesort and the BTSpool struct itself */
 523 55589 : tuplesort_end(btspool->sortstate);
 524 55589 : pfree(btspool);
 525 55589 : }
526 :
527 : /*
528 : * spool an index entry into the sort file.
529 : */
 530 : static void
 531 7606729 : _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
 532 : {
 : /* form an index tuple from values/isnull and hand it to the tuplesort */
 533 7606729 : tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
 534 : self, values, isnull);
 535 7606729 : }
536 :
537 : /*
538 : * given a spool loaded by successive calls to _bt_spool,
539 : * create an entire btree.
540 : */
 541 : static void
 542 30926 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
 543 : {
 544 : BTWriteState wstate;
 545 :
 546 : #ifdef BTREE_BUILD_STATS
 547 : if (log_btree_build_stats)
 548 : {
 549 : ShowUsage("BTREE BUILD (Spool) STATISTICS");
 550 : ResetUsage();
 551 : }
 552 : #endif /* BTREE_BUILD_STATS */
 553 :
 554 : /* Execute the sort */
 555 30926 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
 556 : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
 557 30926 : tuplesort_performsort(btspool->sortstate);
 558 30870 : if (btspool2)
 559 : {
 560 10 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
 561 : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
 562 10 : tuplesort_performsort(btspool2->sortstate);
 563 : }
 564 :
 : /* Set up the write state used by _bt_load() */
 565 30870 : wstate.heap = btspool->heap;
 566 30870 : wstate.index = btspool->index;
 567 30870 : wstate.inskey = _bt_mkscankey(wstate.index, NULL);
 568 : /* _bt_mkscankey() won't set allequalimage without metapage */
 569 30870 : wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
 570 :
 571 : /* reserve the metapage */
 572 30870 : wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
 573 :
 574 30870 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
 575 : PROGRESS_BTREE_PHASE_LEAF_LOAD);
 576 30870 : _bt_load(&wstate, btspool, btspool2);
 577 30870 : }
578 :
579 : /*
580 : * Per-tuple callback for table_index_build_scan
581 : */
 582 : static void
 583 7606729 : _bt_build_callback(Relation index,
 584 : ItemPointer tid,
 585 : Datum *values,
 586 : bool *isnull,
 587 : bool tupleIsAlive,
 588 : void *state)
 589 : {
 590 7606729 : BTBuildState *buildstate = (BTBuildState *) state;
 591 :
 592 : /*
 593 : * insert the index tuple into the appropriate spool file for subsequent
 594 : * processing
 595 : */
 596 7606729 : if (tupleIsAlive || buildstate->spool2 == NULL)
 597 7606479 : _bt_spool(buildstate->spool, tid, values, isnull);
 598 : else
 599 : {
 600 : /* dead tuples are put into spool2 */
 601 250 : buildstate->havedead = true;
 602 250 : _bt_spool(buildstate->spool2, tid, values, isnull);
 603 : }
 604 :
 : /* count every tuple we spooled, live or dead */
 605 7606729 : buildstate->indtuples += 1;
 606 7606729 : }
607 :
608 : /*
609 : * allocate workspace for a new, clean btree page, not linked to any siblings.
610 : */
 611 : static BulkWriteBuffer
 612 30953 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
 613 : {
 614 : BulkWriteBuffer buf;
 615 : Page page;
 616 : BTPageOpaque opaque;
 617 :
 618 30953 : buf = smgr_bulk_get_buf(wstate->bulkstate);
 619 30953 : page = (Page) buf; /* bulk buffer doubles as the page image */
 620 :
 621 : /* Zero the page and set up standard page header info */
 622 30953 : _bt_pageinit(page, BLCKSZ);
 623 :
 624 : /* Initialize BT opaque state */
 625 30953 : opaque = BTPageGetOpaque(page);
 626 30953 : opaque->btpo_prev = opaque->btpo_next = P_NONE;
 627 30953 : opaque->btpo_level = level;
 628 30953 : opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
 629 30953 : opaque->btpo_cycleid = 0;
 630 :
 631 : /* Make the P_HIKEY line pointer appear allocated */
 632 30953 : ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
 633 :
 634 30953 : return buf;
 635 : }
636 :
637 : /*
638 : * emit a completed btree page, and release the working storage.
639 : */
 640 : static void
 641 61823 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
 642 : {
 643 61823 : smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
 644 : /* smgr_bulk_write took ownership of 'buf'; caller must not touch it again */
 645 61823 : }
646 :
647 : /*
648 : * allocate and initialize a new BTPageState. the returned structure
649 : * is suitable for immediate use by _bt_buildadd.
650 : */
 651 : static BTPageState *
 652 7491 : _bt_pagestate(BTWriteState *wstate, uint32 level)
 653 : {
 654 7491 : BTPageState *state = palloc0_object(BTPageState);
 655 :
 656 : /* create initial page for level */
 657 7491 : state->btps_buf = _bt_blnewpage(wstate, level);
 658 :
 659 : /* and assign it a page position */
 660 7491 : state->btps_blkno = wstate->btws_pages_alloced++;
 661 :
 662 7491 : state->btps_lowkey = NULL;
 663 : /* initialize lastoff so first item goes into P_FIRSTKEY */
 664 7491 : state->btps_lastoff = P_HIKEY;
 665 7491 : state->btps_lastextra = 0;
 666 7491 : state->btps_level = level;
 667 : /* set "full" threshold based on level. See notes at head of file. */
 : /* note: btps_full is a minimum-free-space threshold, not a used-space target */
 668 7491 : if (level > 0)
 669 1479 : state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
 670 : else
 671 6012 : state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
 672 :
 673 : /* no parent level, yet */
 674 7491 : state->btps_next = NULL;
 675 :
 676 7491 : return state;
 677 : }
678 :
679 : /*
680 : * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
681 : * P_HIKEY, overwriting P_HIKEY).
682 : *
683 : * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
684 : * rightmost page on its level is not supposed to get a high key. Now that
685 : * it's clear that this page is a rightmost page, remove the unneeded empty
686 : * P_HIKEY line pointer space.
687 : */
 688 : static void
 689 7491 : _bt_slideleft(Page rightmostpage)
 690 : {
 691 : OffsetNumber off;
 692 : OffsetNumber maxoff;
 693 : ItemId previi;
 694 :
 695 7491 : maxoff = PageGetMaxOffsetNumber(rightmostpage);
 696 : Assert(maxoff >= P_FIRSTKEY);
 697 7491 : previi = PageGetItemId(rightmostpage, P_HIKEY);
 : /* shift each line pointer back one slot, overwriting P_HIKEY */
 698 443916 : for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
 699 : {
 700 436425 : ItemId thisii = PageGetItemId(rightmostpage, off);
 701 :
 702 436425 : *previi = *thisii;
 703 436425 : previi = thisii;
 704 : }
 : /* give back the line pointer slot freed by the shift */
 705 7491 : ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
 706 7491 : }
707 :
708 : /*
709 : * Add an item to a page being built.
710 : *
711 : * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
712 : * raises an error directly.
713 : *
714 : * Note that our nbtsort.c caller does not know yet if the page will be
715 : * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
716 : * caller. Page that turns out to be the rightmost on its level is fixed by
717 : * calling _bt_slideleft().
718 : */
 719 : static void
 720 6652160 : _bt_sortaddtup(Page page,
 721 : Size itemsize,
 722 : const IndexTupleData *itup,
 723 : OffsetNumber itup_off,
 724 : bool newfirstdataitem)
 725 : {
 726 : IndexTupleData trunctuple;
 727 :
 728 6652160 : if (newfirstdataitem)
 729 : {
 : /* first data item on an internal page: store a key-less tuple
 * (zero attributes), keeping only the downlink */
 730 1558 : trunctuple = *itup;
 731 1558 : trunctuple.t_info = sizeof(IndexTupleData);
 732 1558 : BTreeTupleSetNAtts(&trunctuple, 0, false);
 733 1558 : itup = &trunctuple;
 734 1558 : itemsize = sizeof(IndexTupleData);
 735 : }
 736 :
 737 6652160 : if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
 738 0 : elog(ERROR, "failed to add item to the index page");
 739 6652160 : }
740 :
741 : /*----------
742 : * Add an item to a disk page from the sort output (or add a posting list
743 : * item formed from the sort output).
744 : *
745 : * We must be careful to observe the page layout conventions of nbtsearch.c:
746 : * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
747 : * - on non-leaf pages, the key portion of the first item need not be
748 : * stored, we should store only the link.
749 : *
750 : * A leaf page being built looks like:
751 : *
752 : * +----------------+---------------------------------+
753 : * | PageHeaderData | linp0 linp1 linp2 ... |
754 : * +-----------+----+---------------------------------+
755 : * | ... linpN | |
756 : * +-----------+--------------------------------------+
757 : * | ^ last |
758 : * | |
759 : * +-------------+------------------------------------+
760 : * | | itemN ... |
761 : * +-------------+------------------+-----------------+
762 : * | ... item3 item2 item1 | "special space" |
763 : * +--------------------------------+-----------------+
764 : *
765 : * Contrast this with the diagram in bufpage.h; note the mismatch
766 : * between linps and items. This is because we reserve linp0 as a
767 : * placeholder for the pointer to the "high key" item; when we have
768 : * filled up the page, we will set linp0 to point to itemN and clear
769 : * linpN. On the other hand, if we find this is the last (rightmost)
770 : * page, we leave the items alone and slide the linp array over. If
771 : * the high key is to be truncated, offset 1 is deleted, and we insert
772 : * the truncated high key at offset 1.
773 : *
774 : * 'last' pointer indicates the last offset added to the page.
775 : *
776 : * 'truncextra' is the size of the posting list in itup, if any. This
777 : * information is stashed for the next call here, when we may benefit
778 : * from considering the impact of truncating away the posting list on
779 : * the page before deciding to finish the page off. Posting lists are
780 : * often relatively large, so it is worth going to the trouble of
781 : * accounting for the saving from truncating away the posting list of
782 : * the tuple that becomes the high key (that may be the only way to
783 : * get close to target free space on the page). Note that this is
784 : * only used for the soft fillfactor-wise limit, not the critical hard
785 : * limit.
786 : *----------
787 : */
788 : static void
789 6628698 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
790 : Size truncextra)
791 : {
792 : BulkWriteBuffer nbuf;
793 : Page npage;
794 : BlockNumber nblkno;
795 : OffsetNumber last_off;
796 : Size last_truncextra;
797 : Size pgspc;
798 : Size itupsz;
799 : bool isleaf;
800 :
801 : /*
802 : * This is a handy place to check for cancel interrupts during the btree
803 : * load phase of index creation.
804 : */
805 6628698 : CHECK_FOR_INTERRUPTS();
806 :
807 6628698 : nbuf = state->btps_buf;
808 6628698 : npage = (Page) nbuf;
809 6628698 : nblkno = state->btps_blkno;
810 6628698 : last_off = state->btps_lastoff;
811 6628698 : last_truncextra = state->btps_lastextra;
812 6628698 : state->btps_lastextra = truncextra;
813 :
814 6628698 : pgspc = PageGetFreeSpace(npage);
815 6628698 : itupsz = IndexTupleSize(itup);
816 6628698 : itupsz = MAXALIGN(itupsz);
817 : /* Leaf case has slightly different rules due to suffix truncation */
818 6628698 : isleaf = (state->btps_level == 0);
819 :
820 : /*
821 : * Check whether the new item can fit on a btree page on current level at
822 : * all.
823 : *
824 : * Every newly built index will treat heap TID as part of the keyspace,
825 : * which imposes the requirement that new high keys must occasionally have
826 : * a heap TID appended within _bt_truncate(). That may leave a new pivot
827 : * tuple one or two MAXALIGN() quantums larger than the original
828 : * firstright tuple it's derived from. v4 deals with the problem by
829 : * decreasing the limit on the size of tuples inserted on the leaf level
830 : * by the same small amount. Enforce the new v4+ limit on the leaf level,
831 : * and the old limit on internal levels, since pivot tuples may need to
832 : * make use of the reserved space. This should never fail on internal
833 : * pages.
834 : */
835 6628698 : if (unlikely(itupsz > BTMaxItemSize))
836 176 : _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
837 : itup);
838 :
839 : /*
840 : * Check to see if current page will fit new item, with space left over to
841 : * append a heap TID during suffix truncation when page is a leaf page.
842 : *
843 : * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
844 : * high key with heap TID when finishing off a leaf page, since we rely on
845 : * _bt_check_third_page() rejecting oversized non-pivot tuples. On
846 : * internal pages we can always fit 3 pivot tuples with larger internal
847 : * page tuple limit (includes page high key).
848 : *
849 : * Most of the time, a page is only "full" in the sense that the soft
850 : * fillfactor-wise limit has been exceeded. However, we must always leave
851 : * at least two items plus a high key on each page before starting a new
852 : * page. Disregard fillfactor and insert on "full" current page if we
853 : * don't have the minimum number of items yet. (Note that we deliberately
854 : * assume that suffix truncation neither enlarges nor shrinks new high key
855 : * when applying soft limit, except when last tuple has a posting list.)
856 : */
857 : Assert(last_truncextra == 0 || isleaf);
858 6628698 : if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
859 6627847 : (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
860 : {
861 : /*
862 : * Finish off the page and write it out.
863 : */
864 23462 : BulkWriteBuffer obuf = nbuf;
865 23462 : Page opage = npage;
866 23462 : BlockNumber oblkno = nblkno;
867 : ItemId ii;
868 : ItemId hii;
869 : IndexTuple oitup;
870 :
871 : /* Create new page of same level */
872 23462 : nbuf = _bt_blnewpage(wstate, state->btps_level);
873 23462 : npage = (Page) nbuf;
874 :
875 : /* and assign it a page position */
876 23462 : nblkno = wstate->btws_pages_alloced++;
877 :
878 : /*
879 : * We copy the last item on the page into the new page, and then
880 : * rearrange the old page so that the 'last item' becomes its high key
881 : * rather than a true data item. There had better be at least two
882 : * items on the page already, else the page would be empty of useful
883 : * data.
884 : */
885 : Assert(last_off > P_FIRSTKEY);
886 23462 : ii = PageGetItemId(opage, last_off);
887 23462 : oitup = (IndexTuple) PageGetItem(opage, ii);
888 23462 : _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
889 23462 : !isleaf);
890 :
891 : /*
892 : * Move 'last' into the high key position on opage. _bt_blnewpage()
893 : * allocated empty space for a line pointer when opage was first
894 : * created, so this is a matter of rearranging already-allocated space
895 : * on page, and initializing high key line pointer. (Actually, leaf
896 : * pages must also swap oitup with a truncated version of oitup, which
897 : * is sometimes larger than oitup, though never by more than the space
898 : * needed to append a heap TID.)
899 : */
900 23462 : hii = PageGetItemId(opage, P_HIKEY);
901 23462 : *hii = *ii;
902 23462 : ItemIdSetUnused(ii); /* redundant */
903 23462 : ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
904 :
905 23462 : if (isleaf)
906 : {
907 : IndexTuple lastleft;
908 : IndexTuple truncated;
909 :
910 : /*
911 : * Truncate away any unneeded attributes from high key on leaf
912 : * level. This is only done at the leaf level because downlinks
913 : * in internal pages are either negative infinity items, or get
914 : * their contents from copying from one level down. See also:
915 : * _bt_split().
916 : *
917 : * We don't try to bias our choice of split point to make it more
918 : * likely that _bt_truncate() can truncate away more attributes,
919 : * whereas the split point used within _bt_split() is chosen much
920 : * more delicately. Even still, the lastleft and firstright
921 : * tuples passed to _bt_truncate() here are at least not fully
922 : * equal to each other when deduplication is used, unless there is
923 : * a large group of duplicates (also, unique index builds usually
924 : * have few or no spool2 duplicates). When the split point is
925 : * between two unequal tuples, _bt_truncate() will avoid including
926 : * a heap TID in the new high key, which is the most important
927 : * benefit of suffix truncation.
928 : *
929 : * Overwrite the old item with new truncated high key directly.
930 : * oitup is already located at the physical beginning of tuple
931 : * space, so this should directly reuse the existing tuple space.
932 : */
933 23383 : ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
934 23383 : lastleft = (IndexTuple) PageGetItem(opage, ii);
935 :
936 : Assert(IndexTupleSize(oitup) > last_truncextra);
937 23383 : truncated = _bt_truncate(wstate->index, lastleft, oitup,
938 : wstate->inskey);
939 23383 : if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
940 0 : elog(ERROR, "failed to add high key to the index page");
941 23383 : pfree(truncated);
942 :
943 : /* oitup should continue to point to the page's high key */
944 23383 : hii = PageGetItemId(opage, P_HIKEY);
945 23383 : oitup = (IndexTuple) PageGetItem(opage, hii);
946 : }
947 :
948 : /*
949 : * Link the old page into its parent, using its low key. If we don't
950 : * have a parent, we have to create one; this adds a new btree level.
951 : */
952 23462 : if (state->btps_next == NULL)
953 1479 : state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
954 :
955 : Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
956 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
957 : BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
958 : P_LEFTMOST(BTPageGetOpaque(opage)));
959 : Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
960 : !P_LEFTMOST(BTPageGetOpaque(opage)));
961 23462 : BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
962 23462 : _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
963 23462 : pfree(state->btps_lowkey);
964 :
965 : /*
966 : * Save a copy of the high key from the old page. It is also the low
967 : * key for the new page.
968 : */
969 23462 : state->btps_lowkey = CopyIndexTuple(oitup);
970 :
971 : /*
972 : * Set the sibling links for both pages.
973 : */
974 : {
975 23462 : BTPageOpaque oopaque = BTPageGetOpaque(opage);
976 23462 : BTPageOpaque nopaque = BTPageGetOpaque(npage);
977 :
978 23462 : oopaque->btpo_next = nblkno;
979 23462 : nopaque->btpo_prev = oblkno;
980 23462 : nopaque->btpo_next = P_NONE; /* redundant */
981 : }
982 :
983 : /*
984 : * Write out the old page. _bt_blwritepage takes ownership of the
985 : * 'opage' buffer.
986 : */
987 23462 : _bt_blwritepage(wstate, obuf, oblkno);
988 :
989 : /*
990 : * Reset last_off to point to new page
991 : */
992 23462 : last_off = P_FIRSTKEY;
993 : }
994 :
995 : /*
996 : * By here, either original page is still the current page, or a new page
997 : * was created that became the current page. Either way, the current page
998 : * definitely has space for new item.
999 : *
1000 : * If the new item is the first for its page, it must also be the first
1001 : * item on its entire level. On later same-level pages, a low key for a
1002 : * page will be copied from the prior page in the code above. Generate a
1003 : * minus infinity low key here instead.
1004 : */
1005 6628698 : if (last_off == P_HIKEY)
1006 : {
1007 : Assert(state->btps_lowkey == NULL);
1008 7491 : state->btps_lowkey = palloc0_object(IndexTupleData);
1009 7491 : state->btps_lowkey->t_info = sizeof(IndexTupleData);
1010 7491 : BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1011 : }
1012 :
1013 : /*
1014 : * Add the new item into the current page.
1015 : */
1016 6628698 : last_off = OffsetNumberNext(last_off);
1017 6628698 : _bt_sortaddtup(npage, itupsz, itup, last_off,
1018 6628698 : !isleaf && last_off == P_FIRSTKEY);
1019 :
1020 6628698 : state->btps_buf = nbuf;
1021 6628698 : state->btps_blkno = nblkno;
1022 6628698 : state->btps_lastoff = last_off;
1023 6628698 : }
1024 :
1025 : /*
1026 : * Finalize pending posting list tuple, and add it to the index. Final tuple
1027 : * is based on saved base tuple, and saved list of heap TIDs.
1028 : *
1029 : * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1030 : * using _bt_buildadd().
1031 : */
1032 : static void
1033 3070081 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
1034 : BTDedupState dstate)
1035 : {
1036 : Assert(dstate->nitems > 0);
1037 :
1038 3070081 : if (dstate->nitems == 1)
1039 3044406 : _bt_buildadd(wstate, state, dstate->base, 0);
1040 : else
1041 : {
1042 : IndexTuple postingtuple;
1043 : Size truncextra;
1044 :
1045 : /* form a tuple with a posting list */
1046 25675 : postingtuple = _bt_form_posting(dstate->base,
1047 25675 : dstate->htids,
1048 : dstate->nhtids);
1049 : /* Calculate posting list overhead */
1050 25675 : truncextra = IndexTupleSize(postingtuple) -
1051 25675 : BTreeTupleGetPostingOffset(postingtuple);
1052 :
1053 25675 : _bt_buildadd(wstate, state, postingtuple, truncextra);
1054 25675 : pfree(postingtuple);
1055 : }
1056 :
1057 3070081 : dstate->nmaxitems = 0;
1058 3070081 : dstate->nhtids = 0;
1059 3070081 : dstate->nitems = 0;
1060 3070081 : dstate->phystupsize = 0;
1061 3070081 : }
1062 :
1063 : /*
1064 : * Finish writing out the completed btree.
1065 : */
1066 : static void
1067 30870 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1068 : {
1069 : BTPageState *s;
1070 30870 : BlockNumber rootblkno = P_NONE;
1071 30870 : uint32 rootlevel = 0;
1072 : BulkWriteBuffer metabuf;
1073 :
1074 : /*
1075 : * Each iteration of this loop completes one more level of the tree.
1076 : */
1077 38361 : for (s = state; s != NULL; s = s->btps_next)
1078 : {
1079 : BlockNumber blkno;
1080 : BTPageOpaque opaque;
1081 :
1082 7491 : blkno = s->btps_blkno;
1083 7491 : opaque = BTPageGetOpaque((Page) s->btps_buf);
1084 :
1085 : /*
1086 : * We have to link the last page on this level to somewhere.
1087 : *
1088 : * If we're at the top, it's the root, so attach it to the metapage.
1089 : * Otherwise, add an entry for it to its parent using its low key.
1090 : * This may cause the last page of the parent level to split, but
1091 : * that's not a problem -- we haven't gotten to it yet.
1092 : */
1093 7491 : if (s->btps_next == NULL)
1094 : {
1095 6012 : opaque->btpo_flags |= BTP_ROOT;
1096 6012 : rootblkno = blkno;
1097 6012 : rootlevel = s->btps_level;
1098 : }
1099 : else
1100 : {
1101 : Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1102 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1103 : BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1104 : P_LEFTMOST(opaque));
1105 : Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1106 : !P_LEFTMOST(opaque));
1107 1479 : BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1108 1479 : _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1109 1479 : pfree(s->btps_lowkey);
1110 1479 : s->btps_lowkey = NULL;
1111 : }
1112 :
1113 : /*
1114 : * This is the rightmost page, so the ItemId array needs to be slid
1115 : * back one slot. Then we can dump out the page.
1116 : */
1117 7491 : _bt_slideleft((Page) s->btps_buf);
1118 7491 : _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
1119 7491 : s->btps_buf = NULL; /* writepage took ownership of the buffer */
1120 : }
1121 :
1122 : /*
1123 : * As the last step in the process, construct the metapage and make it
1124 : * point to the new root (unless we had no data at all, in which case it's
1125 : * set to point to "P_NONE"). This changes the index to the "valid" state
1126 : * by filling in a valid magic number in the metapage.
1127 : */
1128 30870 : metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1129 30870 : _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
1130 30870 : wstate->inskey->allequalimage);
1131 30870 : _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
1132 30870 : }
1133 :
/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 *
 * wstate holds the target index and bulk-write state.  btspool supplies the
 * sorted index tuples.  btspool2 is NULL except for unique-index builds,
 * where it supplies a second sorted stream (dead tuples) that must be merged
 * with the first.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
	BTPageState *state = NULL;	/* leaf-level page state; created lazily */
	bool		merge = (btspool2 != NULL);
	IndexTuple	itup,
				itup2 = NULL;
	bool		load1;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			i,
				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
	SortSupport sortKeys;
	int64		tuples_done = 0;
	bool		deduplicate;

	wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);

	/* Deduplication is only possible for non-unique, allequalimage indexes */
	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
		BTGetDeduplicateItems(wstate->index);

	if (merge)
	{
		/*
		 * Another BTSpool for dead tuples exists. Now we have to merge
		 * btspool and btspool2.
		 */

		/* the preparation of merge: prime one tuple from each stream */
		itup = tuplesort_getindextuple(btspool->sortstate, true);
		itup2 = tuplesort_getindextuple(btspool2->sortstate, true);

		/* Prepare SortSupport data for each column */
		sortKeys = palloc0_array(SortSupportData, keysz);

		for (i = 0; i < keysz; i++)
		{
			SortSupport sortKey = sortKeys + i;
			ScanKey		scanKey = wstate->inskey->scankeys + i;
			bool		reverse;

			sortKey->ssup_cxt = CurrentMemoryContext;
			sortKey->ssup_collation = scanKey->sk_collation;
			sortKey->ssup_nulls_first =
				(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
			sortKey->ssup_attno = scanKey->sk_attno;
			/* Abbreviation is not supported here */
			sortKey->abbreviate = false;

			Assert(sortKey->ssup_attno != 0);

			reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;

			PrepareSortSupportFromIndexRel(wstate->index, reverse, sortKey);
		}

		/* Main merge loop: emit the smaller stream head on each iteration */
		for (;;)
		{
			load1 = true;		/* load BTSpool next ? */
			if (itup2 == NULL)
			{
				/* btspool2 exhausted; stop once btspool is exhausted too */
				if (itup == NULL)
					break;
			}
			else if (itup != NULL)
			{
				/* Both streams nonempty: compare key columns in order */
				int32		compare = 0;

				for (i = 1; i <= keysz; i++)
				{
					SortSupport entry;
					Datum		attrDatum1,
								attrDatum2;
					bool		isNull1,
								isNull2;

					entry = sortKeys + i - 1;
					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);

					compare = ApplySortComparator(attrDatum1, isNull1,
												  attrDatum2, isNull2,
												  entry);
					if (compare > 0)
					{
						load1 = false;
						break;
					}
					else if (compare < 0)
						break;
				}

				/*
				 * If key values are equal, we sort on ItemPointer. This is
				 * required for btree indexes, since heap TID is treated as an
				 * implicit last key attribute in order to ensure that all
				 * keys in the index are physically unique.
				 */
				if (compare == 0)
				{
					compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
					Assert(compare != 0);
					if (compare > 0)
						load1 = false;
				}
			}
			else
				load1 = false;	/* btspool exhausted; drain btspool2 */

			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			if (load1)
			{
				_bt_buildadd(wstate, state, itup, 0);
				itup = tuplesort_getindextuple(btspool->sortstate, true);
			}
			else
			{
				_bt_buildadd(wstate, state, itup2, 0);
				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
			}

			/* Report progress */
			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
										 ++tuples_done);
		}
		pfree(sortKeys);
	}
	else if (deduplicate)
	{
		/* merge is unnecessary, deduplicate into posting lists */
		BTDedupState dstate;

		dstate = palloc_object(BTDedupStateData);
		dstate->deduplicate = true; /* unused */
		dstate->nmaxitems = 0;	/* unused */
		dstate->maxpostingsize = 0; /* set later */
		/* Metadata about base tuple of current pending posting list */
		dstate->base = NULL;
		dstate->baseoff = InvalidOffsetNumber;	/* unused */
		dstate->basetupsize = 0;
		/* Metadata about current pending posting list TIDs */
		dstate->htids = NULL;
		dstate->nhtids = 0;
		dstate->nitems = 0;
		dstate->phystupsize = 0;	/* unused */
		dstate->nintervals = 0; /* unused */

		while ((itup = tuplesort_getindextuple(btspool->sortstate,
											   true)) != NULL)
		{
			/* When we see first tuple, create first index page */
			if (state == NULL)
			{
				state = _bt_pagestate(wstate, 0);

				/*
				 * Limit size of posting list tuples to 1/10 space we want to
				 * leave behind on the page, plus space for final item's line
				 * pointer. This is equal to the space that we'd like to
				 * leave behind on each leaf page when fillfactor is 90,
				 * allowing us to get close to fillfactor% space utilization
				 * when there happen to be a great many duplicates. (This
				 * makes higher leaf fillfactor settings ineffective when
				 * building indexes that have many duplicates, but packing
				 * leaf pages full with few very large tuples doesn't seem
				 * like a useful goal.)
				 */
				dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
					sizeof(ItemIdData);
				Assert(dstate->maxpostingsize <= BTMaxItemSize &&
					   dstate->maxpostingsize <= INDEX_SIZE_MASK);
				dstate->htids = palloc(dstate->maxpostingsize);

				/* start new pending posting list with itup copy */
				_bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
										InvalidOffsetNumber);
			}
			else if (_bt_keep_natts_fast(wstate->index, dstate->base,
										 itup) > keysz &&
					 _bt_dedup_save_htid(dstate, itup))
			{
				/*
				 * Tuple is equal to base tuple of pending posting list. Heap
				 * TID from itup has been saved in state.
				 */
			}
			else
			{
				/*
				 * Tuple is not equal to pending posting list tuple, or
				 * _bt_dedup_save_htid() opted to not merge current item into
				 * pending posting list.
				 */
				_bt_sort_dedup_finish_pending(wstate, state, dstate);
				pfree(dstate->base);

				/* start new pending posting list with itup copy */
				_bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
										InvalidOffsetNumber);
			}

			/* Report progress */
			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
										 ++tuples_done);
		}

		if (state)
		{
			/*
			 * Handle the last item (there must be a last item when the
			 * tuplesort returned one or more tuples)
			 */
			_bt_sort_dedup_finish_pending(wstate, state, dstate);
			pfree(dstate->base);
			pfree(dstate->htids);
		}

		pfree(dstate);
	}
	else
	{
		/* merging and deduplication are both unnecessary */
		while ((itup = tuplesort_getindextuple(btspool->sortstate,
											   true)) != NULL)
		{
			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			_bt_buildadd(wstate, state, itup, 0);

			/* Report progress */
			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
										 ++tuples_done);
		}
	}

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);
	smgr_bulk_finish(wstate->bulkstate);
}
1381 :
1382 : /*
1383 : * Create parallel context, and launch workers for leader.
1384 : *
1385 : * buildstate argument should be initialized (with the exception of the
1386 : * tuplesort state in spools, which may later be created based on shared
1387 : * state initially set up here).
1388 : *
1389 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1390 : *
1391 : * request is the target number of parallel worker processes to launch.
1392 : *
1393 : * Sets buildstate's BTLeader, which caller must use to shut down parallel
1394 : * mode by passing it to _bt_end_parallel() at the very end of its index
1395 : * build. If not even a single worker process can be launched, this is
1396 : * never set, and caller should proceed with a serial index build.
1397 : */
static void
_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
{
	ParallelContext *pcxt;
	int			scantuplesortstates;
	Snapshot	snapshot;
	Size		estbtshared;
	Size		estsort;
	BTShared   *btshared;
	Sharedsort *sharedsort;
	Sharedsort *sharedsort2;
	BTSpool    *btspool = buildstate->spool;
	BTLeader   *btleader = palloc0_object(BTLeader);
	WalUsage   *walusage;
	BufferUsage *bufferusage;
	bool		leaderparticipates = true;
	int			querylen;

#ifdef DISABLE_LEADER_PARTICIPATION
	leaderparticipates = false;
#endif

	/*
	 * Enter parallel mode, and create context for parallel build of btree
	 * index
	 */
	EnterParallelMode();
	Assert(request > 0);
	pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
								 request);

	/* Leader participation adds one more tuplesort participant */
	scantuplesortstates = leaderparticipates ? request + 1 : request;

	/*
	 * Prepare for scan of the base relation. In a normal index build, we use
	 * SnapshotAny because we must retrieve all tuples and do our own time
	 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
	 * concurrent build, we take a regular MVCC snapshot and index whatever's
	 * live according to that.
	 */
	if (!isconcurrent)
		snapshot = SnapshotAny;
	else
		snapshot = RegisterSnapshot(GetTransactionSnapshot());

	/*
	 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
	 * PARALLEL_KEY_TUPLESORT tuplesort workspace.  (Each estimate here must
	 * be matched by a corresponding shm_toc_allocate call further down.)
	 */
	estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
	shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
	estsort = tuplesort_estimate_shared(scantuplesortstates);
	shm_toc_estimate_chunk(&pcxt->estimator, estsort);

	/*
	 * Unique case requires a second spool, and so we may have to account for
	 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
	 */
	if (!btspool->isunique)
		shm_toc_estimate_keys(&pcxt->estimator, 2);
	else
	{
		shm_toc_estimate_chunk(&pcxt->estimator, estsort);
		shm_toc_estimate_keys(&pcxt->estimator, 3);
	}

	/*
	 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
	 * and PARALLEL_KEY_BUFFER_USAGE.
	 *
	 * If there are no extensions loaded that care, we could skip this. We
	 * have no way of knowing whether anyone's looking at pgWalUsage or
	 * pgBufferUsage, so do it unconditionally.
	 */
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
	if (debug_query_string)
	{
		querylen = strlen(debug_query_string);
		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
		shm_toc_estimate_keys(&pcxt->estimator, 1);
	}
	else
		querylen = 0;			/* keep compiler quiet */

	/* Everyone's had a chance to ask for space, so now create the DSM */
	InitializeParallelDSM(pcxt);

	/* If no DSM segment was available, back out (do serial build) */
	if (pcxt->seg == NULL)
	{
		if (IsMVCCSnapshot(snapshot))
			UnregisterSnapshot(snapshot);
		DestroyParallelContext(pcxt);
		ExitParallelMode();
		return;
	}

	/* Store shared build state, for which we reserved space */
	btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
	/* Initialize immutable state */
	btshared->heaprelid = RelationGetRelid(btspool->heap);
	btshared->indexrelid = RelationGetRelid(btspool->index);
	btshared->isunique = btspool->isunique;
	btshared->nulls_not_distinct = btspool->nulls_not_distinct;
	btshared->isconcurrent = isconcurrent;
	btshared->scantuplesortstates = scantuplesortstates;
	btshared->queryid = pgstat_get_my_query_id();
	ConditionVariableInit(&btshared->workersdonecv);
	SpinLockInit(&btshared->mutex);
	/* Initialize mutable state */
	btshared->nparticipantsdone = 0;
	btshared->reltuples = 0.0;
	btshared->havedead = false;
	btshared->indtuples = 0.0;
	btshared->brokenhotchain = false;
	table_parallelscan_initialize(btspool->heap,
								  ParallelTableScanFromBTShared(btshared),
								  snapshot);

	/*
	 * Store shared tuplesort-private state, for which we reserved space.
	 * Then, initialize opaque state using tuplesort routine.
	 */
	sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
	tuplesort_initialize_shared(sharedsort, scantuplesortstates,
								pcxt->seg);

	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

	/* Unique case requires a second spool, and associated shared state */
	if (!btspool->isunique)
		sharedsort2 = NULL;
	else
	{
		/*
		 * Store additional shared tuplesort-private state, for which we
		 * reserved space.  Then, initialize opaque state using tuplesort
		 * routine.
		 */
		sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
		tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
									pcxt->seg);

		shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
	}

	/* Store query string for workers */
	if (debug_query_string)
	{
		char	   *sharedquery;

		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
		memcpy(sharedquery, debug_query_string, querylen + 1);
		shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
	}

	/*
	 * Allocate space for each worker's WalUsage and BufferUsage; no need to
	 * initialize.
	 */
	walusage = shm_toc_allocate(pcxt->toc,
								mul_size(sizeof(WalUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
	bufferusage = shm_toc_allocate(pcxt->toc,
								   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);

	/* Launch workers, saving status for leader/caller */
	LaunchParallelWorkers(pcxt);
	btleader->pcxt = pcxt;
	btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
	if (leaderparticipates)
		btleader->nparticipanttuplesorts++;
	btleader->btshared = btshared;
	btleader->sharedsort = sharedsort;
	btleader->sharedsort2 = sharedsort2;
	btleader->snapshot = snapshot;
	btleader->walusage = walusage;
	btleader->bufferusage = bufferusage;

	/* If no workers were successfully launched, back out (do serial build) */
	if (pcxt->nworkers_launched == 0)
	{
		_bt_end_parallel(btleader);
		return;
	}

	/* Save leader state now that it's clear build will be parallel */
	buildstate->btleader = btleader;

	/* Join heap scan ourselves */
	if (leaderparticipates)
		_bt_leader_participate_as_worker(buildstate);

	/*
	 * Caller needs to wait for all launched workers when we return.  Make
	 * sure that the failure-to-start case will not hang forever.
	 */
	WaitForParallelWorkersToAttach(pcxt);
}
1606 :
1607 : /*
1608 : * Shut down workers, destroy parallel context, and end parallel mode.
1609 : */
1610 : static void
1611 108 : _bt_end_parallel(BTLeader *btleader)
1612 : {
1613 : int i;
1614 :
1615 : /* Shutdown worker processes */
1616 108 : WaitForParallelWorkersToFinish(btleader->pcxt);
1617 :
1618 : /*
1619 : * Next, accumulate WAL usage. (This must wait for the workers to finish,
1620 : * or we might get incomplete data.)
1621 : */
1622 220 : for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1623 112 : InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1624 :
1625 : /* Free last reference to MVCC snapshot, if one was used */
1626 108 : if (IsMVCCSnapshot(btleader->snapshot))
1627 0 : UnregisterSnapshot(btleader->snapshot);
1628 108 : DestroyParallelContext(btleader->pcxt);
1629 108 : ExitParallelMode();
1630 108 : }
1631 :
1632 : /*
1633 : * Returns size of shared memory required to store state for a parallel
1634 : * btree index build based on the snapshot its parallel scan will use.
1635 : */
1636 : static Size
1637 108 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1638 : {
1639 : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1640 108 : return add_size(BUFFERALIGN(sizeof(BTShared)),
1641 : table_parallelscan_estimate(heap, snapshot));
1642 : }
1643 :
1644 : /*
1645 : * Within leader, wait for end of heap scan.
1646 : *
1647 : * When called, parallel heap scan started by _bt_begin_parallel() will
1648 : * already be underway within worker processes (when leader participates
1649 : * as a worker, we should end up here just as workers are finishing).
1650 : *
1651 : * Fills in fields needed for ambuild statistics, and lets caller set
1652 : * field indicating that some worker encountered a broken HOT chain.
1653 : *
1654 : * Returns the total number of heap tuples scanned.
1655 : */
1656 : static double
1657 108 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1658 : {
1659 108 : BTShared *btshared = buildstate->btleader->btshared;
1660 : int nparticipanttuplesorts;
1661 : double reltuples;
1662 :
1663 108 : nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1664 : for (;;)
1665 : {
1666 276 : SpinLockAcquire(&btshared->mutex);
1667 276 : if (btshared->nparticipantsdone == nparticipanttuplesorts)
1668 : {
1669 108 : buildstate->havedead = btshared->havedead;
1670 108 : buildstate->indtuples = btshared->indtuples;
1671 108 : *brokenhotchain = btshared->brokenhotchain;
1672 108 : reltuples = btshared->reltuples;
1673 108 : SpinLockRelease(&btshared->mutex);
1674 108 : break;
1675 : }
1676 168 : SpinLockRelease(&btshared->mutex);
1677 :
1678 168 : ConditionVariableSleep(&btshared->workersdonecv,
1679 : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1680 : }
1681 :
1682 108 : ConditionVariableCancelSleep();
1683 :
1684 108 : return reltuples;
1685 : }
1686 :
1687 : /*
1688 : * Within leader, participate as a parallel worker.
1689 : */
1690 : static void
1691 108 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
1692 : {
1693 108 : BTLeader *btleader = buildstate->btleader;
1694 : BTSpool *leaderworker;
1695 : BTSpool *leaderworker2;
1696 : int sortmem;
1697 :
1698 : /* Allocate memory and initialize private spool */
1699 108 : leaderworker = palloc0_object(BTSpool);
1700 108 : leaderworker->heap = buildstate->spool->heap;
1701 108 : leaderworker->index = buildstate->spool->index;
1702 108 : leaderworker->isunique = buildstate->spool->isunique;
1703 108 : leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1704 :
1705 : /* Initialize second spool, if required */
1706 108 : if (!btleader->btshared->isunique)
1707 60 : leaderworker2 = NULL;
1708 : else
1709 : {
1710 : /* Allocate memory for worker's own private secondary spool */
1711 48 : leaderworker2 = palloc0_object(BTSpool);
1712 :
1713 : /* Initialize worker's own secondary spool */
1714 48 : leaderworker2->heap = leaderworker->heap;
1715 48 : leaderworker2->index = leaderworker->index;
1716 48 : leaderworker2->isunique = false;
1717 : }
1718 :
1719 : /*
1720 : * Might as well use reliable figure when doling out maintenance_work_mem
1721 : * (when requested number of workers were not launched, this will be
1722 : * somewhat higher than it is for other workers).
1723 : */
1724 108 : sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1725 :
1726 : /* Perform work common to all participants */
1727 108 : _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1728 : btleader->sharedsort, btleader->sharedsort2,
1729 : sortmem, true);
1730 :
1731 : #ifdef BTREE_BUILD_STATS
1732 : if (log_btree_build_stats)
1733 : {
1734 : ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1735 : ResetUsage();
1736 : }
1737 : #endif /* BTREE_BUILD_STATS */
1738 108 : }
1739 :
1740 : /*
1741 : * Perform work within a launched parallel process.
1742 : */
1743 : void
1744 112 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1745 : {
1746 : char *sharedquery;
1747 : BTSpool *btspool;
1748 : BTSpool *btspool2;
1749 : BTShared *btshared;
1750 : Sharedsort *sharedsort;
1751 : Sharedsort *sharedsort2;
1752 : Relation heapRel;
1753 : Relation indexRel;
1754 : LOCKMODE heapLockmode;
1755 : LOCKMODE indexLockmode;
1756 : WalUsage *walusage;
1757 : BufferUsage *bufferusage;
1758 : int sortmem;
1759 :
1760 : #ifdef BTREE_BUILD_STATS
1761 : if (log_btree_build_stats)
1762 : ResetUsage();
1763 : #endif /* BTREE_BUILD_STATS */
1764 :
1765 : /*
1766 : * The only possible status flag that can be set to the parallel worker is
1767 : * PROC_IN_SAFE_IC.
1768 : */
1769 : Assert((MyProc->statusFlags == 0) ||
1770 : (MyProc->statusFlags == PROC_IN_SAFE_IC));
1771 :
1772 : /* Set debug_query_string for individual workers first */
1773 112 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1774 112 : debug_query_string = sharedquery;
1775 :
1776 : /* Report the query string from leader */
1777 112 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
1778 :
1779 : /* Look up nbtree shared state */
1780 112 : btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1781 :
1782 : /* Open relations using lock modes known to be obtained by index.c */
1783 112 : if (!btshared->isconcurrent)
1784 : {
1785 112 : heapLockmode = ShareLock;
1786 112 : indexLockmode = AccessExclusiveLock;
1787 : }
1788 : else
1789 : {
1790 0 : heapLockmode = ShareUpdateExclusiveLock;
1791 0 : indexLockmode = RowExclusiveLock;
1792 : }
1793 :
1794 : /* Track query ID */
1795 112 : pgstat_report_query_id(btshared->queryid, false);
1796 :
1797 : /* Open relations within worker */
1798 112 : heapRel = table_open(btshared->heaprelid, heapLockmode);
1799 112 : indexRel = index_open(btshared->indexrelid, indexLockmode);
1800 :
1801 : /* Initialize worker's own spool */
1802 112 : btspool = palloc0_object(BTSpool);
1803 112 : btspool->heap = heapRel;
1804 112 : btspool->index = indexRel;
1805 112 : btspool->isunique = btshared->isunique;
1806 112 : btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1807 :
1808 : /* Look up shared state private to tuplesort.c */
1809 112 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1810 112 : tuplesort_attach_shared(sharedsort, seg);
1811 112 : if (!btshared->isunique)
1812 : {
1813 64 : btspool2 = NULL;
1814 64 : sharedsort2 = NULL;
1815 : }
1816 : else
1817 : {
1818 : /* Allocate memory for worker's own private secondary spool */
1819 48 : btspool2 = palloc0_object(BTSpool);
1820 :
1821 : /* Initialize worker's own secondary spool */
1822 48 : btspool2->heap = btspool->heap;
1823 48 : btspool2->index = btspool->index;
1824 48 : btspool2->isunique = false;
1825 : /* Look up shared state private to tuplesort.c */
1826 48 : sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1827 48 : tuplesort_attach_shared(sharedsort2, seg);
1828 : }
1829 :
1830 : /* Prepare to track buffer usage during parallel execution */
1831 112 : InstrStartParallelQuery();
1832 :
1833 : /* Perform sorting of spool, and possibly a spool2 */
1834 112 : sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1835 112 : _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1836 : sharedsort2, sortmem, false);
1837 :
1838 : /* Report WAL/buffer usage during parallel execution */
1839 112 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1840 112 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1841 112 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1842 112 : &walusage[ParallelWorkerNumber]);
1843 :
1844 : #ifdef BTREE_BUILD_STATS
1845 : if (log_btree_build_stats)
1846 : {
1847 : ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1848 : ResetUsage();
1849 : }
1850 : #endif /* BTREE_BUILD_STATS */
1851 :
1852 112 : index_close(indexRel, indexLockmode);
1853 112 : table_close(heapRel, heapLockmode);
1854 112 : }
1855 :
1856 : /*
1857 : * Perform a worker's portion of a parallel sort.
1858 : *
1859 : * This generates a tuplesort for passed btspool, and a second tuplesort
1860 : * state if a second btspool is need (i.e. for unique index builds). All
1861 : * other spool fields should already be set when this is called.
1862 : *
1863 : * sortmem is the amount of working memory to use within each worker,
1864 : * expressed in KBs.
1865 : *
1866 : * When this returns, workers are done, and need only release resources.
1867 : */
1868 : static void
1869 220 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1870 : BTShared *btshared, Sharedsort *sharedsort,
1871 : Sharedsort *sharedsort2, int sortmem, bool progress)
1872 : {
1873 : SortCoordinate coordinate;
1874 : BTBuildState buildstate;
1875 : TableScanDesc scan;
1876 : double reltuples;
1877 : IndexInfo *indexInfo;
1878 :
1879 : /* Initialize local tuplesort coordination state */
1880 220 : coordinate = palloc0_object(SortCoordinateData);
1881 220 : coordinate->isWorker = true;
1882 220 : coordinate->nParticipants = -1;
1883 220 : coordinate->sharedsort = sharedsort;
1884 :
1885 : /* Begin "partial" tuplesort */
1886 440 : btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1887 : btspool->index,
1888 220 : btspool->isunique,
1889 220 : btspool->nulls_not_distinct,
1890 : sortmem, coordinate,
1891 : TUPLESORT_NONE);
1892 :
1893 : /*
1894 : * Just as with serial case, there may be a second spool. If so, a
1895 : * second, dedicated spool2 partial tuplesort is required.
1896 : */
1897 220 : if (btspool2)
1898 : {
1899 : SortCoordinate coordinate2;
1900 :
1901 : /*
1902 : * We expect that the second one (for dead tuples) won't get very
1903 : * full, so we give it only work_mem (unless sortmem is less for
1904 : * worker). Worker processes are generally permitted to allocate
1905 : * work_mem independently.
1906 : */
1907 96 : coordinate2 = palloc0_object(SortCoordinateData);
1908 96 : coordinate2->isWorker = true;
1909 96 : coordinate2->nParticipants = -1;
1910 96 : coordinate2->sharedsort = sharedsort2;
1911 96 : btspool2->sortstate =
1912 96 : tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1913 : Min(sortmem, work_mem), coordinate2,
1914 : false);
1915 : }
1916 :
1917 : /* Fill in buildstate for _bt_build_callback() */
1918 220 : buildstate.isunique = btshared->isunique;
1919 220 : buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1920 220 : buildstate.havedead = false;
1921 220 : buildstate.heap = btspool->heap;
1922 220 : buildstate.spool = btspool;
1923 220 : buildstate.spool2 = btspool2;
1924 220 : buildstate.indtuples = 0;
1925 220 : buildstate.btleader = NULL;
1926 :
1927 : /* Join parallel scan */
1928 220 : indexInfo = BuildIndexInfo(btspool->index);
1929 220 : indexInfo->ii_Concurrent = btshared->isconcurrent;
1930 220 : scan = table_beginscan_parallel(btspool->heap,
1931 : ParallelTableScanFromBTShared(btshared));
1932 220 : reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1933 : true, progress, _bt_build_callback,
1934 : &buildstate, scan);
1935 :
1936 : /* Execute this worker's part of the sort */
1937 220 : if (progress)
1938 108 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1939 : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
1940 220 : tuplesort_performsort(btspool->sortstate);
1941 220 : if (btspool2)
1942 : {
1943 96 : if (progress)
1944 48 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1945 : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
1946 96 : tuplesort_performsort(btspool2->sortstate);
1947 : }
1948 :
1949 : /*
1950 : * Done. Record ambuild statistics, and whether we encountered a broken
1951 : * HOT chain.
1952 : */
1953 220 : SpinLockAcquire(&btshared->mutex);
1954 220 : btshared->nparticipantsdone++;
1955 220 : btshared->reltuples += reltuples;
1956 220 : if (buildstate.havedead)
1957 0 : btshared->havedead = true;
1958 220 : btshared->indtuples += buildstate.indtuples;
1959 220 : if (indexInfo->ii_BrokenHotChain)
1960 0 : btshared->brokenhotchain = true;
1961 220 : SpinLockRelease(&btshared->mutex);
1962 :
1963 : /* Notify leader */
1964 220 : ConditionVariableSignal(&btshared->workersdonecv);
1965 :
1966 : /* We can end tuplesorts immediately */
1967 220 : tuplesort_end(btspool->sortstate);
1968 220 : if (btspool2)
1969 96 : tuplesort_end(btspool2->sortstate);
1970 220 : }
|