Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsort.c
4 : * Build a btree from sorted input by loading leaf pages sequentially.
5 : *
6 : * NOTES
7 : *
8 : * We use tuplesort.c to sort the given index tuples into order.
9 : * Then we scan the index tuples in order and build the btree pages
10 : * for each level. We load source tuples into leaf-level pages.
11 : * Whenever we fill a page at one level, we add a link to it in its
12 : * parent level (starting a new parent level if necessary). When
13 : * done, we write out each final page on each level, adding it to
14 : * its parent level. When we have only one page on a level, it must be
15 : * the root -- it can be attached to the btree metapage and we are done.
16 : *
17 : * It is not wise to pack the pages entirely full, since then *any*
18 : * insertion would cause a split (and not only of the leaf page; the need
19 : * for a split would cascade right up the tree). The steady-state load
20 : * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 : * pages to the user-controllable fill factor (default 90%) while upper pages
22 : * are always packed to 70%. This gives us reasonable density (there aren't
23 : * many upper pages if the keys are reasonable-size) without risking a lot of
24 : * cascading splits during early insertions.
25 : *
26 : * We use the bulk smgr loading facility to bypass the buffer cache and
27 : * WAL-log the pages efficiently.
28 : *
29 : * This code isn't concerned about the FSM at all. The caller is responsible
30 : * for initializing that.
31 : *
32 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
33 : * Portions Copyright (c) 1994, Regents of the University of California
34 : *
35 : * IDENTIFICATION
36 : * src/backend/access/nbtree/nbtsort.c
37 : *
38 : *-------------------------------------------------------------------------
39 : */
40 :
41 : #include "postgres.h"
42 :
43 : #include "access/nbtree.h"
44 : #include "access/parallel.h"
45 : #include "access/relscan.h"
46 : #include "access/table.h"
47 : #include "access/tableam.h"
48 : #include "access/xact.h"
49 : #include "catalog/index.h"
50 : #include "commands/progress.h"
51 : #include "executor/instrument.h"
52 : #include "miscadmin.h"
53 : #include "pgstat.h"
54 : #include "storage/bulk_write.h"
55 : #include "storage/proc.h"
56 : #include "tcop/tcopprot.h"
57 : #include "utils/rel.h"
58 : #include "utils/sortsupport.h"
59 : #include "utils/tuplesort.h"
60 :
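/*
 * A minimal sketch (illustration only, guarded out; the authoritative
 * logic is in _bt_pagestate(), below) of the page "fullness" policy
 * described in the NOTES above: a page is deemed full once its free
 * space drops below this many bytes.  Leaf pages honor the relation's
 * fillfactor (default 90), upper pages are always packed to
 * BTREE_NONLEAF_FILLFACTOR (70).
 */
#ifdef NOT_USED
static inline Size
bt_full_threshold_sketch(Relation index, uint32 level)
{
	if (level > 0)
		return BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100;

	/* leaf level: (100 - fillfactor)% of the block */
	return BTGetTargetPageFreeSpace(index);
}
#endif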
61 :
62 : /* Magic numbers for parallel state sharing */
63 : #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
64 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
65 : #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
66 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
67 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
68 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
69 :
70 : /*
71 : * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
72 : * parallel index builds. This may be useful as a debugging aid.
73 : */
74 : /* #define DISABLE_LEADER_PARTICIPATION */
75 :
76 : /*
77 : * Status record for spooling/sorting phase. (Note we may have two of
78 : * these due to the special requirements for uniqueness-checking with
79 : * dead tuples.)
80 : */
81 : typedef struct BTSpool
82 : {
83 : Tuplesortstate *sortstate; /* state data for tuplesort.c */
84 : Relation heap;
85 : Relation index;
86 : bool isunique;
87 : bool nulls_not_distinct;
88 : } BTSpool;
89 :
90 : /*
91 : * Status for index builds performed in parallel. This is allocated in a
92 : * dynamic shared memory segment. Note that there is a separate tuplesort TOC
93 : * entry, private to tuplesort.c but allocated by this module on its behalf.
94 : */
95 : typedef struct BTShared
96 : {
97 : /*
98 : * These fields are not modified during the sort. They primarily exist
99 : * for the benefit of worker processes that need to create BTSpool state
100 : * corresponding to that used by the leader.
101 : */
102 : Oid heaprelid;
103 : Oid indexrelid;
104 : bool isunique;
105 : bool nulls_not_distinct;
106 : bool isconcurrent;
107 : int scantuplesortstates;
108 :
109 : /* Query ID, for reporting in worker processes */
110 : int64 queryid;
111 :
112 : /*
113 : * workersdonecv is used to monitor the progress of workers. All parallel
114 : * participants must indicate that they are done before leader can use
115 : * mutable state that workers maintain during scan (and before leader can
116 : * proceed to tuplesort_performsort()).
117 : */
118 : ConditionVariable workersdonecv;
119 :
120 : /*
121 : * mutex protects the mutable worker-progress fields that follow.
122 : *
123 : * These fields contain status information that the index build needs
124 : * regardless of whether it runs serially or in parallel.
125 : */
126 : slock_t mutex;
127 :
128 : /*
129 : * Mutable state that is maintained by workers, and reported back to
130 : * leader at end of parallel scan.
131 : *
132 : * nparticipantsdone is number of worker processes finished.
133 : *
134 : * reltuples is the total number of input heap tuples.
135 : *
136 : * havedead indicates if RECENTLY_DEAD tuples were encountered during
137 : * build.
138 : *
139 : * indtuples is the total number of tuples that made it into the index.
140 : *
141 : * brokenhotchain indicates if any worker detected a broken HOT chain
142 : * during build.
143 : */
144 : int nparticipantsdone;
145 : double reltuples;
146 : bool havedead;
147 : double indtuples;
148 : bool brokenhotchain;
149 :
150 : /*
151 : * ParallelTableScanDescData data follows. Can't directly embed here, as
152 : * implementations of the parallel table scan desc interface might need
153 : * stronger alignment.
154 : */
155 : } BTShared;
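
/*
 * A minimal sketch (assumed shape, guarded out; the authoritative logic
 * lives in _bt_parallel_scan_and_sort(), below) of how a participant
 * publishes its results: update the mutable fields under the mutex, then
 * wake the leader through workersdonecv.
 */
#ifdef NOT_USED
static inline void
bt_publish_results_sketch(BTShared *btshared, double reltuples,
						  double indtuples, bool havedead,
						  bool brokenhotchain)
{
	SpinLockAcquire(&btshared->mutex);
	btshared->nparticipantsdone++;
	btshared->reltuples += reltuples;
	btshared->indtuples += indtuples;
	btshared->havedead = btshared->havedead || havedead;
	btshared->brokenhotchain = btshared->brokenhotchain || brokenhotchain;
	SpinLockRelease(&btshared->mutex);

	ConditionVariableSignal(&btshared->workersdonecv);
}
#endif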
156 :
157 : /*
158 : * Return pointer to a BTShared's parallel table scan.
159 : *
160 : * cf. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
161 : * MAXALIGN.
162 : */
163 : #define ParallelTableScanFromBTShared(shared) \
164 : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
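
/*
 * A minimal sketch (assumed helper, guarded out) of how a worker would
 * combine the PARALLEL_KEY_* values above with this macro to locate the
 * parallel scan descriptor in the DSM segment.
 */
#ifdef NOT_USED
static inline ParallelTableScanDesc
bt_worker_scan_desc_sketch(shm_toc *toc)
{
	BTShared   *btshared;

	btshared = (BTShared *) shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED,
										   false);

	/* the scan descriptor begins at the BUFFERALIGN'd offset past BTShared */
	return ParallelTableScanFromBTShared(btshared);
}
#endif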
165 :
166 : /*
167 : * Status for leader in parallel index build.
168 : */
169 : typedef struct BTLeader
170 : {
171 : /* parallel context itself */
172 : ParallelContext *pcxt;
173 :
174 : /*
175 : * nparticipanttuplesorts is the exact number of worker processes
176 : * successfully launched, plus one leader process if it participates as a
177 : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
178 : * participating as a worker).
179 : */
180 : int nparticipanttuplesorts;
181 :
182 : /*
183 : * Leader process convenience pointers to shared state (leader avoids TOC
184 : * lookups).
185 : *
186 : * btshared is the shared state for entire build. sharedsort is the
187 : * shared, tuplesort-managed state passed to each process tuplesort.
188 : * sharedsort2 is the corresponding btspool2 shared state, used only when
189 : * building unique indexes. snapshot is the snapshot used by the scan iff
190 : * an MVCC snapshot is required.
191 : */
192 : BTShared *btshared;
193 : Sharedsort *sharedsort;
194 : Sharedsort *sharedsort2;
195 : Snapshot snapshot;
196 : WalUsage *walusage;
197 : BufferUsage *bufferusage;
198 : } BTLeader;
199 :
200 : /*
201 : * Working state for btbuild and its callback.
202 : *
203 : * When parallel CREATE INDEX is used, there is a BTBuildState for each
204 : * participant.
205 : */
206 : typedef struct BTBuildState
207 : {
208 : bool isunique;
209 : bool nulls_not_distinct;
210 : bool havedead;
211 : Relation heap;
212 : BTSpool *spool;
213 :
214 : /*
215 : * spool2 is needed only when the index is a unique index. Dead tuples are
216 : * put into spool2 instead of spool to avoid the uniqueness check.
217 : */
218 : BTSpool *spool2;
219 : double indtuples;
220 :
221 : /*
222 : * btleader is only present when a parallel index build is performed, and
223 : * only in the leader process. (Workers fill in a BTBuildState of their
224 : * own, with their own spool and spool2, but its btleader is always NULL.)
225 : */
226 : BTLeader *btleader;
227 : } BTBuildState;
228 :
229 : /*
230 : * Status record for a btree page being built. We have one of these
231 : * for each active tree level.
232 : */
233 : typedef struct BTPageState
234 : {
235 : BulkWriteBuffer btps_buf; /* workspace for page building */
236 : BlockNumber btps_blkno; /* block # to write this page at */
237 : IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
238 : OffsetNumber btps_lastoff; /* last item offset loaded */
239 : Size btps_lastextra; /* last item's extra posting list space */
240 : uint32 btps_level; /* tree level (0 = leaf) */
241 : Size btps_full; /* "full" if less than this much free space */
242 : struct BTPageState *btps_next; /* link to parent level, if any */
243 : } BTPageState;
244 :
245 : /*
246 : * Overall status record for index writing phase.
247 : */
248 : typedef struct BTWriteState
249 : {
250 : Relation heap;
251 : Relation index;
252 : BulkWriteState *bulkstate;
253 : BTScanInsert inskey; /* generic insertion scankey */
254 : BlockNumber btws_pages_alloced; /* # pages allocated */
255 : } BTWriteState;
256 :
257 :
258 : static double _bt_spools_heapscan(Relation heap, Relation index,
259 : BTBuildState *buildstate, IndexInfo *indexInfo);
260 : static void _bt_spooldestroy(BTSpool *btspool);
261 : static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
262 : const Datum *values, const bool *isnull);
263 : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
264 : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
265 : bool *isnull, bool tupleIsAlive, void *state);
266 : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
267 : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
268 : static void _bt_slideleft(Page rightmostpage);
269 : static void _bt_sortaddtup(Page page, Size itemsize,
270 : const IndexTupleData *itup, OffsetNumber itup_off,
271 : bool newfirstdataitem);
272 : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
273 : IndexTuple itup, Size truncextra);
274 : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
275 : BTPageState *state,
276 : BTDedupState dstate);
277 : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
278 : static void _bt_load(BTWriteState *wstate,
279 : BTSpool *btspool, BTSpool *btspool2);
280 : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
281 : int request);
282 : static void _bt_end_parallel(BTLeader *btleader);
283 : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
284 : static double _bt_parallel_heapscan(BTBuildState *buildstate,
285 : bool *brokenhotchain);
286 : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
287 : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
288 : BTShared *btshared, Sharedsort *sharedsort,
289 : Sharedsort *sharedsort2, int sortmem,
290 : bool progress);
291 :
292 :
293 : /*
294 : * btbuild() -- build a new btree index.
295 : */
296 : IndexBuildResult *
297 26176 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
298 : {
299 : IndexBuildResult *result;
300 : BTBuildState buildstate;
301 : double reltuples;
302 :
303 : #ifdef BTREE_BUILD_STATS
304 : if (log_btree_build_stats)
305 : ResetUsage();
306 : #endif /* BTREE_BUILD_STATS */
307 :
308 26176 : buildstate.isunique = indexInfo->ii_Unique;
309 26176 : buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
310 26176 : buildstate.havedead = false;
311 26176 : buildstate.heap = heap;
312 26176 : buildstate.spool = NULL;
313 26176 : buildstate.spool2 = NULL;
314 26176 : buildstate.indtuples = 0;
315 26176 : buildstate.btleader = NULL;
316 :
317 : /*
318 : * We expect to be called exactly once for any index relation. If that's
319 : * not the case, big trouble's what we have.
320 : */
321 26176 : if (RelationGetNumberOfBlocks(index) != 0)
322 0 : elog(ERROR, "index \"%s\" already contains data",
323 : RelationGetRelationName(index));
324 :
325 26176 : reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
326 :
327 : /*
328 : * Finish the build by (1) completing the sort of the spool file, (2)
329 : * inserting the sorted tuples into btree pages, and (3) building the upper
330 : * levels. Finally, it may also be necessary to end use of parallelism.
331 : */
332 26170 : _bt_leafbuild(buildstate.spool, buildstate.spool2);
333 26128 : _bt_spooldestroy(buildstate.spool);
334 26128 : if (buildstate.spool2)
335 7 : _bt_spooldestroy(buildstate.spool2);
336 26128 : if (buildstate.btleader)
337 80 : _bt_end_parallel(buildstate.btleader);
338 :
339 26128 : result = palloc_object(IndexBuildResult);
340 :
341 26128 : result->heap_tuples = reltuples;
342 26128 : result->index_tuples = buildstate.indtuples;
343 :
344 : #ifdef BTREE_BUILD_STATS
345 : if (log_btree_build_stats)
346 : {
347 : ShowUsage("BTREE BUILD STATS");
348 : ResetUsage();
349 : }
350 : #endif /* BTREE_BUILD_STATS */
351 :
352 26128 : return result;
353 : }
354 :
355 : /*
356 : * Create and initialize one or two spool structures, and save them in caller's
357 : * buildstate argument. May also fill in fields within indexInfo used by index
358 : * builds.
359 : *
360 : * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
361 : * routine encapsulates all aspects of managing parallelism. Caller need only
362 : * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
363 : *
364 : * Returns the total number of heap tuples scanned.
365 : */
366 : static double
367 26176 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
368 : IndexInfo *indexInfo)
369 : {
370 26176 : BTSpool *btspool = palloc0_object(BTSpool);
371 26176 : SortCoordinate coordinate = NULL;
372 26176 : double reltuples = 0;
373 :
374 : /*
375 : * We size the sort area as maintenance_work_mem rather than work_mem to
376 : * speed index creation. This should be OK since a single backend can't
377 : * run multiple index creations in parallel (see also: notes on
378 : * parallelism and maintenance_work_mem below).
379 : */
380 26176 : btspool->heap = heap;
381 26176 : btspool->index = index;
382 26176 : btspool->isunique = indexInfo->ii_Unique;
383 26176 : btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
384 :
385 : /* Save as primary spool */
386 26176 : buildstate->spool = btspool;
387 :
388 : /* Report table scan phase started */
389 26176 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
390 : PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
391 :
392 : /* Attempt to launch parallel worker scan when required */
393 26176 : if (indexInfo->ii_ParallelWorkers > 0)
394 80 : _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
395 : indexInfo->ii_ParallelWorkers);
396 :
397 : /*
398 : * If parallel build requested and at least one worker process was
399 : * successfully launched, set up coordination state
400 : */
401 26176 : if (buildstate->btleader)
402 : {
403 80 : coordinate = palloc0_object(SortCoordinateData);
404 80 : coordinate->isWorker = false;
405 80 : coordinate->nParticipants =
406 80 : buildstate->btleader->nparticipanttuplesorts;
407 80 : coordinate->sharedsort = buildstate->btleader->sharedsort;
408 : }
409 :
410 : /*
411 : * Begin serial/leader tuplesort.
412 : *
413 : * In cases where parallelism is involved, the leader receives the same
414 : * share of maintenance_work_mem as a serial sort (it is generally treated
415 : * in the same way as a serial sort once we return). Parallel worker
416 : * Tuplesortstates will have received only a fraction of
417 : * maintenance_work_mem, though.
418 : *
419 : * We rely on the leader Tuplesortstate's lifetime almost never
420 : * overlapping with any worker Tuplesortstate's lifetime. There may be
421 : * some small overlap, but that's okay because we rely on leader
422 : * Tuplesortstate only allocating a small, fixed amount of memory here.
423 : * When its tuplesort_performsort() is called (by our caller), and
424 : * significant amounts of memory are likely to be used, all workers must
425 : * have already freed almost all memory held by their Tuplesortstates
426 : * (they are about to go away completely, too). The overall effect is
427 : * that maintenance_work_mem always represents an absolute high watermark
428 : * on the amount of memory used by a CREATE INDEX operation, regardless of
429 : * the use of parallelism or any other factor.
430 : */
431 52352 : buildstate->spool->sortstate =
432 26176 : tuplesort_begin_index_btree(heap, index, buildstate->isunique,
433 26176 : buildstate->nulls_not_distinct,
434 : maintenance_work_mem, coordinate,
435 : TUPLESORT_NONE);
436 :
437 : /*
438 : * If building a unique index, put dead tuples in a second spool to keep
439 : * them out of the uniqueness check. We expect that the second spool (for
440 : * dead tuples) won't get very full, so we give it only work_mem.
441 : */
442 26176 : if (indexInfo->ii_Unique)
443 : {
444 21241 : BTSpool *btspool2 = palloc0_object(BTSpool);
445 21241 : SortCoordinate coordinate2 = NULL;
446 :
447 : /* Initialize secondary spool */
448 21241 : btspool2->heap = heap;
449 21241 : btspool2->index = index;
450 21241 : btspool2->isunique = false;
451 : /* Save as secondary spool */
452 21241 : buildstate->spool2 = btspool2;
453 :
454 21241 : if (buildstate->btleader)
455 : {
456 : /*
457 : * Set up non-private state that is passed to
458 : * tuplesort_begin_index_btree() about the basic high level
459 : * coordination of a parallel sort.
460 : */
461 35 : coordinate2 = palloc0_object(SortCoordinateData);
462 35 : coordinate2->isWorker = false;
463 35 : coordinate2->nParticipants =
464 35 : buildstate->btleader->nparticipanttuplesorts;
465 35 : coordinate2->sharedsort = buildstate->btleader->sharedsort2;
466 : }
467 :
468 : /*
469 : * As noted above, the second spool (for dead tuples) is not expected
470 : * to get very full, so we give it only work_mem.
471 : */
472 21241 : buildstate->spool2->sortstate =
473 21241 : tuplesort_begin_index_btree(heap, index, false, false, work_mem,
474 : coordinate2, TUPLESORT_NONE);
475 : }
476 :
477 : /* Fill spool using either serial or parallel heap scan */
478 26176 : if (!buildstate->btleader)
479 26096 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
480 : _bt_build_callback, buildstate,
481 : NULL);
482 : else
483 80 : reltuples = _bt_parallel_heapscan(buildstate,
484 : &indexInfo->ii_BrokenHotChain);
485 :
486 : /*
487 : * Set the progress target for the next phase. Reset the block number
488 : * values set by table_index_build_scan.
489 : */
490 : {
491 26170 : const int progress_index[] = {
492 : PROGRESS_CREATEIDX_TUPLES_TOTAL,
493 : PROGRESS_SCAN_BLOCKS_TOTAL,
494 : PROGRESS_SCAN_BLOCKS_DONE
495 : };
496 26170 : const int64 progress_vals[] = {
497 26170 : buildstate->indtuples,
498 : 0, 0
499 : };
500 :
501 26170 : pgstat_progress_update_multi_param(3, progress_index, progress_vals);
502 : }
503 :
504 : /* okay, all heap tuples are spooled */
505 26170 : if (buildstate->spool2 && !buildstate->havedead)
506 : {
507 : /* spool2 turns out to be unnecessary */
508 21234 : _bt_spooldestroy(buildstate->spool2);
509 21234 : buildstate->spool2 = NULL;
510 : }
511 :
512 26170 : return reltuples;
513 : }
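
/*
 * A minimal sketch (assumed helper, guarded out; cf.
 * _bt_leader_participate_as_worker(), further down) of the
 * maintenance_work_mem accounting discussed above: each participant
 * tuplesort gets an even share, keeping total sort memory within the
 * maintenance_work_mem high watermark.
 */
#ifdef NOT_USED
static inline int
bt_participant_sortmem_sketch(BTLeader *btleader)
{
	return maintenance_work_mem / btleader->nparticipanttuplesorts;
}
#endif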
514 :
515 : /*
516 : * clean up a spool structure and its substructures.
517 : */
518 : static void
519 47369 : _bt_spooldestroy(BTSpool *btspool)
520 : {
521 47369 : tuplesort_end(btspool->sortstate);
522 47369 : pfree(btspool);
523 47369 : }
524 :
525 : /*
526 : * spool an index entry into the sort file.
527 : */
528 : static void
529 6439087 : _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
530 : {
531 6439087 : tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
532 : self, values, isnull);
533 6439087 : }
534 :
535 : /*
536 : * given a spool loaded by successive calls to _bt_spool,
537 : * create an entire btree.
538 : */
539 : static void
540 26170 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
541 : {
542 : BTWriteState wstate;
543 :
544 : #ifdef BTREE_BUILD_STATS
545 : if (log_btree_build_stats)
546 : {
547 : ShowUsage("BTREE BUILD (Spool) STATISTICS");
548 : ResetUsage();
549 : }
550 : #endif /* BTREE_BUILD_STATS */
551 :
552 : /* Execute the sort */
553 26170 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
554 : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
555 26170 : tuplesort_performsort(btspool->sortstate);
556 26128 : if (btspool2)
557 : {
558 7 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
559 : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
560 7 : tuplesort_performsort(btspool2->sortstate);
561 : }
562 :
563 26128 : wstate.heap = btspool->heap;
564 26128 : wstate.index = btspool->index;
565 26128 : wstate.inskey = _bt_mkscankey(wstate.index, NULL);
566 : /* _bt_mkscankey() won't set allequalimage without metapage */
567 26128 : wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
568 :
569 : /* reserve the metapage */
570 26128 : wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
571 :
572 26128 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
573 : PROGRESS_BTREE_PHASE_LEAF_LOAD);
574 26128 : _bt_load(&wstate, btspool, btspool2);
575 26128 : }
576 :
577 : /*
578 : * Per-tuple callback for table_index_build_scan
579 : */
580 : static void
581 6439087 : _bt_build_callback(Relation index,
582 : ItemPointer tid,
583 : Datum *values,
584 : bool *isnull,
585 : bool tupleIsAlive,
586 : void *state)
587 : {
588 6439087 : BTBuildState *buildstate = (BTBuildState *) state;
589 :
590 : /*
591 : * insert the index tuple into the appropriate spool file for subsequent
592 : * processing
593 : */
594 6439087 : if (tupleIsAlive || buildstate->spool2 == NULL)
595 6438850 : _bt_spool(buildstate->spool, tid, values, isnull);
596 : else
597 : {
598 : /* dead tuples are put into spool2 */
599 237 : buildstate->havedead = true;
600 237 : _bt_spool(buildstate->spool2, tid, values, isnull);
601 : }
602 :
603 6439087 : buildstate->indtuples += 1;
604 6439087 : }
605 :
606 : /*
607 : * allocate workspace for a new, clean btree page, not linked to any siblings.
608 : */
609 : static BulkWriteBuffer
610 26799 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
611 : {
612 : BulkWriteBuffer buf;
613 : Page page;
614 : BTPageOpaque opaque;
615 :
616 26799 : buf = smgr_bulk_get_buf(wstate->bulkstate);
617 26799 : page = (Page) buf;
618 :
619 : /* Zero the page and set up standard page header info */
620 26799 : _bt_pageinit(page, BLCKSZ);
621 :
622 : /* Initialize BT opaque state */
623 26799 : opaque = BTPageGetOpaque(page);
624 26799 : opaque->btpo_prev = opaque->btpo_next = P_NONE;
625 26799 : opaque->btpo_level = level;
626 26799 : opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
627 26799 : opaque->btpo_cycleid = 0;
628 :
629 : /* Make the P_HIKEY line pointer appear allocated */
630 26799 : ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
631 :
632 26799 : return buf;
633 : }
634 :
635 : /*
636 : * emit a completed btree page, and release the working storage.
637 : */
638 : static void
639 52927 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
640 : {
641 52927 : smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
642 : /* smgr_bulk_write took ownership of 'buf' */
643 52927 : }
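
/*
 * A minimal sketch (illustration only, guarded out) of the bulk-write
 * lifecycle used throughout this file, per storage/bulk_write.h: start a
 * bulk relation write, obtain and initialize a buffer, hand it off, then
 * finish.  _bt_load() drives the real sequence.
 */
#ifdef NOT_USED
static void
bt_bulk_write_lifecycle_sketch(Relation index)
{
	BulkWriteState *bulkstate = smgr_bulk_start_rel(index, MAIN_FORKNUM);
	BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

	_bt_pageinit((Page) buf, BLCKSZ);

	/* smgr_bulk_write takes ownership of 'buf' */
	smgr_bulk_write(bulkstate, BTREE_METAPAGE + 1, buf, true);

	/* flush pending writes; WAL-logging happens as required */
	smgr_bulk_finish(bulkstate);
}
#endif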
644 :
645 : /*
646 : * allocate and initialize a new BTPageState. the returned structure
647 : * is suitable for immediate use by _bt_buildadd.
648 : */
649 : static BTPageState *
650 6649 : _bt_pagestate(BTWriteState *wstate, uint32 level)
651 : {
652 6649 : BTPageState *state = palloc0_object(BTPageState);
653 :
654 : /* create initial page for level */
655 6649 : state->btps_buf = _bt_blnewpage(wstate, level);
656 :
657 : /* and assign it a page position */
658 6649 : state->btps_blkno = wstate->btws_pages_alloced++;
659 :
660 6649 : state->btps_lowkey = NULL;
661 : /* initialize lastoff so first item goes into P_FIRSTKEY */
662 6649 : state->btps_lastoff = P_HIKEY;
663 6649 : state->btps_lastextra = 0;
664 6649 : state->btps_level = level;
665 : /* set "full" threshold based on level. See notes at head of file. */
666 6649 : if (level > 0)
667 1367 : state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
668 : else
669 5282 : state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
670 :
671 : /* no parent level, yet */
672 6649 : state->btps_next = NULL;
673 :
674 6649 : return state;
675 : }
676 :
677 : /*
678 : * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
679 : * P_HIKEY, overwriting P_HIKEY).
680 : *
681 : * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
682 : * rightmost page on its level is not supposed to get a high key. Now that
683 : * it's clear that this page is a rightmost page, remove the unneeded empty
684 : * P_HIKEY line pointer space.
685 : */
686 : static void
687 6649 : _bt_slideleft(Page rightmostpage)
688 : {
689 : OffsetNumber off;
690 : OffsetNumber maxoff;
691 : ItemId previi;
692 :
693 6649 : maxoff = PageGetMaxOffsetNumber(rightmostpage);
694 : Assert(maxoff >= P_FIRSTKEY);
695 6649 : previi = PageGetItemId(rightmostpage, P_HIKEY);
696 399535 : for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
697 : {
698 392886 : ItemId thisii = PageGetItemId(rightmostpage, off);
699 :
700 392886 : *previi = *thisii;
701 392886 : previi = thisii;
702 : }
703 6649 : ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
704 6649 : }
705 :
706 : /*
707 : * Add an item to a page being built.
708 : *
709 : * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
710 : * raises an error directly.
711 : *
712 : * Note that our nbtsort.c caller does not know yet if the page will be
713 : * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
714 : * the caller. A page that turns out to be the rightmost on its level is fixed by
715 : * calling _bt_slideleft().
716 : */
717 : static void
718 5791878 : _bt_sortaddtup(Page page,
719 : Size itemsize,
720 : const IndexTupleData *itup,
721 : OffsetNumber itup_off,
722 : bool newfirstdataitem)
723 : {
724 : IndexTupleData trunctuple;
725 :
726 5791878 : if (newfirstdataitem)
727 : {
728 1429 : trunctuple = *itup;
729 1429 : trunctuple.t_info = sizeof(IndexTupleData);
730 1429 : BTreeTupleSetNAtts(&trunctuple, 0, false);
731 1429 : itup = &trunctuple;
732 1429 : itemsize = sizeof(IndexTupleData);
733 : }
734 :
735 5791878 : if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
736 0 : elog(ERROR, "failed to add item to the index page");
737 5791878 : }
738 :
739 : /*----------
740 : * Add an item to a disk page from the sort output (or add a posting list
741 : * item formed from the sort output).
742 : *
743 : * We must be careful to observe the page layout conventions of nbtsearch.c:
744 : * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
745 : * - on non-leaf pages, the key portion of the first item need not be
746 : * stored, we should store only the link.
747 : *
748 : * A leaf page being built looks like:
749 : *
750 : * +----------------+---------------------------------+
751 : * | PageHeaderData | linp0 linp1 linp2 ...           |
752 : * +-----------+----+---------------------------------+
753 : * | ... linpN |                                      |
754 : * +-----------+--------------------------------------+
755 : * |     ^ last                                       |
756 : * |                                                  |
757 : * +-------------+------------------------------------+
758 : * |             | itemN ...                          |
759 : * +-------------+------------------+-----------------+
760 : * |       ... item3 item2 item1    | "special space" |
761 : * +--------------------------------+-----------------+
762 : *
763 : * Contrast this with the diagram in bufpage.h; note the mismatch
764 : * between linps and items. This is because we reserve linp0 as a
765 : * placeholder for the pointer to the "high key" item; when we have
766 : * filled up the page, we will set linp0 to point to itemN and clear
767 : * linpN. On the other hand, if we find this is the last (rightmost)
768 : * page, we leave the items alone and slide the linp array over. If
769 : * the high key is to be truncated, offset 1 is deleted, and we insert
770 : * the truncated high key at offset 1.
771 : *
772 : * 'last' pointer indicates the last offset added to the page.
773 : *
774 : * 'truncextra' is the size of the posting list in itup, if any. This
775 : * information is stashed for the next call here, when we may benefit
776 : * from considering the impact of truncating away the posting list on
777 : * the page before deciding to finish the page off. Posting lists are
778 : * often relatively large, so it is worth going to the trouble of
779 : * accounting for the saving from truncating away the posting list of
780 : * the tuple that becomes the high key (that may be the only way to
781 : * get close to target free space on the page). Note that this is
782 : * only used for the soft fillfactor-wise limit, not the critical hard
783 : * limit.
784 : *----------
785 : */
786 : static void
787 5771728 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
788 : Size truncextra)
789 : {
790 : BulkWriteBuffer nbuf;
791 : Page npage;
792 : BlockNumber nblkno;
793 : OffsetNumber last_off;
794 : Size last_truncextra;
795 : Size pgspc;
796 : Size itupsz;
797 : bool isleaf;
798 :
799 : /*
800 : * This is a handy place to check for cancel interrupts during the btree
801 : * load phase of index creation.
802 : */
803 5771728 : CHECK_FOR_INTERRUPTS();
804 :
805 5771728 : nbuf = state->btps_buf;
806 5771728 : npage = (Page) nbuf;
807 5771728 : nblkno = state->btps_blkno;
808 5771728 : last_off = state->btps_lastoff;
809 5771728 : last_truncextra = state->btps_lastextra;
810 5771728 : state->btps_lastextra = truncextra;
811 :
812 5771728 : pgspc = PageGetFreeSpace(npage);
813 5771728 : itupsz = IndexTupleSize(itup);
814 5771728 : itupsz = MAXALIGN(itupsz);
815 : /* Leaf case has slightly different rules due to suffix truncation */
816 5771728 : isleaf = (state->btps_level == 0);
817 :
818 : /*
819 : * Check whether the new item can fit on a btree page on current level at
820 : * all.
821 : *
822 : * Every newly built index will treat heap TID as part of the keyspace,
823 : * which imposes the requirement that new high keys must occasionally have
824 : * a heap TID appended within _bt_truncate(). That may leave a new pivot
825 : * tuple one or two MAXALIGN() quantums larger than the original
826 : * firstright tuple it's derived from. v4 deals with the problem by
827 : * decreasing the limit on the size of tuples inserted on the leaf level
828 : * by the same small amount. Enforce the new v4+ limit on the leaf level,
829 : * and the old limit on internal levels, since pivot tuples may need to
830 : * make use of the reserved space. This should never fail on internal
831 : * pages.
832 : */
833 5771728 : if (unlikely(itupsz > BTMaxItemSize))
834 132 : _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
835 : itup);
836 :
837 : /*
838 : * Check to see if current page will fit new item, with space left over to
839 : * append a heap TID during suffix truncation when page is a leaf page.
840 : *
841 : * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
842 : * high key with heap TID when finishing off a leaf page, since we rely on
843 : * _bt_check_third_page() rejecting oversized non-pivot tuples. On
844 : * internal pages we can always fit 3 pivot tuples with larger internal
845 : * page tuple limit (includes page high key).
846 : *
847 : * Most of the time, a page is only "full" in the sense that the soft
848 : * fillfactor-wise limit has been exceeded. However, we must always leave
849 : * at least two items plus a high key on each page before starting a new
850 : * page. Disregard fillfactor and insert on "full" current page if we
851 : * don't have the minimum number of items yet. (Note that we deliberately
852 : * assume that suffix truncation neither enlarges nor shrinks new high key
853 : * when applying soft limit, except when last tuple has a posting list.)
854 : */
855 : Assert(last_truncextra == 0 || isleaf);
856 5771728 : if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
857 5771150 : (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
858 : {
859 : /*
860 : * Finish off the page and write it out.
861 : */
862 20150 : BulkWriteBuffer obuf = nbuf;
863 20150 : Page opage = npage;
864 20150 : BlockNumber oblkno = nblkno;
865 : ItemId ii;
866 : ItemId hii;
867 : IndexTuple oitup;
868 :
869 : /* Create new page of same level */
870 20150 : nbuf = _bt_blnewpage(wstate, state->btps_level);
871 20150 : npage = (Page) nbuf;
872 :
873 : /* and assign it a page position */
874 20150 : nblkno = wstate->btws_pages_alloced++;
875 :
876 : /*
877 : * We copy the last item on the page into the new page, and then
878 : * rearrange the old page so that the 'last item' becomes its high key
879 : * rather than a true data item. There had better be at least two
880 : * items on the page already, else the page would be empty of useful
881 : * data.
882 : */
883 : Assert(last_off > P_FIRSTKEY);
884 20150 : ii = PageGetItemId(opage, last_off);
885 20150 : oitup = (IndexTuple) PageGetItem(opage, ii);
886 20150 : _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
887 20150 : !isleaf);
888 :
889 : /*
890 : * Move 'last' into the high key position on opage. _bt_blnewpage()
891 : * allocated empty space for a line pointer when opage was first
892 : * created, so this is a matter of rearranging already-allocated space
893 : * on page, and initializing high key line pointer. (Actually, leaf
894 : * pages must also swap oitup with a truncated version of oitup, which
895 : * is sometimes larger than oitup, though never by more than the space
896 : * needed to append a heap TID.)
897 : */
898 20150 : hii = PageGetItemId(opage, P_HIKEY);
899 20150 : *hii = *ii;
900 20150 : ItemIdSetUnused(ii); /* redundant */
901 20150 : ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
902 :
903 20150 : if (isleaf)
904 : {
905 : IndexTuple lastleft;
906 : IndexTuple truncated;
907 :
908 : /*
909 : * Truncate away any unneeded attributes from high key on leaf
910 : * level. This is only done at the leaf level because downlinks
911 : * in internal pages are either negative infinity items, or get
912 : * their contents from copying from one level down. See also:
913 : * _bt_split().
914 : *
915 : * We don't try to bias our choice of split point to make it more
916 : * likely that _bt_truncate() can truncate away more attributes,
917 : * whereas the split point used within _bt_split() is chosen much
918 : * more delicately. Even still, the lastleft and firstright
919 : * tuples passed to _bt_truncate() here are at least not fully
920 : * equal to each other when deduplication is used, unless there is
921 : * a large group of duplicates (also, unique index builds usually
922 : * have few or no spool2 duplicates). When the split point is
923 : * between two unequal tuples, _bt_truncate() will avoid including
924 : * a heap TID in the new high key, which is the most important
925 : * benefit of suffix truncation.
926 : *
927 : * Overwrite the old item with new truncated high key directly.
928 : * oitup is already located at the physical beginning of tuple
929 : * space, so this should directly reuse the existing tuple space.
930 : */
931 20088 : ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
932 20088 : lastleft = (IndexTuple) PageGetItem(opage, ii);
933 :
934 : Assert(IndexTupleSize(oitup) > last_truncextra);
935 20088 : truncated = _bt_truncate(wstate->index, lastleft, oitup,
936 : wstate->inskey);
937 20088 : if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
938 0 : elog(ERROR, "failed to add high key to the index page");
939 20088 : pfree(truncated);
940 :
941 : /* oitup should continue to point to the page's high key */
942 20088 : hii = PageGetItemId(opage, P_HIKEY);
943 20088 : oitup = (IndexTuple) PageGetItem(opage, hii);
944 : }
945 :
946 : /*
947 : * Link the old page into its parent, using its low key. If we don't
948 : * have a parent, we have to create one; this adds a new btree level.
949 : */
950 20150 : if (state->btps_next == NULL)
951 1367 : state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
952 :
953 : Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
954 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
955 : BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
956 : P_LEFTMOST(BTPageGetOpaque(opage)));
957 : Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
958 : !P_LEFTMOST(BTPageGetOpaque(opage)));
959 20150 : BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
960 20150 : _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
961 20150 : pfree(state->btps_lowkey);
962 :
963 : /*
964 : * Save a copy of the high key from the old page. It is also the low
965 : * key for the new page.
966 : */
967 20150 : state->btps_lowkey = CopyIndexTuple(oitup);
968 :
969 : /*
970 : * Set the sibling links for both pages.
971 : */
972 : {
973 20150 : BTPageOpaque oopaque = BTPageGetOpaque(opage);
974 20150 : BTPageOpaque nopaque = BTPageGetOpaque(npage);
975 :
976 20150 : oopaque->btpo_next = nblkno;
977 20150 : nopaque->btpo_prev = oblkno;
978 20150 : nopaque->btpo_next = P_NONE; /* redundant */
979 : }
980 :
981 : /*
982 : * Write out the old page. _bt_blwritepage takes ownership of the
983 : * 'opage' buffer.
984 : */
985 20150 : _bt_blwritepage(wstate, obuf, oblkno);
986 :
987 : /*
988 : * Reset last_off to point to new page
989 : */
990 20150 : last_off = P_FIRSTKEY;
991 : }
992 :
993 : /*
994 : * By here, either original page is still the current page, or a new page
995 : * was created that became the current page. Either way, the current page
996 : * definitely has space for new item.
997 : *
998 : * If the new item is the first for its page, it must also be the first
999 : * item on its entire level. On later same-level pages, a low key for a
1000 : * page will be copied from the prior page in the code above. Generate a
1001 : * minus infinity low key here instead.
1002 : */
1003 5771728 : if (last_off == P_HIKEY)
1004 : {
1005 : Assert(state->btps_lowkey == NULL);
1006 6649 : state->btps_lowkey = palloc0_object(IndexTupleData);
1007 6649 : state->btps_lowkey->t_info = sizeof(IndexTupleData);
1008 6649 : BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1009 : }
1010 :
1011 : /*
1012 : * Add the new item into the current page.
1013 : */
1014 5771728 : last_off = OffsetNumberNext(last_off);
1015 5771728 : _bt_sortaddtup(npage, itupsz, itup, last_off,
1016 5771728 : !isleaf && last_off == P_FIRSTKEY);
1017 :
1018 5771728 : state->btps_buf = nbuf;
1019 5771728 : state->btps_blkno = nblkno;
1020 5771728 : state->btps_lastoff = last_off;
1021 5771728 : }
1022 :
1023 : /*
1024 : * Finalize pending posting list tuple, and add it to the index. Final tuple
1025 : * is based on saved base tuple, and saved list of heap TIDs.
1026 : *
1027 : * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1028 : * using _bt_buildadd().
1029 : */
1030 : static void
1031 2379209 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
1032 : BTDedupState dstate)
1033 : {
1034 : Assert(dstate->nitems > 0);
1035 :
1036 2379209 : if (dstate->nitems == 1)
1037 2359022 : _bt_buildadd(wstate, state, dstate->base, 0);
1038 : else
1039 : {
1040 : IndexTuple postingtuple;
1041 : Size truncextra;
1042 :
1043 : /* form a tuple with a posting list */
1044 20187 : postingtuple = _bt_form_posting(dstate->base,
1045 20187 : dstate->htids,
1046 : dstate->nhtids);
1047 : /* Calculate posting list overhead */
1048 20187 : truncextra = IndexTupleSize(postingtuple) -
1049 20187 : BTreeTupleGetPostingOffset(postingtuple);
1050 :
1051 20187 : _bt_buildadd(wstate, state, postingtuple, truncextra);
1052 20187 : pfree(postingtuple);
1053 : }
1054 :
1055 2379209 : dstate->nmaxitems = 0;
1056 2379209 : dstate->nhtids = 0;
1057 2379209 : dstate->nitems = 0;
1058 2379209 : dstate->phystupsize = 0;
1059 2379209 : }
1060 :
1061 : /*
1062 : * Finish writing out the completed btree.
1063 : */
1064 : static void
1065 26128 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1066 : {
1067 : BTPageState *s;
1068 26128 : BlockNumber rootblkno = P_NONE;
1069 26128 : uint32 rootlevel = 0;
1070 : BulkWriteBuffer metabuf;
1071 :
1072 : /*
1073 : * Each iteration of this loop completes one more level of the tree.
1074 : */
1075 32777 : for (s = state; s != NULL; s = s->btps_next)
1076 : {
1077 : BlockNumber blkno;
1078 : BTPageOpaque opaque;
1079 :
1080 6649 : blkno = s->btps_blkno;
1081 6649 : opaque = BTPageGetOpaque((Page) s->btps_buf);
1082 :
1083 : /*
1084 : * We have to link the last page on this level to somewhere.
1085 : *
1086 : * If we're at the top, it's the root, so attach it to the metapage.
1087 : * Otherwise, add an entry for it to its parent using its low key.
1088 : * This may cause the last page of the parent level to split, but
1089 : * that's not a problem -- we haven't gotten to it yet.
1090 : */
1091 6649 : if (s->btps_next == NULL)
1092 : {
1093 5282 : opaque->btpo_flags |= BTP_ROOT;
1094 5282 : rootblkno = blkno;
1095 5282 : rootlevel = s->btps_level;
1096 : }
1097 : else
1098 : {
1099 : Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1100 : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1101 : BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1102 : P_LEFTMOST(opaque));
1103 : Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1104 : !P_LEFTMOST(opaque));
1105 1367 : BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1106 1367 : _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1107 1367 : pfree(s->btps_lowkey);
1108 1367 : s->btps_lowkey = NULL;
1109 : }
1110 :
1111 : /*
1112 : * This is the rightmost page, so the ItemId array needs to be slid
1113 : * back one slot. Then we can dump out the page.
1114 : */
1115 6649 : _bt_slideleft((Page) s->btps_buf);
1116 6649 : _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
1117 6649 : s->btps_buf = NULL; /* writepage took ownership of the buffer */
1118 : }
1119 :
1120 : /*
1121 : * As the last step in the process, construct the metapage and make it
1122 : * point to the new root (unless we had no data at all, in which case it's
1123 : * set to point to "P_NONE"). This changes the index to the "valid" state
1124 : * by filling in a valid magic number in the metapage.
1125 : */
1126 26128 : metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1127 26128 : _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
1128 26128 : wstate->inskey->allequalimage);
1129 26128 : _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
1130 26128 : }
1131 :
1132 : /*
1133 : * Read tuples in correct sort order from tuplesort, and load them into
1134 : * btree leaves.
1135 : */
1136 : static void
1137 26128 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1138 : {
1139 26128 : BTPageState *state = NULL;
1140 26128 : bool merge = (btspool2 != NULL);
1141 : IndexTuple itup,
1142 26128 : itup2 = NULL;
1143 : bool load1;
1144 26128 : TupleDesc tupdes = RelationGetDescr(wstate->index);
1145 : int i,
1146 26128 : keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1147 : SortSupport sortKeys;
1148 26128 : int64 tuples_done = 0;
1149 : bool deduplicate;
1150 :
1151 26128 : wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
1152 :
1153 30929 : deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1154 4801 : BTGetDeduplicateItems(wstate->index);
1155 :
1156 26128 : if (merge)
1157 : {
1158 : /*
1159 : * Another BTSpool for dead tuples exists. Now we have to merge
1160 : * btspool and btspool2.
1161 : */
1162 :
1163 : /* prepare for the merge */
1164 7 : itup = tuplesort_getindextuple(btspool->sortstate, true);
1165 7 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1166 :
1167 : /* Prepare SortSupport data for each column */
1168 7 : sortKeys = palloc0_array(SortSupportData, keysz);
1169 :
1170 16 : for (i = 0; i < keysz; i++)
1171 : {
1172 9 : SortSupport sortKey = sortKeys + i;
1173 9 : ScanKey scanKey = wstate->inskey->scankeys + i;
1174 : bool reverse;
1175 :
1176 9 : sortKey->ssup_cxt = CurrentMemoryContext;
1177 9 : sortKey->ssup_collation = scanKey->sk_collation;
1178 9 : sortKey->ssup_nulls_first =
1179 9 : (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1180 9 : sortKey->ssup_attno = scanKey->sk_attno;
1181 : /* Abbreviation is not supported here */
1182 9 : sortKey->abbreviate = false;
1183 :
1184 : Assert(sortKey->ssup_attno != 0);
1185 :
1186 9 : reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
1187 :
1188 9 : PrepareSortSupportFromIndexRel(wstate->index, reverse, sortKey);
1189 : }
1190 :
1191 : for (;;)
1192 : {
1193 1635 : load1 = true; /* load BTSpool next ? */
1194 1635 : if (itup2 == NULL)
1195 : {
1196 72 : if (itup == NULL)
1197 7 : break;
1198 : }
1199 1563 : else if (itup != NULL)
1200 : {
1201 1469 : int32 compare = 0;
1202 :
1203 1582 : for (i = 1; i <= keysz; i++)
1204 : {
1205 : SortSupport entry;
1206 : Datum attrDatum1,
1207 : attrDatum2;
1208 : bool isNull1,
1209 : isNull2;
1210 :
1211 1496 : entry = sortKeys + i - 1;
1212 1496 : attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1213 1496 : attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1214 :
1215 1496 : compare = ApplySortComparator(attrDatum1, isNull1,
1216 : attrDatum2, isNull2,
1217 : entry);
1218 1496 : if (compare > 0)
1219 : {
1220 137 : load1 = false;
1221 1383 : break;
1222 : }
1223 1359 : else if (compare < 0)
1224 1246 : break;
1225 : }
1226 :
1227 : /*
1228 : * If key values are equal, we sort on ItemPointer. This is
1229 : * required for btree indexes, since heap TID is treated as an
1230 : * implicit last key attribute in order to ensure that all
1231 : * keys in the index are physically unique.
1232 : */
1233 1469 : if (compare == 0)
1234 : {
1235 86 : compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1236 : Assert(compare != 0);
1237 86 : if (compare > 0)
1238 6 : load1 = false;
1239 : }
1240 : }
1241 : else
1242 94 : load1 = false;
1243 :
1244 : /* When we see first tuple, create first index page */
1245 1628 : if (state == NULL)
1246 7 : state = _bt_pagestate(wstate, 0);
1247 :
1248 1628 : if (load1)
1249 : {
1250 1391 : _bt_buildadd(wstate, state, itup, 0);
1251 1391 : itup = tuplesort_getindextuple(btspool->sortstate, true);
1252 : }
1253 : else
1254 : {
1255 237 : _bt_buildadd(wstate, state, itup2, 0);
1256 237 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1257 : }
1258 :
1259 : /* Report progress */
1260 1628 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1261 : ++tuples_done);
1262 : }
1263 7 : pfree(sortKeys);
1264 : }
1265 26121 : else if (deduplicate)
1266 : {
1267 : /* merge is unnecessary, deduplicate into posting lists */
1268 : BTDedupState dstate;
1269 :
1270 4801 : dstate = palloc_object(BTDedupStateData);
1271 4801 : dstate->deduplicate = true; /* unused */
1272 4801 : dstate->nmaxitems = 0; /* unused */
1273 4801 : dstate->maxpostingsize = 0; /* set later */
1274 : /* Metadata about base tuple of current pending posting list */
1275 4801 : dstate->base = NULL;
1276 4801 : dstate->baseoff = InvalidOffsetNumber; /* unused */
1277 4801 : dstate->basetupsize = 0;
1278 : /* Metadata about current pending posting list TIDs */
1279 4801 : dstate->htids = NULL;
1280 4801 : dstate->nhtids = 0;
1281 4801 : dstate->nitems = 0;
1282 4801 : dstate->phystupsize = 0; /* unused */
1283 4801 : dstate->nintervals = 0; /* unused */
1284 :
1285 3072697 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
1286 3072697 : true)) != NULL)
1287 : {
1288 : /* When we see first tuple, create first index page */
1289 3067896 : if (state == NULL)
1290 : {
1291 1325 : state = _bt_pagestate(wstate, 0);
1292 :
1293 : /*
1294 : * Limit size of posting list tuples to 1/10 of the space we want to
1295 : * leave behind on the page, plus space for final item's line
1296 : * pointer. This is equal to the space that we'd like to
1297 : * leave behind on each leaf page when fillfactor is 90,
1298 : * allowing us to get close to fillfactor% space utilization
1299 : * when there happen to be a great many duplicates. (This
1300 : * makes higher leaf fillfactor settings ineffective when
1301 : * building indexes that have many duplicates, but packing
1302 : * leaf pages full with few very large tuples doesn't seem
1303 : * like a useful goal.)
1304 : */
1305 1325 : dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1306 : sizeof(ItemIdData);
1307 : Assert(dstate->maxpostingsize <= BTMaxItemSize &&
1308 : dstate->maxpostingsize <= INDEX_SIZE_MASK);
1309 1325 : dstate->htids = palloc(dstate->maxpostingsize);
1310 :
1311 : /* start new pending posting list with itup copy */
1312 1325 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1313 : InvalidOffsetNumber);
1314 : }
1315 3066571 : else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1316 692677 : itup) > keysz &&
1317 692677 : _bt_dedup_save_htid(dstate, itup))
1318 : {
1319 : /*
1320 : * Tuple is equal to base tuple of pending posting list. Heap
1321 : * TID from itup has been saved in state.
1322 : */
1323 : }
1324 : else
1325 : {
1326 : /*
1327 : * Tuple is not equal to pending posting list tuple, or
1328 : * _bt_dedup_save_htid() opted to not merge current item into
1329 : * pending posting list.
1330 : */
1331 2377884 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1332 2377884 : pfree(dstate->base);
1333 :
1334 : /* start new pending posting list with itup copy */
1335 2377884 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1336 : InvalidOffsetNumber);
1337 : }
1338 :
1339 : /* Report progress */
1340 3067896 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1341 : ++tuples_done);
1342 : }
1343 :
1344 4801 : if (state)
1345 : {
1346 : /*
1347 : * Handle the last item (there must be a last item when the
1348 : * tuplesort returned one or more tuples)
1349 : */
1350 1325 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1351 1325 : pfree(dstate->base);
1352 1325 : pfree(dstate->htids);
1353 : }
1354 :
1355 4801 : pfree(dstate);
1356 : }
1357 : else
1358 : {
1359 : /* merging and deduplication are both unnecessary */
1360 3390694 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
1361 3390694 : true)) != NULL)
1362 : {
1363 : /* When we see first tuple, create first index page */
1364 3369374 : if (state == NULL)
1365 3950 : state = _bt_pagestate(wstate, 0);
1366 :
1367 3369374 : _bt_buildadd(wstate, state, itup, 0);
1368 :
1369 : /* Report progress */
1370 3369374 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1371 : ++tuples_done);
1372 : }
1373 : }
1374 :
1375 : /* Close down final pages and write the metapage */
1376 26128 : _bt_uppershutdown(wstate, state);
1377 26128 : smgr_bulk_finish(wstate->bulkstate);
1378 26128 : }
1379 :
1380 : /*
1381 : * Create parallel context, and launch workers for leader.
1382 : *
1383 : * buildstate argument should be initialized (with the exception of the
1384 : * tuplesort state in spools, which may later be created based on shared
1385 : * state initially set up here).
1386 : *
1387 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1388 : *
1389 : * request is the target number of parallel worker processes to launch.
1390 : *
1391 : * Sets buildstate's BTLeader, which caller must use to shut down parallel
1392 : * mode by passing it to _bt_end_parallel() at the very end of its index
1393 : * build. If not even a single worker process can be launched, this is
1394 : * never set, and caller should proceed with a serial index build.
1395 : */
1396 : static void
1397 80 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1398 : {
1399 : ParallelContext *pcxt;
1400 : int scantuplesortstates;
1401 : Snapshot snapshot;
1402 : Size estbtshared;
1403 : Size estsort;
1404 : BTShared *btshared;
1405 : Sharedsort *sharedsort;
1406 : Sharedsort *sharedsort2;
1407 80 : BTSpool *btspool = buildstate->spool;
1408 80 : BTLeader *btleader = palloc0_object(BTLeader);
1409 : WalUsage *walusage;
1410 : BufferUsage *bufferusage;
1411 80 : bool leaderparticipates = true;
1412 : int querylen;
1413 :
1414 : #ifdef DISABLE_LEADER_PARTICIPATION
1415 : leaderparticipates = false;
1416 : #endif
1417 :
1418 : /*
1419 : * Enter parallel mode, and create context for parallel build of btree
1420 : * index
1421 : */
1422 80 : EnterParallelMode();
1423 : Assert(request > 0);
1424 80 : pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1425 : request);
1426 :
1427 80 : scantuplesortstates = leaderparticipates ? request + 1 : request;
1428 :
1429 : /*
1430 : * Prepare for scan of the base relation. In a normal index build, we use
1431 : * SnapshotAny because we must retrieve all tuples and do our own time
1432 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1433 : * concurrent build, we take a regular MVCC snapshot and index whatever's
1434 : * live according to that.
1435 : */
1436 80 : if (!isconcurrent)
1437 80 : snapshot = SnapshotAny;
1438 : else
1439 0 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
1440 :
1441 : /*
1442 : * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1443 : * PARALLEL_KEY_TUPLESORT tuplesort workspace
1444 : */
1445 80 : estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1446 80 : shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1447 80 : estsort = tuplesort_estimate_shared(scantuplesortstates);
1448 80 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1449 :
1450 : /*
1451 : * Unique case requires a second spool, and so we may have to account for
1452 : * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1453 : */
1454 80 : if (!btspool->isunique)
1455 45 : shm_toc_estimate_keys(&pcxt->estimator, 2);
1456 : else
1457 : {
1458 35 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1459 35 : shm_toc_estimate_keys(&pcxt->estimator, 3);
1460 : }
1461 :
1462 : /*
1463 : * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1464 : * and PARALLEL_KEY_BUFFER_USAGE.
1465 : *
1466 : * If there are no extensions loaded that care, we could skip this. We
1467 : * have no way of knowing whether anyone's looking at pgWalUsage or
1468 : * pgBufferUsage, so do it unconditionally.
1469 : */
1470 80 : shm_toc_estimate_chunk(&pcxt->estimator,
1471 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1472 80 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1473 80 : shm_toc_estimate_chunk(&pcxt->estimator,
1474 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1475 80 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1476 :
1477 : /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1478 80 : if (debug_query_string)
1479 : {
1480 80 : querylen = strlen(debug_query_string);
1481 80 : shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1482 80 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1483 : }
1484 : else
1485 0 : querylen = 0; /* keep compiler quiet */
1486 :
1487 : /* Everyone's had a chance to ask for space, so now create the DSM */
1488 80 : InitializeParallelDSM(pcxt);
1489 :
1490 : /* If no DSM segment was available, back out (do serial build) */
1491 80 : if (pcxt->seg == NULL)
1492 : {
1493 0 : if (IsMVCCSnapshot(snapshot))
1494 0 : UnregisterSnapshot(snapshot);
1495 0 : DestroyParallelContext(pcxt);
1496 0 : ExitParallelMode();
1497 0 : return;
1498 : }
1499 :
1500 : /* Store shared build state, for which we reserved space */
1501 80 : btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1502 : /* Initialize immutable state */
1503 80 : btshared->heaprelid = RelationGetRelid(btspool->heap);
1504 80 : btshared->indexrelid = RelationGetRelid(btspool->index);
1505 80 : btshared->isunique = btspool->isunique;
1506 80 : btshared->nulls_not_distinct = btspool->nulls_not_distinct;
1507 80 : btshared->isconcurrent = isconcurrent;
1508 80 : btshared->scantuplesortstates = scantuplesortstates;
1509 80 : btshared->queryid = pgstat_get_my_query_id();
1510 80 : ConditionVariableInit(&btshared->workersdonecv);
1511 80 : SpinLockInit(&btshared->mutex);
1512 : /* Initialize mutable state */
1513 80 : btshared->nparticipantsdone = 0;
1514 80 : btshared->reltuples = 0.0;
1515 80 : btshared->havedead = false;
1516 80 : btshared->indtuples = 0.0;
1517 80 : btshared->brokenhotchain = false;
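: /*
: * The parallel scan descriptor lives in this same chunk, directly
: * after the BTShared struct itself; see ParallelTableScanFromBTShared()
: * and _bt_parallel_estimate_shared().
: */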
1518 80 : table_parallelscan_initialize(btspool->heap,
1519 : ParallelTableScanFromBTShared(btshared),
1520 : snapshot);
1521 :
1522 : /*
1523 : * Store shared tuplesort-private state, for which we reserved space.
1524 : * Then, initialize opaque state using tuplesort routine.
1525 : */
1526 80 : sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1527 80 : tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1528 : pcxt->seg);
1529 :
1530 80 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1531 80 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1532 :
1533 : /* The unique case requires a second spool, and associated shared state */
1534 80 : if (!btspool->isunique)
1535 45 : sharedsort2 = NULL;
1536 : else
1537 : {
1538 : /*
1539 : * Store additional shared tuplesort-private state, for which we
1540 : * reserved space. Then, initialize opaque state using tuplesort
1541 : * routine.
1542 : */
1543 35 : sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1544 35 : tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1545 : pcxt->seg);
1546 :
1547 35 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1548 : }
1549 :
1550 : /* Store query string for workers */
1551 80 : if (debug_query_string)
1552 : {
1553 : char *sharedquery;
1554 :
1555 80 : sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1556 80 : memcpy(sharedquery, debug_query_string, querylen + 1);
1557 80 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1558 : }
1559 :
1560 : /*
1561 : * Allocate space for each worker's WalUsage and BufferUsage; no need to
1562 : * initialize.
1563 : */
1564 80 : walusage = shm_toc_allocate(pcxt->toc,
1565 80 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1566 80 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1567 80 : bufferusage = shm_toc_allocate(pcxt->toc,
1568 80 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1569 80 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1570 :
1571 : /* Launch workers, saving status for leader/caller */
1572 80 : LaunchParallelWorkers(pcxt);
1573 80 : btleader->pcxt = pcxt;
1574 80 : btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1575 80 : if (leaderparticipates)
1576 80 : btleader->nparticipanttuplesorts++;
1577 80 : btleader->btshared = btshared;
1578 80 : btleader->sharedsort = sharedsort;
1579 80 : btleader->sharedsort2 = sharedsort2;
1580 80 : btleader->snapshot = snapshot;
1581 80 : btleader->walusage = walusage;
1582 80 : btleader->bufferusage = bufferusage;
1583 :
1584 : /* If no workers were successfully launched, back out (do serial build) */
1585 80 : if (pcxt->nworkers_launched == 0)
1586 : {
1587 0 : _bt_end_parallel(btleader);
1588 0 : return;
1589 : }
1590 :
1591 : /* Save leader state now that it's clear build will be parallel */
1592 80 : buildstate->btleader = btleader;
1593 :
1594 : /* Join heap scan ourselves */
1595 80 : if (leaderparticipates)
1596 80 : _bt_leader_participate_as_worker(buildstate);
1597 :
1598 : /*
1599 : * Caller needs to wait for all launched workers when we return. Make
1600 : * sure that the failure-to-start case will not hang forever.
1601 : */
1602 80 : WaitForParallelWorkersToAttach(pcxt);
1603 : }
1604 :
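: /*
: * In rough outline -- a sketch, not a verbatim call site -- the
: * leader-side flow around the routines in this file is:
: *
: *	_bt_begin_parallel(buildstate, isconcurrent, request);
: *	if (buildstate->btleader)
: *	{
: *		reltuples = _bt_parallel_heapscan(buildstate, &brokenhotchain);
: *		... sort, then load the merged output into the new index ...
: *		_bt_end_parallel(buildstate->btleader);
: *	}
: *	else
: *		... fall back to a serial build ...
: */
: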
1605 : /*
1606 : * Shut down workers, destroy parallel context, and end parallel mode.
1607 : */
1608 : static void
1609 80 : _bt_end_parallel(BTLeader *btleader)
1610 : {
1611 : int i;
1612 :
1613 : /* Shutdown worker processes */
1614 80 : WaitForParallelWorkersToFinish(btleader->pcxt);
1615 :
1616 : /*
1617 : * Next, accumulate WAL and buffer usage. (This must wait for the workers
1618 : * to finish, or we might get incomplete data.)
1619 : */
1620 163 : for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1621 83 : InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1622 :
1623 : /* Free last reference to MVCC snapshot, if one was used */
1624 80 : if (IsMVCCSnapshot(btleader->snapshot))
1625 0 : UnregisterSnapshot(btleader->snapshot);
1626 80 : DestroyParallelContext(btleader->pcxt);
1627 80 : ExitParallelMode();
1628 80 : }
1629 :
1630 : /*
1631 : * Returns size of shared memory required to store state for a parallel
1632 : * btree index build based on the snapshot its parallel scan will use.
1633 : */
1634 : static Size
1635 80 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1636 : {
1637 : /* cf. shm_toc_allocate as to why BUFFERALIGN is used */
1638 80 : return add_size(BUFFERALIGN(sizeof(BTShared)),
1639 : table_parallelscan_estimate(heap, snapshot));
1640 : }
1641 :
1642 : /*
1643 : * Within leader, wait for end of heap scan.
1644 : *
1645 : * When called, the parallel heap scan started by _bt_begin_parallel() will
1646 : * already be underway within worker processes (when the leader participates
1647 : * as a worker, we should end up here just as the workers are finishing).
1648 : *
1649 : * Fills in fields needed for ambuild statistics, and lets caller set
1650 : * field indicating that some worker encountered a broken HOT chain.
1651 : *
1652 : * Returns the total number of heap tuples scanned.
1653 : */
1654 : static double
1655 80 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1656 : {
1657 80 : BTShared *btshared = buildstate->btleader->btshared;
1658 : int nparticipanttuplesorts;
1659 : double reltuples;
1660 :
1661 80 : nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
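:
: /*
: * Sleep on workersdonecv until every participant (launched workers,
: * plus the leader when it participates) has bumped nparticipantsdone.
: * The counter is only examined under the spinlock, and is rechecked
: * after each wakeup, since condition variable sleeps can wake spuriously.
: */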
1662 : for (;;)
1663 : {
1664 213 : SpinLockAcquire(&btshared->mutex);
1665 213 : if (btshared->nparticipantsdone == nparticipanttuplesorts)
1666 : {
1667 80 : buildstate->havedead = btshared->havedead;
1668 80 : buildstate->indtuples = btshared->indtuples;
1669 80 : *brokenhotchain = btshared->brokenhotchain;
1670 80 : reltuples = btshared->reltuples;
1671 80 : SpinLockRelease(&btshared->mutex);
1672 80 : break;
1673 : }
1674 133 : SpinLockRelease(&btshared->mutex);
1675 :
1676 133 : ConditionVariableSleep(&btshared->workersdonecv,
1677 : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1678 : }
1679 :
1680 80 : ConditionVariableCancelSleep();
1681 :
1682 80 : return reltuples;
1683 : }
1684 :
1685 : /*
1686 : * Within leader, participate as a parallel worker.
1687 : */
1688 : static void
1689 80 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
1690 : {
1691 80 : BTLeader *btleader = buildstate->btleader;
1692 : BTSpool *leaderworker;
1693 : BTSpool *leaderworker2;
1694 : int sortmem;
1695 :
1696 : /* Allocate memory and initialize private spool */
1697 80 : leaderworker = palloc0_object(BTSpool);
1698 80 : leaderworker->heap = buildstate->spool->heap;
1699 80 : leaderworker->index = buildstate->spool->index;
1700 80 : leaderworker->isunique = buildstate->spool->isunique;
1701 80 : leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1702 :
1703 : /* Initialize second spool, if required */
1704 80 : if (!btleader->btshared->isunique)
1705 45 : leaderworker2 = NULL;
1706 : else
1707 : {
1708 : /* Allocate memory for worker's own private secondary spool */
1709 35 : leaderworker2 = palloc0_object(BTSpool);
1710 :
1711 : /* Initialize worker's own secondary spool */
1712 35 : leaderworker2->heap = leaderworker->heap;
1713 35 : leaderworker2->index = leaderworker->index;
1714 35 : leaderworker2->isunique = false;
1715 : }
1716 :
1717 : /*
1718 : * Might as well use a reliable figure when doling out maintenance_work_mem
1719 : * (when the requested number of workers was not launched, this will be
1720 : * somewhat higher than it is for the other workers).
1721 : */
1722 80 : sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
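: /*
: * For example, with maintenance_work_mem = 65536kB and two launched
: * workers plus the participating leader, each of the three participants
: * sorts with 65536 / 3 = 21845kB.
: */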
1723 :
1724 : /* Perform work common to all participants */
1725 80 : _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1726 : btleader->sharedsort, btleader->sharedsort2,
1727 : sortmem, true);
1728 :
1729 : #ifdef BTREE_BUILD_STATS
1730 : if (log_btree_build_stats)
1731 : {
1732 : ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1733 : ResetUsage();
1734 : }
1735 : #endif /* BTREE_BUILD_STATS */
1736 80 : }
1737 :
1738 : /*
1739 : * Perform work within a launched parallel process.
1740 : */
1741 : void
1742 83 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1743 : {
1744 : char *sharedquery;
1745 : BTSpool *btspool;
1746 : BTSpool *btspool2;
1747 : BTShared *btshared;
1748 : Sharedsort *sharedsort;
1749 : Sharedsort *sharedsort2;
1750 : Relation heapRel;
1751 : Relation indexRel;
1752 : LOCKMODE heapLockmode;
1753 : LOCKMODE indexLockmode;
1754 : WalUsage *walusage;
1755 : BufferUsage *bufferusage;
1756 : int sortmem;
1757 :
1758 : #ifdef BTREE_BUILD_STATS
1759 : if (log_btree_build_stats)
1760 : ResetUsage();
1761 : #endif /* BTREE_BUILD_STATS */
1762 :
1763 : /*
1764 : * The only status flag that can possibly be set for this parallel worker
1765 : * is PROC_IN_SAFE_IC.
1766 : */
1767 : Assert((MyProc->statusFlags == 0) ||
1768 : (MyProc->statusFlags == PROC_IN_SAFE_IC));
1769 :
1770 : /* Set debug_query_string for individual workers first (may be NULL) */
1771 83 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1772 83 : debug_query_string = sharedquery;
1773 :
1774 : /* Report the query string from leader */
1775 83 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
1776 :
1777 : /* Look up nbtree shared state */
1778 83 : btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1779 :
1780 : /* Open relations using lock modes known to be obtained by index.c */
1781 83 : if (!btshared->isconcurrent)
1782 : {
1783 83 : heapLockmode = ShareLock;
1784 83 : indexLockmode = AccessExclusiveLock;
1785 : }
1786 : else
1787 : {
1788 0 : heapLockmode = ShareUpdateExclusiveLock;
1789 0 : indexLockmode = RowExclusiveLock;
1790 : }
1791 :
1792 : /* Track query ID */
1793 83 : pgstat_report_query_id(btshared->queryid, false);
1794 :
1795 : /* Open relations within worker */
1796 83 : heapRel = table_open(btshared->heaprelid, heapLockmode);
1797 83 : indexRel = index_open(btshared->indexrelid, indexLockmode);
1798 :
1799 : /* Initialize worker's own spool */
1800 83 : btspool = palloc0_object(BTSpool);
1801 83 : btspool->heap = heapRel;
1802 83 : btspool->index = indexRel;
1803 83 : btspool->isunique = btshared->isunique;
1804 83 : btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1805 :
1806 : /* Look up shared state private to tuplesort.c */
1807 83 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1808 83 : tuplesort_attach_shared(sharedsort, seg);
1809 83 : if (!btshared->isunique)
1810 : {
1811 48 : btspool2 = NULL;
1812 48 : sharedsort2 = NULL;
1813 : }
1814 : else
1815 : {
1816 : /* Allocate memory for worker's own private secondary spool */
1817 35 : btspool2 = palloc0_object(BTSpool);
1818 :
1819 : /* Initialize worker's own secondary spool */
1820 35 : btspool2->heap = btspool->heap;
1821 35 : btspool2->index = btspool->index;
1822 35 : btspool2->isunique = false;
1823 : /* Look up shared state private to tuplesort.c */
1824 35 : sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1825 35 : tuplesort_attach_shared(sharedsort2, seg);
1826 : }
1827 :
1828 : /* Prepare to track buffer usage during parallel execution */
1829 83 : InstrStartParallelQuery();
1830 :
1831 : /* Perform sorting of spool, and possibly a spool2 */
1832 83 : sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1833 83 : _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1834 : sharedsort2, sortmem, false);
1835 :
1836 : /* Report WAL/buffer usage during parallel execution */
1837 83 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1838 83 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1839 83 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1840 83 : &walusage[ParallelWorkerNumber]);
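:
: /*
: * Each worker writes to its own slot, indexed by ParallelWorkerNumber;
: * the leader accumulates all slots in _bt_end_parallel().
: */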
1841 :
1842 : #ifdef BTREE_BUILD_STATS
1843 : if (log_btree_build_stats)
1844 : {
1845 : ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1846 : ResetUsage();
1847 : }
1848 : #endif /* BTREE_BUILD_STATS */
1849 :
1850 83 : index_close(indexRel, indexLockmode);
1851 83 : table_close(heapRel, heapLockmode);
1852 83 : }
1853 :
1854 : /*
1855 : * Perform a worker's portion of a parallel sort.
1856 : *
1857 : * This generates a tuplesort for the passed btspool, and a second tuplesort
1858 : * state if a second btspool is needed (i.e. for unique index builds). All
1859 : * other spool fields should already be set when this is called.
1860 : *
1861 : * sortmem is the amount of working memory to use within each worker,
1862 : * expressed in kilobytes.
1863 : *
1864 : * When this returns, workers are done, and need only release resources.
1865 : */
1866 : static void
1867 163 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1868 : BTShared *btshared, Sharedsort *sharedsort,
1869 : Sharedsort *sharedsort2, int sortmem, bool progress)
1870 : {
1871 : SortCoordinate coordinate;
1872 : BTBuildState buildstate;
1873 : TableScanDesc scan;
1874 : double reltuples;
1875 : IndexInfo *indexInfo;
1876 :
1877 : /* Initialize local tuplesort coordination state */
1878 163 : coordinate = palloc0_object(SortCoordinateData);
1879 163 : coordinate->isWorker = true;
1880 163 : coordinate->nParticipants = -1;
1881 163 : coordinate->sharedsort = sharedsort;
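: /*
: * isWorker = true with nParticipants = -1 marks this as worker-style
: * coordination state; only the leader's final merge pass supplies a
: * real participant count to tuplesort.
: */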
1882 :
1883 : /* Begin "partial" tuplesort */
1884 326 : btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1885 : btspool->index,
1886 163 : btspool->isunique,
1887 163 : btspool->nulls_not_distinct,
1888 : sortmem, coordinate,
1889 : TUPLESORT_NONE);
1890 :
1891 : /*
1892 : * Just as in the serial case, there may be a second spool. If so, a
1893 : * second, dedicated spool2 partial tuplesort is required.
1894 : */
1895 163 : if (btspool2)
1896 : {
1897 : SortCoordinate coordinate2;
1898 :
1899 : /*
1900 : * We expect that the second one (for dead tuples) won't get very
1901 : * full, so we give it only work_mem (unless sortmem is less for this
1902 : * worker). Worker processes are generally permitted to allocate
1903 : * work_mem independently.
1904 : */
1905 70 : coordinate2 = palloc0_object(SortCoordinateData);
1906 70 : coordinate2->isWorker = true;
1907 70 : coordinate2->nParticipants = -1;
1908 70 : coordinate2->sharedsort = sharedsort2;
1909 70 : btspool2->sortstate =
1910 70 : tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1911 : Min(sortmem, work_mem), coordinate2,
1912 : TUPLESORT_NONE);
1913 : }
1914 :
1915 : /* Fill in buildstate for _bt_build_callback() */
1916 163 : buildstate.isunique = btshared->isunique;
1917 163 : buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
1918 163 : buildstate.havedead = false;
1919 163 : buildstate.heap = btspool->heap;
1920 163 : buildstate.spool = btspool;
1921 163 : buildstate.spool2 = btspool2;
1922 163 : buildstate.indtuples = 0;
1923 163 : buildstate.btleader = NULL;
1924 :
1925 : /* Join parallel scan */
1926 163 : indexInfo = BuildIndexInfo(btspool->index);
1927 163 : indexInfo->ii_Concurrent = btshared->isconcurrent;
1928 163 : scan = table_beginscan_parallel(btspool->heap,
1929 : ParallelTableScanFromBTShared(btshared));
1930 163 : reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1931 : true, progress, _bt_build_callback,
1932 : &buildstate, scan);
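:
: /*
: * reltuples is only this participant's share of the heap; each
: * participant adds its share to btshared->reltuples below, and the
: * leader reads the grand total in _bt_parallel_heapscan().
: */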
1933 :
1934 : /* Execute this worker's part of the sort */
1935 163 : if (progress)
1936 80 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1937 : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
1938 163 : tuplesort_performsort(btspool->sortstate);
1939 163 : if (btspool2)
1940 : {
1941 70 : if (progress)
1942 35 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1943 : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
1944 70 : tuplesort_performsort(btspool2->sortstate);
1945 : }
1946 :
1947 : /*
1948 : * Done. Record ambuild statistics, and whether we encountered a broken
1949 : * HOT chain.
1950 : */
1951 163 : SpinLockAcquire(&btshared->mutex);
1952 163 : btshared->nparticipantsdone++;
1953 163 : btshared->reltuples += reltuples;
1954 163 : if (buildstate.havedead)
1955 0 : btshared->havedead = true;
1956 163 : btshared->indtuples += buildstate.indtuples;
1957 163 : if (indexInfo->ii_BrokenHotChain)
1958 0 : btshared->brokenhotchain = true;
1959 163 : SpinLockRelease(&btshared->mutex);
1960 :
1961 : /* Notify leader */
1962 163 : ConditionVariableSignal(&btshared->workersdonecv);
1963 :
1964 : /* We can end tuplesorts immediately */
1965 163 : tuplesort_end(btspool->sortstate);
1966 163 : if (btspool2)
1967 70 : tuplesort_end(btspool2->sortstate);
1968 163 : }
|