Line data Source code
1 : /*
2 : * brin.c
3 : * Implementation of BRIN indexes for Postgres
4 : *
5 : * See src/backend/access/brin/README for details.
6 : *
7 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/brin/brin.c
12 : *
13 : * TODO
14 : * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
15 : */
16 : #include "postgres.h"
17 :
18 : #include "access/brin.h"
19 : #include "access/brin_page.h"
20 : #include "access/brin_pageops.h"
21 : #include "access/brin_xlog.h"
22 : #include "access/relation.h"
23 : #include "access/reloptions.h"
24 : #include "access/relscan.h"
25 : #include "access/table.h"
26 : #include "access/tableam.h"
27 : #include "access/xloginsert.h"
28 : #include "catalog/index.h"
29 : #include "catalog/pg_am.h"
30 : #include "commands/vacuum.h"
31 : #include "miscadmin.h"
32 : #include "pgstat.h"
33 : #include "postmaster/autovacuum.h"
34 : #include "storage/bufmgr.h"
35 : #include "storage/freespace.h"
36 : #include "tcop/tcopprot.h" /* pgrminclude ignore */
37 : #include "utils/acl.h"
38 : #include "utils/datum.h"
39 : #include "utils/fmgrprotos.h"
40 : #include "utils/guc.h"
41 : #include "utils/index_selfuncs.h"
42 : #include "utils/memutils.h"
43 : #include "utils/rel.h"
44 : #include "utils/tuplesort.h"
45 :
46 : /* Magic numbers for parallel state sharing */
47 : #define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)
48 : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
49 : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
50 : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
51 : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
52 :
53 : /*
54 : * Status for index builds performed in parallel. This is allocated in a
55 : * dynamic shared memory segment.
56 : */
57 : typedef struct BrinShared
58 : {
59 : /*
60 : * These fields are not modified during the build. They primarily exist
61 : * for the benefit of worker processes that need to create state
62 : * corresponding to that used by the leader.
63 : */
64 : Oid heaprelid;
65 : Oid indexrelid;
66 : bool isconcurrent;
67 : BlockNumber pagesPerRange;
68 : int scantuplesortstates;
69 :
70 : /* Query ID, for report in worker processes */
71 : uint64 queryid;
72 :
73 : /*
74 : * workersdonecv is used to monitor the progress of workers. All parallel
75 : * participants must indicate that they are done before leader can use
76 : * results built by the workers (and before leader can write the data into
77 : * the index).
78 : */
79 : ConditionVariable workersdonecv;
80 :
81 : /*
82 : * mutex protects all fields before heapdesc.
83 : *
84 : * These fields contain status information of interest to BRIN index
85 : * builds that must work just the same when an index is built in parallel.
86 : */
87 : slock_t mutex;
88 :
89 : /*
90 : * Mutable state that is maintained by workers, and reported back to
91 : * leader at end of the scans.
92 : *
93 : * nparticipantsdone is number of worker processes finished.
94 : *
95 : * reltuples is the total number of input heap tuples.
96 : *
97 : * indtuples is the total number of tuples that made it into the index.
98 : */
99 : int nparticipantsdone;
100 : double reltuples;
101 : double indtuples;
102 :
103 : /*
104 : * ParallelTableScanDescData data follows. Can't directly embed here, as
105 : * implementations of the parallel table scan desc interface might need
106 : * stronger alignment.
107 : */
108 : } BrinShared;
109 :
/*
 * Return pointer to a BrinShared's parallel table scan.
 *
 * The scan descriptor is located at the first BUFFERALIGN'ed offset past the
 * BrinShared struct itself.  cf. shm_toc_allocate as to why BUFFERALIGN is
 * used, rather than just MAXALIGN.
 *
 * The whole expansion is parenthesized so that the result can be combined
 * safely with postfix operators (e.g. "->") at the call site.
 */
#define ParallelTableScanFromBrinShared(shared) \
	((ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared))))
118 :
119 : /*
120 : * Status for leader in parallel index build.
121 : */
122 : typedef struct BrinLeader
123 : {
124 : /* parallel context itself */
125 : ParallelContext *pcxt;
126 :
127 : /*
128 : * nparticipanttuplesorts is the exact number of worker processes
129 : * successfully launched, plus one leader process if it participates as a
130 : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
131 : * participating as a worker).
132 : */
133 : int nparticipanttuplesorts;
134 :
135 : /*
136 : * Leader process convenience pointers to shared state (leader avoids TOC
137 : * lookups).
138 : *
139 : * brinshared is the shared state for entire build. sharedsort is the
140 : * shared, tuplesort-managed state passed to each process tuplesort.
141 : * snapshot is the snapshot used by the scan iff an MVCC snapshot is
142 : * required.
143 : */
144 : BrinShared *brinshared;
145 : Sharedsort *sharedsort;
146 : Snapshot snapshot;
147 : WalUsage *walusage;
148 : BufferUsage *bufferusage;
149 : } BrinLeader;
150 :
151 : /*
152 : * We use a BrinBuildState during initial construction of a BRIN index.
153 : * The running state is kept in a BrinMemTuple.
154 : */
155 : typedef struct BrinBuildState
156 : {
157 : Relation bs_irel;
158 : double bs_numtuples;
159 : double bs_reltuples;
160 : Buffer bs_currentInsertBuf;
161 : BlockNumber bs_pagesPerRange;
162 : BlockNumber bs_currRangeStart;
163 : BlockNumber bs_maxRangeStart;
164 : BrinRevmap *bs_rmAccess;
165 : BrinDesc *bs_bdesc;
166 : BrinMemTuple *bs_dtuple;
167 :
168 : BrinTuple *bs_emptyTuple;
169 : Size bs_emptyTupleLen;
170 : MemoryContext bs_context;
171 :
172 : /*
173 : * bs_leader is only present when a parallel index build is performed, and
174 : * only in the leader process. (Actually, only the leader process has a
175 : * BrinBuildState.)
176 : */
177 : BrinLeader *bs_leader;
178 : int bs_worker_id;
179 :
180 : /*
181 : * The sortstate is used by workers (including the leader). It has to be
182 : * part of the build state, because that's the only thing passed to the
183 : * build callback etc.
184 : */
185 : Tuplesortstate *bs_sortstate;
186 : } BrinBuildState;
187 :
188 : /*
189 : * We use a BrinInsertState to capture running state spanning multiple
190 : * brininsert invocations, within the same command.
191 : */
192 : typedef struct BrinInsertState
193 : {
194 : BrinRevmap *bis_rmAccess;
195 : BrinDesc *bis_desc;
196 : BlockNumber bis_pages_per_range;
197 : } BrinInsertState;
198 :
199 : /*
200 : * Struct used as "opaque" during index scans
201 : */
202 : typedef struct BrinOpaque
203 : {
204 : BlockNumber bo_pagesPerRange;
205 : BrinRevmap *bo_rmAccess;
206 : BrinDesc *bo_bdesc;
207 : } BrinOpaque;
208 :
209 : #define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
210 :
211 : static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
212 : BrinRevmap *revmap,
213 : BlockNumber pagesPerRange,
214 : BlockNumber tablePages);
215 : static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
216 : static void terminate_brin_buildstate(BrinBuildState *state);
217 : static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
218 : bool include_partial, double *numSummarized, double *numExisting);
219 : static void form_and_insert_tuple(BrinBuildState *state);
220 : static void form_and_spill_tuple(BrinBuildState *state);
221 : static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
222 : BrinTuple *b);
223 : static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
224 : static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
225 : BrinMemTuple *dtup, const Datum *values, const bool *nulls);
226 : static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
227 : static void brin_fill_empty_ranges(BrinBuildState *state,
228 : BlockNumber prevRange, BlockNumber nextRange);
229 :
230 : /* parallel index builds */
231 : static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
232 : bool isconcurrent, int request);
233 : static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
234 : static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
235 : static double _brin_parallel_heapscan(BrinBuildState *state);
236 : static double _brin_parallel_merge(BrinBuildState *state);
237 : static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,
238 : Relation heap, Relation index);
239 : static void _brin_parallel_scan_and_build(BrinBuildState *state,
240 : BrinShared *brinshared,
241 : Sharedsort *sharedsort,
242 : Relation heap, Relation index,
243 : int sortmem, bool progress);
244 :
245 : /*
246 : * BRIN handler function: return IndexAmRoutine with access method parameters
247 : * and callbacks.
248 : */
249 : Datum
250 2234 : brinhandler(PG_FUNCTION_ARGS)
251 : {
252 2234 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
253 :
254 2234 : amroutine->amstrategies = 0;
255 2234 : amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
256 2234 : amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;
257 2234 : amroutine->amcanorder = false;
258 2234 : amroutine->amcanorderbyop = false;
259 2234 : amroutine->amcanbackward = false;
260 2234 : amroutine->amcanunique = false;
261 2234 : amroutine->amcanmulticol = true;
262 2234 : amroutine->amoptionalkey = true;
263 2234 : amroutine->amsearcharray = false;
264 2234 : amroutine->amsearchnulls = true;
265 2234 : amroutine->amstorage = true;
266 2234 : amroutine->amclusterable = false;
267 2234 : amroutine->ampredlocks = false;
268 2234 : amroutine->amcanparallel = false;
269 2234 : amroutine->amcanbuildparallel = true;
270 2234 : amroutine->amcaninclude = false;
271 2234 : amroutine->amusemaintenanceworkmem = false;
272 2234 : amroutine->amsummarizing = true;
273 2234 : amroutine->amparallelvacuumoptions =
274 : VACUUM_OPTION_PARALLEL_CLEANUP;
275 2234 : amroutine->amkeytype = InvalidOid;
276 :
277 2234 : amroutine->ambuild = brinbuild;
278 2234 : amroutine->ambuildempty = brinbuildempty;
279 2234 : amroutine->aminsert = brininsert;
280 2234 : amroutine->aminsertcleanup = brininsertcleanup;
281 2234 : amroutine->ambulkdelete = brinbulkdelete;
282 2234 : amroutine->amvacuumcleanup = brinvacuumcleanup;
283 2234 : amroutine->amcanreturn = NULL;
284 2234 : amroutine->amcostestimate = brincostestimate;
285 2234 : amroutine->amgettreeheight = NULL;
286 2234 : amroutine->amoptions = brinoptions;
287 2234 : amroutine->amproperty = NULL;
288 2234 : amroutine->ambuildphasename = NULL;
289 2234 : amroutine->amvalidate = brinvalidate;
290 2234 : amroutine->amadjustmembers = NULL;
291 2234 : amroutine->ambeginscan = brinbeginscan;
292 2234 : amroutine->amrescan = brinrescan;
293 2234 : amroutine->amgettuple = NULL;
294 2234 : amroutine->amgetbitmap = bringetbitmap;
295 2234 : amroutine->amendscan = brinendscan;
296 2234 : amroutine->ammarkpos = NULL;
297 2234 : amroutine->amrestrpos = NULL;
298 2234 : amroutine->amestimateparallelscan = NULL;
299 2234 : amroutine->aminitparallelscan = NULL;
300 2234 : amroutine->amparallelrescan = NULL;
301 :
302 2234 : PG_RETURN_POINTER(amroutine);
303 : }
304 :
305 : /*
306 : * Initialize a BrinInsertState to maintain state to be used across multiple
307 : * tuple inserts, within the same command.
308 : */
309 : static BrinInsertState *
310 1090 : initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
311 : {
312 : BrinInsertState *bistate;
313 : MemoryContext oldcxt;
314 :
315 1090 : oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
316 1090 : bistate = palloc0(sizeof(BrinInsertState));
317 1090 : bistate->bis_desc = brin_build_desc(idxRel);
318 1090 : bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
319 : &bistate->bis_pages_per_range);
320 1090 : indexInfo->ii_AmCache = bistate;
321 1090 : MemoryContextSwitchTo(oldcxt);
322 :
323 1090 : return bistate;
324 : }
325 :
326 : /*
327 : * A tuple in the heap is being inserted. To keep a brin index up to date,
328 : * we need to obtain the relevant index tuple and compare its stored values
329 : * with those of the new tuple. If the tuple values are not consistent with
330 : * the summary tuple, we need to update the index tuple.
331 : *
332 : * If autosummarization is enabled, check if we need to summarize the previous
333 : * page range.
334 : *
335 : * If the range is not currently summarized (i.e. the revmap returns NULL for
336 : * it), there's nothing to do for this tuple.
337 : */
338 : bool
339 125926 : brininsert(Relation idxRel, Datum *values, bool *nulls,
340 : ItemPointer heaptid, Relation heapRel,
341 : IndexUniqueCheck checkUnique,
342 : bool indexUnchanged,
343 : IndexInfo *indexInfo)
344 : {
345 : BlockNumber pagesPerRange;
346 : BlockNumber origHeapBlk;
347 : BlockNumber heapBlk;
348 125926 : BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
349 : BrinRevmap *revmap;
350 : BrinDesc *bdesc;
351 125926 : Buffer buf = InvalidBuffer;
352 125926 : MemoryContext tupcxt = NULL;
353 125926 : MemoryContext oldcxt = CurrentMemoryContext;
354 125926 : bool autosummarize = BrinGetAutoSummarize(idxRel);
355 :
356 : /*
357 : * If first time through in this statement, initialize the insert state
358 : * that we keep for all the inserts in the command.
359 : */
360 125926 : if (!bistate)
361 1090 : bistate = initialize_brin_insertstate(idxRel, indexInfo);
362 :
363 125926 : revmap = bistate->bis_rmAccess;
364 125926 : bdesc = bistate->bis_desc;
365 125926 : pagesPerRange = bistate->bis_pages_per_range;
366 :
367 : /*
368 : * origHeapBlk is the block number where the insertion occurred. heapBlk
369 : * is the first block in the corresponding page range.
370 : */
371 125926 : origHeapBlk = ItemPointerGetBlockNumber(heaptid);
372 125926 : heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
373 :
374 : for (;;)
375 0 : {
376 125926 : bool need_insert = false;
377 : OffsetNumber off;
378 : BrinTuple *brtup;
379 : BrinMemTuple *dtup;
380 :
381 125926 : CHECK_FOR_INTERRUPTS();
382 :
383 : /*
384 : * If auto-summarization is enabled and we just inserted the first
385 : * tuple into the first block of a new non-first page range, request a
386 : * summarization run of the previous range.
387 : */
388 125926 : if (autosummarize &&
389 156 : heapBlk > 0 &&
390 156 : heapBlk == origHeapBlk &&
391 156 : ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
392 : {
393 8 : BlockNumber lastPageRange = heapBlk - 1;
394 : BrinTuple *lastPageTuple;
395 :
396 : lastPageTuple =
397 8 : brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
398 : NULL, BUFFER_LOCK_SHARE);
399 8 : if (!lastPageTuple)
400 : {
401 : bool recorded;
402 :
403 6 : recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
404 : RelationGetRelid(idxRel),
405 : lastPageRange);
406 6 : if (!recorded)
407 0 : ereport(LOG,
408 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
409 : errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
410 : RelationGetRelationName(idxRel),
411 : lastPageRange)));
412 : }
413 : else
414 2 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
415 : }
416 :
417 125926 : brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
418 : NULL, BUFFER_LOCK_SHARE);
419 :
420 : /* if range is unsummarized, there's nothing to do */
421 125926 : if (!brtup)
422 78120 : break;
423 :
424 : /* First time through in this brininsert call? */
425 47806 : if (tupcxt == NULL)
426 : {
427 47806 : tupcxt = AllocSetContextCreate(CurrentMemoryContext,
428 : "brininsert cxt",
429 : ALLOCSET_DEFAULT_SIZES);
430 47806 : MemoryContextSwitchTo(tupcxt);
431 : }
432 :
433 47806 : dtup = brin_deform_tuple(bdesc, brtup, NULL);
434 :
435 47806 : need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);
436 :
437 47806 : if (!need_insert)
438 : {
439 : /*
440 : * The tuple is consistent with the new values, so there's nothing
441 : * to do.
442 : */
443 23898 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
444 : }
445 : else
446 : {
447 23908 : Page page = BufferGetPage(buf);
448 23908 : ItemId lp = PageGetItemId(page, off);
449 : Size origsz;
450 : BrinTuple *origtup;
451 : Size newsz;
452 : BrinTuple *newtup;
453 : bool samepage;
454 :
455 : /*
456 : * Make a copy of the old tuple, so that we can compare it after
457 : * re-acquiring the lock.
458 : */
459 23908 : origsz = ItemIdGetLength(lp);
460 23908 : origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
461 :
462 : /*
463 : * Before releasing the lock, check if we can attempt a same-page
464 : * update. Another process could insert a tuple concurrently in
465 : * the same page though, so downstream we must be prepared to cope
466 : * if this turns out to not be possible after all.
467 : */
468 23908 : newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
469 23908 : samepage = brin_can_do_samepage_update(buf, origsz, newsz);
470 23908 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
471 :
472 : /*
473 : * Try to update the tuple. If this doesn't work for whatever
474 : * reason, we need to restart from the top; the revmap might be
475 : * pointing at a different tuple for this block now, so we need to
476 : * recompute to ensure both our new heap tuple and the other
477 : * inserter's are covered by the combined tuple. It might be that
478 : * we don't need to update at all.
479 : */
480 23908 : if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
481 : buf, off, origtup, origsz, newtup, newsz,
482 : samepage))
483 : {
484 : /* no luck; start over */
485 0 : MemoryContextReset(tupcxt);
486 0 : continue;
487 : }
488 : }
489 :
490 : /* success! */
491 47806 : break;
492 : }
493 :
494 125926 : if (BufferIsValid(buf))
495 47808 : ReleaseBuffer(buf);
496 125926 : MemoryContextSwitchTo(oldcxt);
497 125926 : if (tupcxt != NULL)
498 47806 : MemoryContextDelete(tupcxt);
499 :
500 125926 : return false;
501 : }
502 :
503 : /*
504 : * Callback to clean up the BrinInsertState once all tuple inserts are done.
505 : */
506 : void
507 1096 : brininsertcleanup(Relation index, IndexInfo *indexInfo)
508 : {
509 1096 : BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
510 :
511 : /* bail out if cache not initialized */
512 1096 : if (indexInfo->ii_AmCache == NULL)
513 6 : return;
514 :
515 : /*
516 : * Clean up the revmap. Note that the brinDesc has already been cleaned up
517 : * as part of its own memory context.
518 : */
519 1090 : brinRevmapTerminate(bistate->bis_rmAccess);
520 1090 : bistate->bis_rmAccess = NULL;
521 1090 : bistate->bis_desc = NULL;
522 : }
523 :
524 : /*
525 : * Initialize state for a BRIN index scan.
526 : *
527 : * We read the metapage here to determine the pages-per-range number that this
528 : * index was built with. Note that since this cannot be changed while we're
529 : * holding lock on index, it's not necessary to recompute it during brinrescan.
530 : */
531 : IndexScanDesc
532 2946 : brinbeginscan(Relation r, int nkeys, int norderbys)
533 : {
534 : IndexScanDesc scan;
535 : BrinOpaque *opaque;
536 :
537 2946 : scan = RelationGetIndexScan(r, nkeys, norderbys);
538 :
539 2946 : opaque = palloc_object(BrinOpaque);
540 2946 : opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
541 2946 : opaque->bo_bdesc = brin_build_desc(r);
542 2946 : scan->opaque = opaque;
543 :
544 2946 : return scan;
545 : }
546 :
547 : /*
548 : * Execute the index scan.
549 : *
550 : * This works by reading index TIDs from the revmap, and obtaining the index
551 : * tuples pointed to by them; the summary values in the index tuples are
552 : * compared to the scan keys. We return into the TID bitmap all the pages in
553 : * ranges corresponding to index tuples that match the scan keys.
554 : *
555 : * If a TID from the revmap is read as InvalidTID, we know that range is
556 : * unsummarized. Pages in those ranges need to be returned regardless of scan
557 : * keys.
558 : */
559 : int64
560 2946 : bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
561 : {
562 2946 : Relation idxRel = scan->indexRelation;
563 2946 : Buffer buf = InvalidBuffer;
564 : BrinDesc *bdesc;
565 : Oid heapOid;
566 : Relation heapRel;
567 : BrinOpaque *opaque;
568 : BlockNumber nblocks;
569 : BlockNumber heapBlk;
570 2946 : int totalpages = 0;
571 : FmgrInfo *consistentFn;
572 : MemoryContext oldcxt;
573 : MemoryContext perRangeCxt;
574 : BrinMemTuple *dtup;
575 2946 : BrinTuple *btup = NULL;
576 2946 : Size btupsz = 0;
577 : ScanKey **keys,
578 : **nullkeys;
579 : int *nkeys,
580 : *nnullkeys;
581 : char *ptr;
582 : Size len;
583 : char *tmp PG_USED_FOR_ASSERTS_ONLY;
584 :
585 2946 : opaque = (BrinOpaque *) scan->opaque;
586 2946 : bdesc = opaque->bo_bdesc;
587 2946 : pgstat_count_index_scan(idxRel);
588 :
589 : /*
590 : * We need to know the size of the table so that we know how long to
591 : * iterate on the revmap.
592 : */
593 2946 : heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
594 2946 : heapRel = table_open(heapOid, AccessShareLock);
595 2946 : nblocks = RelationGetNumberOfBlocks(heapRel);
596 2946 : table_close(heapRel, AccessShareLock);
597 :
598 : /*
599 : * Make room for the consistent support procedures of indexed columns. We
600 : * don't look them up here; we do that lazily the first time we see a scan
601 : * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
602 : */
603 2946 : consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);
604 :
605 : /*
606 : * Make room for per-attribute lists of scan keys that we'll pass to the
607 : * consistent support procedure. We don't know which attributes have scan
608 : * keys, so we allocate space for all attributes. That may use more memory
609 : * but it's probably cheaper than determining which attributes are used.
610 : *
611 : * We keep null and regular keys separate, so that we can pass just the
612 : * regular keys to the consistent function easily.
613 : *
614 : * To reduce the allocation overhead, we allocate one big chunk and then
615 : * carve it into smaller arrays ourselves. All the pieces have exactly the
616 : * same lifetime, so that's OK.
617 : *
618 : * XXX The widest index can have 32 attributes, so the amount of wasted
619 : * memory is negligible. We could invent a more compact approach (with
620 : * just space for used attributes) but that would make the matching more
621 : * complex so it's not a good trade-off.
622 : */
623 2946 : len =
624 2946 : MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */
625 2946 : MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
626 2946 : MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
627 2946 : MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */
628 2946 : MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
629 2946 : MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
630 :
631 2946 : ptr = palloc(len);
632 2946 : tmp = ptr;
633 :
634 2946 : keys = (ScanKey **) ptr;
635 2946 : ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
636 :
637 2946 : nullkeys = (ScanKey **) ptr;
638 2946 : ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
639 :
640 2946 : nkeys = (int *) ptr;
641 2946 : ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
642 :
643 2946 : nnullkeys = (int *) ptr;
644 2946 : ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
645 :
646 69978 : for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
647 : {
648 67032 : keys[i] = (ScanKey *) ptr;
649 67032 : ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
650 :
651 67032 : nullkeys[i] = (ScanKey *) ptr;
652 67032 : ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
653 : }
654 :
655 : Assert(tmp + len == ptr);
656 :
657 : /* zero the number of keys */
658 2946 : memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
659 2946 : memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
660 :
661 : /* Preprocess the scan keys - split them into per-attribute arrays. */
662 5892 : for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
663 : {
664 2946 : ScanKey key = &scan->keyData[keyno];
665 2946 : AttrNumber keyattno = key->sk_attno;
666 :
667 : /*
668 : * The collation of the scan key must match the collation used in the
669 : * index column (but only if the search is not IS NULL/ IS NOT NULL).
670 : * Otherwise we shouldn't be using this index ...
671 : */
672 : Assert((key->sk_flags & SK_ISNULL) ||
673 : (key->sk_collation ==
674 : TupleDescAttr(bdesc->bd_tupdesc,
675 : keyattno - 1)->attcollation));
676 :
677 : /*
678 : * First time we see this index attribute, so init as needed.
679 : *
680 : * This is a bit of an overkill - we don't know how many scan keys are
681 : * there for this attribute, so we simply allocate the largest number
682 : * possible (as if all keys were for this attribute). This may waste a
683 : * bit of memory, but we only expect small number of scan keys in
684 : * general, so this should be negligible, and repeated repalloc calls
685 : * are not free either.
686 : */
687 2946 : if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
688 : {
689 : FmgrInfo *tmp;
690 :
691 : /* First time we see this attribute, so no key/null keys. */
692 : Assert(nkeys[keyattno - 1] == 0);
693 : Assert(nnullkeys[keyattno - 1] == 0);
694 :
695 2946 : tmp = index_getprocinfo(idxRel, keyattno,
696 : BRIN_PROCNUM_CONSISTENT);
697 2946 : fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
698 : CurrentMemoryContext);
699 : }
700 :
701 : /* Add key to the proper per-attribute array. */
702 2946 : if (key->sk_flags & SK_ISNULL)
703 : {
704 36 : nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
705 36 : nnullkeys[keyattno - 1]++;
706 : }
707 : else
708 : {
709 2910 : keys[keyattno - 1][nkeys[keyattno - 1]] = key;
710 2910 : nkeys[keyattno - 1]++;
711 : }
712 : }
713 :
714 : /* allocate an initial in-memory tuple, out of the per-range memcxt */
715 2946 : dtup = brin_new_memtuple(bdesc);
716 :
717 : /*
718 : * Setup and use a per-range memory context, which is reset every time we
719 : * loop below. This avoids having to free the tuples within the loop.
720 : */
721 2946 : perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
722 : "bringetbitmap cxt",
723 : ALLOCSET_DEFAULT_SIZES);
724 2946 : oldcxt = MemoryContextSwitchTo(perRangeCxt);
725 :
726 : /*
727 : * Now scan the revmap. We start by querying for heap page 0,
728 : * incrementing by the number of pages per range; this gives us a full
729 : * view of the table.
730 : */
731 194598 : for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
732 : {
733 : bool addrange;
734 191652 : bool gottuple = false;
735 : BrinTuple *tup;
736 : OffsetNumber off;
737 : Size size;
738 :
739 191652 : CHECK_FOR_INTERRUPTS();
740 :
741 191652 : MemoryContextReset(perRangeCxt);
742 :
743 191652 : tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
744 : &off, &size, BUFFER_LOCK_SHARE);
745 191652 : if (tup)
746 : {
747 189936 : gottuple = true;
748 189936 : btup = brin_copy_tuple(tup, size, btup, &btupsz);
749 189936 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
750 : }
751 :
752 : /*
753 : * For page ranges with no indexed tuple, we must return the whole
754 : * range; otherwise, compare it to the scan keys.
755 : */
756 191652 : if (!gottuple)
757 : {
758 1716 : addrange = true;
759 : }
760 : else
761 : {
762 189936 : dtup = brin_deform_tuple(bdesc, btup, dtup);
763 189936 : if (dtup->bt_placeholder)
764 : {
765 : /*
766 : * Placeholder tuples are always returned, regardless of the
767 : * values stored in them.
768 : */
769 0 : addrange = true;
770 : }
771 : else
772 : {
773 : int attno;
774 :
775 : /*
776 : * Compare scan keys with summary values stored for the range.
777 : * If scan keys are matched, the page range must be added to
778 : * the bitmap. We initially assume the range needs to be
779 : * added; in particular this serves the case where there are
780 : * no keys.
781 : */
782 189936 : addrange = true;
783 4704072 : for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
784 : {
785 : BrinValues *bval;
786 : Datum add;
787 : Oid collation;
788 :
789 : /*
790 : * skip attributes without any scan keys (both regular and
791 : * IS [NOT] NULL)
792 : */
793 4567734 : if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
794 4377798 : continue;
795 :
796 189936 : bval = &dtup->bt_columns[attno - 1];
797 :
798 : /*
799 : * If the BRIN tuple indicates that this range is empty,
800 : * we can skip it: there's nothing to match. We don't
801 : * need to examine the next columns.
802 : */
803 189936 : if (dtup->bt_empty_range)
804 : {
805 0 : addrange = false;
806 0 : break;
807 : }
808 :
809 : /*
810 : * First check if there are any IS [NOT] NULL scan keys,
811 : * and if we're violating them. In that case we can
812 : * terminate early, without invoking the support function.
813 : *
814 : * As there may be more keys, we can only determine
815 : * mismatch within this loop.
816 : */
817 189936 : if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
818 189936 : !check_null_keys(bval, nullkeys[attno - 1],
819 189936 : nnullkeys[attno - 1]))
820 : {
821 : /*
822 : * If any of the IS [NOT] NULL keys failed, the page
823 : * range as a whole can't pass. So terminate the loop.
824 : */
825 996 : addrange = false;
826 996 : break;
827 : }
828 :
829 : /*
830 : * So either there are no IS [NOT] NULL keys, or all
831 : * passed. If there are no regular scan keys, we're done -
832 : * the page range matches. If there are regular keys, but
833 : * the page range is marked as 'all nulls' it can't
834 : * possibly pass (we're assuming the operators are
835 : * strict).
836 : */
837 :
838 : /* No regular scan keys - page range as a whole passes. */
839 188940 : if (!nkeys[attno - 1])
840 1236 : continue;
841 :
842 : Assert((nkeys[attno - 1] > 0) &&
843 : (nkeys[attno - 1] <= scan->numberOfKeys));
844 :
845 : /* If it is all nulls, it cannot possibly be consistent. */
846 187704 : if (bval->bv_allnulls)
847 : {
848 378 : addrange = false;
849 378 : break;
850 : }
851 :
852 : /*
853 : * Collation from the first key (has to be the same for
854 : * all keys for the same attribute).
855 : */
856 187326 : collation = keys[attno - 1][0]->sk_collation;
857 :
858 : /*
859 : * Check whether the scan key is consistent with the page
860 : * range values; if so, have the pages in the range added
861 : * to the output bitmap.
862 : *
863 : * The opclass may or may not support processing of
864 : * multiple scan keys. We can determine that based on the
865 : * number of arguments - functions with extra parameter
866 : * (number of scan keys) do support this, otherwise we
867 : * have to simply pass the scan keys one by one.
868 : */
869 187326 : if (consistentFn[attno - 1].fn_nargs >= 4)
870 : {
871 : /* Check all keys at once */
872 39594 : add = FunctionCall4Coll(&consistentFn[attno - 1],
873 : collation,
874 : PointerGetDatum(bdesc),
875 : PointerGetDatum(bval),
876 39594 : PointerGetDatum(keys[attno - 1]),
877 39594 : Int32GetDatum(nkeys[attno - 1]));
878 39594 : addrange = DatumGetBool(add);
879 : }
880 : else
881 : {
882 : /*
883 : * Check keys one by one
884 : *
885 : * When there are multiple scan keys, failure to meet
886 : * the criteria for a single one of them is enough to
887 : * discard the range as a whole, so break out of the
888 : * loop as soon as a false return value is obtained.
889 : */
890 : int keyno;
891 :
892 258078 : for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
893 : {
894 147732 : add = FunctionCall3Coll(&consistentFn[attno - 1],
895 147732 : keys[attno - 1][keyno]->sk_collation,
896 : PointerGetDatum(bdesc),
897 : PointerGetDatum(bval),
898 147732 : PointerGetDatum(keys[attno - 1][keyno]));
899 147732 : addrange = DatumGetBool(add);
900 147732 : if (!addrange)
901 37386 : break;
902 : }
903 : }
904 :
905 : /*
906 : * If we found a scan key eliminating the range, no need
907 : * to check additional ones.
908 : */
909 187326 : if (!addrange)
910 52224 : break;
911 : }
912 : }
913 : }
914 :
915 : /* add the pages in the range to the output bitmap, if needed */
916 191652 : if (addrange)
917 : {
918 : BlockNumber pageno;
919 :
920 138054 : for (pageno = heapBlk;
921 286044 : pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
922 147990 : pageno++)
923 : {
924 147990 : MemoryContextSwitchTo(oldcxt);
925 147990 : tbm_add_page(tbm, pageno);
926 147990 : totalpages++;
927 147990 : MemoryContextSwitchTo(perRangeCxt);
928 : }
929 : }
930 : }
931 :
932 2946 : MemoryContextSwitchTo(oldcxt);
933 2946 : MemoryContextDelete(perRangeCxt);
934 :
935 2946 : if (buf != InvalidBuffer)
936 2946 : ReleaseBuffer(buf);
937 :
938 : /*
939 : * XXX We have an approximation of the number of *pages* that our scan
940 : * returns, but we don't have a precise idea of the number of heap tuples
941 : * involved.
942 : */
943 2946 : return totalpages * 10;
944 : }
945 :
946 : /*
947 : * Re-initialize state for a BRIN index scan
948 : */
949 : void
950 2946 : brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
951 : ScanKey orderbys, int norderbys)
952 : {
953 : /*
954 : * Other index AMs preprocess the scan keys at this point, or sometime
955 : * early during the scan; this lets them optimize by removing redundant
956 : * keys, or doing early returns when they are impossible to satisfy; see
957 : * _bt_preprocess_keys for an example. Something like that could be added
958 : * here someday, too.
959 : */
960 :
961 2946 : if (scankey && scan->numberOfKeys > 0)
962 2946 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
963 2946 : }
964 :
965 : /*
966 : * Close down a BRIN index scan
967 : */
968 : void
969 2946 : brinendscan(IndexScanDesc scan)
970 : {
971 2946 : BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
972 :
973 2946 : brinRevmapTerminate(opaque->bo_rmAccess);
974 2946 : brin_free_desc(opaque->bo_bdesc);
975 2946 : pfree(opaque);
976 2946 : }
977 :
978 : /*
979 : * Per-heap-tuple callback for table_index_build_scan.
980 : *
981 : * Note we don't worry about the page range at the end of the table here; it is
982 : * present in the build state struct after we're called the last time, but not
983 : * inserted into the index. Caller must ensure to do so, if appropriate.
984 : */
985 : static void
986 728304 : brinbuildCallback(Relation index,
987 : ItemPointer tid,
988 : Datum *values,
989 : bool *isnull,
990 : bool tupleIsAlive,
991 : void *brstate)
992 : {
993 728304 : BrinBuildState *state = (BrinBuildState *) brstate;
994 : BlockNumber thisblock;
995 :
996 728304 : thisblock = ItemPointerGetBlockNumber(tid);
997 :
998 : /*
999 : * If we're in a block that belongs to a future range, summarize what
1000 : * we've got and start afresh. Note the scan might have skipped many
1001 : * pages, if they were devoid of live tuples; make sure to insert index
1002 : * tuples for those too.
1003 : */
1004 730600 : while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
1005 : {
1006 :
1007 : BRIN_elog((DEBUG2,
1008 : "brinbuildCallback: completed a range: %u--%u",
1009 : state->bs_currRangeStart,
1010 : state->bs_currRangeStart + state->bs_pagesPerRange));
1011 :
1012 : /* create the index tuple and insert it */
1013 2296 : form_and_insert_tuple(state);
1014 :
1015 : /* set state to correspond to the next range */
1016 2296 : state->bs_currRangeStart += state->bs_pagesPerRange;
1017 :
1018 : /* re-initialize state for it */
1019 2296 : brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1020 : }
1021 :
1022 : /* Accumulate the current tuple into the running state */
1023 728304 : (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1024 : values, isnull);
1025 728304 : }
1026 :
1027 : /*
1028 : * Per-heap-tuple callback for table_index_build_scan with parallelism.
1029 : *
1030 : * A version of the callback used by parallel index builds. The main difference
1031 : * is that instead of writing the BRIN tuples into the index, we write them
1032 : * into a shared tuplesort, and leave the insertion up to the leader (which may
1033 : * reorder them a bit etc.). The callback also does not generate empty ranges,
1034 : * those will be added by the leader when merging results from workers.
1035 : */
1036 : static void
1037 7962 : brinbuildCallbackParallel(Relation index,
1038 : ItemPointer tid,
1039 : Datum *values,
1040 : bool *isnull,
1041 : bool tupleIsAlive,
1042 : void *brstate)
1043 : {
1044 7962 : BrinBuildState *state = (BrinBuildState *) brstate;
1045 : BlockNumber thisblock;
1046 :
1047 7962 : thisblock = ItemPointerGetBlockNumber(tid);
1048 :
1049 : /*
1050 : * If we're in a block that belongs to a different range, summarize what
1051 : * we've got and start afresh. Note the scan might have skipped many
1052 : * pages, if they were devoid of live tuples; we do not create empty BRIN
1053 : * ranges here - the leader is responsible for filling them in.
1054 : *
1055 : * Unlike serial builds, parallel index builds allow synchronized seqscans
1056 : * (because that's what parallel scans do). This means the block may wrap
1057 : * around to the beginning of the relation, so the condition needs to
1058 : * check for both future and past ranges.
1059 : */
1060 7962 : if ((thisblock < state->bs_currRangeStart) ||
1061 7962 : (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))
1062 : {
1063 :
1064 : BRIN_elog((DEBUG2,
1065 : "brinbuildCallbackParallel: completed a range: %u--%u",
1066 : state->bs_currRangeStart,
1067 : state->bs_currRangeStart + state->bs_pagesPerRange));
1068 :
1069 : /* create the index tuple and write it into the tuplesort */
1070 38 : form_and_spill_tuple(state);
1071 :
1072 : /*
1073 : * Set state to correspond to the next range (for this block).
1074 : *
1075 : * This skips ranges that are either empty (and so we don't get any
1076 : * tuples to summarize), or processed by other workers. We can't
1077 : * differentiate those cases here easily, so we leave it up to the
1078 : * leader to fill empty ranges where needed.
1079 : */
1080 : state->bs_currRangeStart
1081 38 : = state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);
1082 :
1083 : /* re-initialize state for it */
1084 38 : brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1085 : }
1086 :
1087 : /* Accumulate the current tuple into the running state */
1088 7962 : (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1089 : values, isnull);
1090 7962 : }
1091 :
1092 : /*
1093 : * brinbuild() -- build a new BRIN index.
1094 : */
1095 : IndexBuildResult *
1096 344 : brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
1097 : {
1098 : IndexBuildResult *result;
1099 : double reltuples;
1100 : double idxtuples;
1101 : BrinRevmap *revmap;
1102 : BrinBuildState *state;
1103 : Buffer meta;
1104 : BlockNumber pagesPerRange;
1105 :
1106 : /*
1107 : * We expect to be called exactly once for any index relation.
1108 : */
1109 344 : if (RelationGetNumberOfBlocks(index) != 0)
1110 0 : elog(ERROR, "index \"%s\" already contains data",
1111 : RelationGetRelationName(index));
1112 :
1113 : /*
1114 : * Critical section not required, because on error the creation of the
1115 : * whole relation will be rolled back.
1116 : */
1117 :
1118 344 : meta = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL,
1119 : EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
1120 : Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
1121 :
1122 344 : brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
1123 : BRIN_CURRENT_VERSION);
1124 344 : MarkBufferDirty(meta);
1125 :
1126 344 : if (RelationNeedsWAL(index))
1127 : {
1128 : xl_brin_createidx xlrec;
1129 : XLogRecPtr recptr;
1130 : Page page;
1131 :
1132 178 : xlrec.version = BRIN_CURRENT_VERSION;
1133 178 : xlrec.pagesPerRange = BrinGetPagesPerRange(index);
1134 :
1135 178 : XLogBeginInsert();
1136 178 : XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
1137 178 : XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);
1138 :
1139 178 : recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
1140 :
1141 178 : page = BufferGetPage(meta);
1142 178 : PageSetLSN(page, recptr);
1143 : }
1144 :
1145 344 : UnlockReleaseBuffer(meta);
1146 :
1147 : /*
1148 : * Initialize our state, including the deformed tuple state.
1149 : */
1150 344 : revmap = brinRevmapInitialize(index, &pagesPerRange);
1151 344 : state = initialize_brin_buildstate(index, revmap, pagesPerRange,
1152 : RelationGetNumberOfBlocks(heap));
1153 :
1154 : /*
1155 : * Attempt to launch parallel worker scan when required
1156 : *
1157 : * XXX plan_create_index_workers makes the number of workers dependent on
1158 : * maintenance_work_mem, requiring 32MB for each worker. That makes sense
1159 : * for btree, but not for BRIN, which can do with much less memory. So
1160 : * maybe make that somehow less strict, optionally?
1161 : */
1162 344 : if (indexInfo->ii_ParallelWorkers > 0)
1163 10 : _brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
1164 : indexInfo->ii_ParallelWorkers);
1165 :
1166 : /*
1167 : * If parallel build requested and at least one worker process was
1168 : * successfully launched, set up coordination state, wait for workers to
1169 : * complete. Then read all tuples from the shared tuplesort and insert
1170 : * them into the index.
1171 : *
1172 : * In serial mode, simply scan the table and build the index one index
1173 : * tuple at a time.
1174 : */
1175 344 : if (state->bs_leader)
1176 : {
1177 : SortCoordinate coordinate;
1178 :
1179 8 : coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
1180 8 : coordinate->isWorker = false;
1181 8 : coordinate->nParticipants =
1182 8 : state->bs_leader->nparticipanttuplesorts;
1183 8 : coordinate->sharedsort = state->bs_leader->sharedsort;
1184 :
1185 : /*
1186 : * Begin leader tuplesort.
1187 : *
1188 : * In cases where parallelism is involved, the leader receives the
1189 : * same share of maintenance_work_mem as a serial sort (it is
1190 : * generally treated in the same way as a serial sort once we return).
1191 : * Parallel worker Tuplesortstates will have received only a fraction
1192 : * of maintenance_work_mem, though.
1193 : *
1194 : * We rely on the lifetime of the Leader Tuplesortstate almost not
1195 : * overlapping with any worker Tuplesortstate's lifetime. There may
1196 : * be some small overlap, but that's okay because we rely on leader
1197 : * Tuplesortstate only allocating a small, fixed amount of memory
1198 : * here. When its tuplesort_performsort() is called (by our caller),
1199 : * and significant amounts of memory are likely to be used, all
1200 : * workers must have already freed almost all memory held by their
1201 : * Tuplesortstates (they are about to go away completely, too). The
1202 : * overall effect is that maintenance_work_mem always represents an
1203 : * absolute high watermark on the amount of memory used by a CREATE
1204 : * INDEX operation, regardless of the use of parallelism or any other
1205 : * factor.
1206 : */
1207 8 : state->bs_sortstate =
1208 8 : tuplesort_begin_index_brin(maintenance_work_mem, coordinate,
1209 : TUPLESORT_NONE);
1210 :
1211 : /* scan the relation and merge per-worker results */
1212 8 : reltuples = _brin_parallel_merge(state);
1213 :
1214 8 : _brin_end_parallel(state->bs_leader, state);
1215 : }
1216 : else /* no parallel index build */
1217 : {
1218 : /*
1219 : * Now scan the relation. No syncscan allowed here because we want
1220 : * the heap blocks in physical order (we want to produce the ranges
1221 : * starting from block 0, and the callback also relies on this to not
1222 : * generate summary for the same range twice).
1223 : */
1224 336 : reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
1225 : brinbuildCallback, state, NULL);
1226 :
1227 : /*
1228 : * process the final batch
1229 : *
1230 : * XXX Note this does not update state->bs_currRangeStart, i.e. it
1231 : * stays set to the last range added to the index. This is OK, because
1232 : * that's what brin_fill_empty_ranges expects.
1233 : */
1234 336 : form_and_insert_tuple(state);
1235 :
1236 : /*
1237 : * Backfill the final ranges with empty data.
1238 : *
1239 : * This saves us from doing what amounts to full table scans when the
1240 : * index with a predicate like WHERE (nonnull_column IS NULL), or
1241 : * other very selective predicates.
1242 : */
1243 336 : brin_fill_empty_ranges(state,
1244 : state->bs_currRangeStart,
1245 : state->bs_maxRangeStart);
1246 : }
1247 :
1248 : /* release resources */
1249 344 : idxtuples = state->bs_numtuples;
1250 344 : brinRevmapTerminate(state->bs_rmAccess);
1251 344 : terminate_brin_buildstate(state);
1252 :
1253 : /*
1254 : * Return statistics
1255 : */
1256 344 : result = palloc_object(IndexBuildResult);
1257 :
1258 344 : result->heap_tuples = reltuples;
1259 344 : result->index_tuples = idxtuples;
1260 :
1261 344 : return result;
1262 : }
1263 :
1264 : void
1265 6 : brinbuildempty(Relation index)
1266 : {
1267 : Buffer metabuf;
1268 :
1269 : /* An empty BRIN index has a metapage only. */
1270 6 : metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
1271 : EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
1272 :
1273 : /* Initialize and xlog metabuffer. */
1274 6 : START_CRIT_SECTION();
1275 6 : brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
1276 : BRIN_CURRENT_VERSION);
1277 6 : MarkBufferDirty(metabuf);
1278 6 : log_newpage_buffer(metabuf, true);
1279 6 : END_CRIT_SECTION();
1280 :
1281 6 : UnlockReleaseBuffer(metabuf);
1282 6 : }
1283 :
1284 : /*
1285 : * brinbulkdelete
1286 : * Since there are no per-heap-tuple index tuples in BRIN indexes,
1287 : * there's not a lot we can do here.
1288 : *
1289 : * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
1290 : * tuple is deleted), meaning the need to re-run summarization on the affected
1291 : * range. Would need to add an extra flag in brintuples for that.
1292 : */
1293 : IndexBulkDeleteResult *
1294 22 : brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1295 : IndexBulkDeleteCallback callback, void *callback_state)
1296 : {
1297 : /* allocate stats if first time through, else re-use existing struct */
1298 22 : if (stats == NULL)
1299 22 : stats = palloc0_object(IndexBulkDeleteResult);
1300 :
1301 22 : return stats;
1302 : }
1303 :
1304 : /*
1305 : * This routine is in charge of "vacuuming" a BRIN index: we just summarize
1306 : * ranges that are currently unsummarized.
1307 : */
1308 : IndexBulkDeleteResult *
1309 90 : brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1310 : {
1311 : Relation heapRel;
1312 :
1313 : /* No-op in ANALYZE ONLY mode */
1314 90 : if (info->analyze_only)
1315 4 : return stats;
1316 :
1317 86 : if (!stats)
1318 70 : stats = palloc0_object(IndexBulkDeleteResult);
1319 86 : stats->num_pages = RelationGetNumberOfBlocks(info->index);
1320 : /* rest of stats is initialized by zeroing */
1321 :
1322 86 : heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
1323 : AccessShareLock);
1324 :
1325 86 : brin_vacuum_scan(info->index, info->strategy);
1326 :
1327 86 : brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
1328 : &stats->num_index_tuples, &stats->num_index_tuples);
1329 :
1330 86 : table_close(heapRel, AccessShareLock);
1331 :
1332 86 : return stats;
1333 : }
1334 :
1335 : /*
1336 : * reloptions processor for BRIN indexes
1337 : */
1338 : bytea *
1339 1116 : brinoptions(Datum reloptions, bool validate)
1340 : {
1341 : static const relopt_parse_elt tab[] = {
1342 : {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
1343 : {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
1344 : };
1345 :
1346 1116 : return (bytea *) build_reloptions(reloptions, validate,
1347 : RELOPT_KIND_BRIN,
1348 : sizeof(BrinOptions),
1349 : tab, lengthof(tab));
1350 : }
1351 :
1352 : /*
1353 : * SQL-callable function to scan through an index and summarize all ranges
1354 : * that are not currently summarized.
1355 : */
1356 : Datum
1357 76 : brin_summarize_new_values(PG_FUNCTION_ARGS)
1358 : {
1359 76 : Datum relation = PG_GETARG_DATUM(0);
1360 :
1361 76 : return DirectFunctionCall2(brin_summarize_range,
1362 : relation,
1363 : Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
1364 : }
1365 :
1366 : /*
1367 : * SQL-callable function to summarize the indicated page range, if not already
1368 : * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
1369 : * unsummarized ranges are summarized.
1370 : */
1371 : Datum
1372 204 : brin_summarize_range(PG_FUNCTION_ARGS)
1373 : {
1374 204 : Oid indexoid = PG_GETARG_OID(0);
1375 204 : int64 heapBlk64 = PG_GETARG_INT64(1);
1376 : BlockNumber heapBlk;
1377 : Oid heapoid;
1378 : Relation indexRel;
1379 : Relation heapRel;
1380 : Oid save_userid;
1381 : int save_sec_context;
1382 : int save_nestlevel;
1383 204 : double numSummarized = 0;
1384 :
1385 204 : if (RecoveryInProgress())
1386 0 : ereport(ERROR,
1387 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1388 : errmsg("recovery is in progress"),
1389 : errhint("BRIN control functions cannot be executed during recovery.")));
1390 :
1391 204 : if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
1392 36 : ereport(ERROR,
1393 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1394 : errmsg("block number out of range: %lld",
1395 : (long long) heapBlk64)));
1396 168 : heapBlk = (BlockNumber) heapBlk64;
1397 :
1398 : /*
1399 : * We must lock table before index to avoid deadlocks. However, if the
1400 : * passed indexoid isn't an index then IndexGetRelation() will fail.
1401 : * Rather than emitting a not-very-helpful error message, postpone
1402 : * complaining, expecting that the is-it-an-index test below will fail.
1403 : */
1404 168 : heapoid = IndexGetRelation(indexoid, true);
1405 168 : if (OidIsValid(heapoid))
1406 : {
1407 150 : heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1408 :
1409 : /*
1410 : * Autovacuum calls us. For its benefit, switch to the table owner's
1411 : * userid, so that any index functions are run as that user. Also
1412 : * lock down security-restricted operations and arrange to make GUC
1413 : * variable changes local to this command. This is harmless, albeit
1414 : * unnecessary, when called from SQL, because we fail shortly if the
1415 : * user does not own the index.
1416 : */
1417 150 : GetUserIdAndSecContext(&save_userid, &save_sec_context);
1418 150 : SetUserIdAndSecContext(heapRel->rd_rel->relowner,
1419 : save_sec_context | SECURITY_RESTRICTED_OPERATION);
1420 150 : save_nestlevel = NewGUCNestLevel();
1421 150 : RestrictSearchPath();
1422 : }
1423 : else
1424 : {
1425 18 : heapRel = NULL;
1426 : /* Set these just to suppress "uninitialized variable" warnings */
1427 18 : save_userid = InvalidOid;
1428 18 : save_sec_context = -1;
1429 18 : save_nestlevel = -1;
1430 : }
1431 :
1432 168 : indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1433 :
1434 : /* Must be a BRIN index */
1435 150 : if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1436 150 : indexRel->rd_rel->relam != BRIN_AM_OID)
1437 18 : ereport(ERROR,
1438 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1439 : errmsg("\"%s\" is not a BRIN index",
1440 : RelationGetRelationName(indexRel))));
1441 :
1442 : /* User must own the index (comparable to privileges needed for VACUUM) */
1443 132 : if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
1444 0 : aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1445 0 : RelationGetRelationName(indexRel));
1446 :
1447 : /*
1448 : * Since we did the IndexGetRelation call above without any lock, it's
1449 : * barely possible that a race against an index drop/recreation could have
1450 : * netted us the wrong table. Recheck.
1451 : */
1452 132 : if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1453 0 : ereport(ERROR,
1454 : (errcode(ERRCODE_UNDEFINED_TABLE),
1455 : errmsg("could not open parent table of index \"%s\"",
1456 : RelationGetRelationName(indexRel))));
1457 :
1458 : /* see gin_clean_pending_list() */
1459 132 : if (indexRel->rd_index->indisvalid)
1460 132 : brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
1461 : else
1462 0 : ereport(DEBUG1,
1463 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1464 : errmsg("index \"%s\" is not valid",
1465 : RelationGetRelationName(indexRel))));
1466 :
1467 : /* Roll back any GUC changes executed by index functions */
1468 132 : AtEOXact_GUC(false, save_nestlevel);
1469 :
1470 : /* Restore userid and security context */
1471 132 : SetUserIdAndSecContext(save_userid, save_sec_context);
1472 :
1473 132 : relation_close(indexRel, ShareUpdateExclusiveLock);
1474 132 : relation_close(heapRel, ShareUpdateExclusiveLock);
1475 :
1476 132 : PG_RETURN_INT32((int32) numSummarized);
1477 : }
1478 :
1479 : /*
1480 : * SQL-callable interface to mark a range as no longer summarized
1481 : */
1482 : Datum
1483 104 : brin_desummarize_range(PG_FUNCTION_ARGS)
1484 : {
1485 104 : Oid indexoid = PG_GETARG_OID(0);
1486 104 : int64 heapBlk64 = PG_GETARG_INT64(1);
1487 : BlockNumber heapBlk;
1488 : Oid heapoid;
1489 : Relation heapRel;
1490 : Relation indexRel;
1491 : bool done;
1492 :
1493 104 : if (RecoveryInProgress())
1494 0 : ereport(ERROR,
1495 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1496 : errmsg("recovery is in progress"),
1497 : errhint("BRIN control functions cannot be executed during recovery.")));
1498 :
1499 104 : if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
1500 18 : ereport(ERROR,
1501 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1502 : errmsg("block number out of range: %lld",
1503 : (long long) heapBlk64)));
1504 86 : heapBlk = (BlockNumber) heapBlk64;
1505 :
1506 : /*
1507 : * We must lock table before index to avoid deadlocks. However, if the
1508 : * passed indexoid isn't an index then IndexGetRelation() will fail.
1509 : * Rather than emitting a not-very-helpful error message, postpone
1510 : * complaining, expecting that the is-it-an-index test below will fail.
1511 : *
1512 : * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
1513 : * don't switch userid.
1514 : */
1515 86 : heapoid = IndexGetRelation(indexoid, true);
1516 86 : if (OidIsValid(heapoid))
1517 86 : heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
1518 : else
1519 0 : heapRel = NULL;
1520 :
1521 86 : indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
1522 :
1523 : /* Must be a BRIN index */
1524 86 : if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
1525 86 : indexRel->rd_rel->relam != BRIN_AM_OID)
1526 0 : ereport(ERROR,
1527 : (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1528 : errmsg("\"%s\" is not a BRIN index",
1529 : RelationGetRelationName(indexRel))));
1530 :
1531 : /* User must own the index (comparable to privileges needed for VACUUM) */
1532 86 : if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))
1533 0 : aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
1534 0 : RelationGetRelationName(indexRel));
1535 :
1536 : /*
1537 : * Since we did the IndexGetRelation call above without any lock, it's
1538 : * barely possible that a race against an index drop/recreation could have
1539 : * netted us the wrong table. Recheck.
1540 : */
1541 86 : if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
1542 0 : ereport(ERROR,
1543 : (errcode(ERRCODE_UNDEFINED_TABLE),
1544 : errmsg("could not open parent table of index \"%s\"",
1545 : RelationGetRelationName(indexRel))));
1546 :
1547 : /* see gin_clean_pending_list() */
1548 86 : if (indexRel->rd_index->indisvalid)
1549 : {
1550 : /* the revmap does the hard work */
1551 : do
1552 : {
1553 86 : done = brinRevmapDesummarizeRange(indexRel, heapBlk);
1554 : }
1555 86 : while (!done);
1556 : }
1557 : else
1558 0 : ereport(DEBUG1,
1559 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1560 : errmsg("index \"%s\" is not valid",
1561 : RelationGetRelationName(indexRel))));
1562 :
1563 86 : relation_close(indexRel, ShareUpdateExclusiveLock);
1564 86 : relation_close(heapRel, ShareUpdateExclusiveLock);
1565 :
1566 86 : PG_RETURN_VOID();
1567 : }
1568 :
1569 : /*
1570 : * Build a BrinDesc used to create or scan a BRIN index
1571 : */
1572 : BrinDesc *
1573 4514 : brin_build_desc(Relation rel)
1574 : {
1575 : BrinOpcInfo **opcinfo;
1576 : BrinDesc *bdesc;
1577 : TupleDesc tupdesc;
1578 4514 : int totalstored = 0;
1579 : int keyno;
1580 : long totalsize;
1581 : MemoryContext cxt;
1582 : MemoryContext oldcxt;
1583 :
1584 4514 : cxt = AllocSetContextCreate(CurrentMemoryContext,
1585 : "brin desc cxt",
1586 : ALLOCSET_SMALL_SIZES);
1587 4514 : oldcxt = MemoryContextSwitchTo(cxt);
1588 4514 : tupdesc = RelationGetDescr(rel);
1589 :
1590 : /*
1591 : * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1592 : * the number of columns stored, since the number is opclass-defined.
1593 : */
1594 4514 : opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
1595 75864 : for (keyno = 0; keyno < tupdesc->natts; keyno++)
1596 : {
1597 : FmgrInfo *opcInfoFn;
1598 71350 : Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
1599 :
1600 71350 : opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
1601 :
1602 142700 : opcinfo[keyno] = (BrinOpcInfo *)
1603 71350 : DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid));
1604 71350 : totalstored += opcinfo[keyno]->oi_nstored;
1605 : }
1606 :
1607 : /* Allocate our result struct and fill it in */
1608 4514 : totalsize = offsetof(BrinDesc, bd_info) +
1609 4514 : sizeof(BrinOpcInfo *) * tupdesc->natts;
1610 :
1611 4514 : bdesc = palloc(totalsize);
1612 4514 : bdesc->bd_context = cxt;
1613 4514 : bdesc->bd_index = rel;
1614 4514 : bdesc->bd_tupdesc = tupdesc;
1615 4514 : bdesc->bd_disktdesc = NULL; /* generated lazily */
1616 4514 : bdesc->bd_totalstored = totalstored;
1617 :
1618 75864 : for (keyno = 0; keyno < tupdesc->natts; keyno++)
1619 71350 : bdesc->bd_info[keyno] = opcinfo[keyno];
1620 4514 : pfree(opcinfo);
1621 :
1622 4514 : MemoryContextSwitchTo(oldcxt);
1623 :
1624 4514 : return bdesc;
1625 : }
1626 :
1627 : void
1628 3410 : brin_free_desc(BrinDesc *bdesc)
1629 : {
1630 : /* make sure the tupdesc is still valid */
1631 : Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
1632 : /* no need for retail pfree */
1633 3410 : MemoryContextDelete(bdesc->bd_context);
1634 3410 : }
1635 :
1636 : /*
1637 : * Fetch index's statistical data into *stats
1638 : */
1639 : void
1640 10730 : brinGetStats(Relation index, BrinStatsData *stats)
1641 : {
1642 : Buffer metabuffer;
1643 : Page metapage;
1644 : BrinMetaPageData *metadata;
1645 :
1646 10730 : metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
1647 10730 : LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
1648 10730 : metapage = BufferGetPage(metabuffer);
1649 10730 : metadata = (BrinMetaPageData *) PageGetContents(metapage);
1650 :
1651 10730 : stats->pagesPerRange = metadata->pagesPerRange;
1652 10730 : stats->revmapNumPages = metadata->lastRevmapPage - 1;
1653 :
1654 10730 : UnlockReleaseBuffer(metabuffer);
1655 10730 : }
1656 :
1657 : /*
1658 : * Initialize a BrinBuildState appropriate to create tuples on the given index.
1659 : */
1660 : static BrinBuildState *
1661 434 : initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
1662 : BlockNumber pagesPerRange, BlockNumber tablePages)
1663 : {
1664 : BrinBuildState *state;
1665 434 : BlockNumber lastRange = 0;
1666 :
1667 434 : state = palloc_object(BrinBuildState);
1668 :
1669 434 : state->bs_irel = idxRel;
1670 434 : state->bs_numtuples = 0;
1671 434 : state->bs_reltuples = 0;
1672 434 : state->bs_currentInsertBuf = InvalidBuffer;
1673 434 : state->bs_pagesPerRange = pagesPerRange;
1674 434 : state->bs_currRangeStart = 0;
1675 434 : state->bs_rmAccess = revmap;
1676 434 : state->bs_bdesc = brin_build_desc(idxRel);
1677 434 : state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
1678 434 : state->bs_leader = NULL;
1679 434 : state->bs_worker_id = 0;
1680 434 : state->bs_sortstate = NULL;
1681 434 : state->bs_context = CurrentMemoryContext;
1682 434 : state->bs_emptyTuple = NULL;
1683 434 : state->bs_emptyTupleLen = 0;
1684 :
1685 : /* Remember the memory context to use for an empty tuple, if needed. */
1686 434 : state->bs_context = CurrentMemoryContext;
1687 434 : state->bs_emptyTuple = NULL;
1688 434 : state->bs_emptyTupleLen = 0;
1689 :
1690 : /*
1691 : * Calculate the start of the last page range. Page numbers are 0-based,
1692 : * so to calculate the index we need to subtract one. The integer division
1693 : * gives us the index of the page range.
1694 : */
1695 434 : if (tablePages > 0)
1696 332 : lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
1697 :
1698 : /* Now calculate the start of the next range. */
1699 434 : state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
1700 :
1701 434 : return state;
1702 : }
1703 :
1704 : /*
1705 : * Release resources associated with a BrinBuildState.
1706 : */
1707 : static void
1708 422 : terminate_brin_buildstate(BrinBuildState *state)
1709 : {
1710 : /*
1711 : * Release the last index buffer used. We might as well ensure that
1712 : * whatever free space remains in that page is available in FSM, too.
1713 : */
1714 422 : if (!BufferIsInvalid(state->bs_currentInsertBuf))
1715 : {
1716 : Page page;
1717 : Size freespace;
1718 : BlockNumber blk;
1719 :
1720 344 : page = BufferGetPage(state->bs_currentInsertBuf);
1721 344 : freespace = PageGetFreeSpace(page);
1722 344 : blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
1723 344 : ReleaseBuffer(state->bs_currentInsertBuf);
1724 344 : RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
1725 344 : FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
1726 : }
1727 :
1728 422 : brin_free_desc(state->bs_bdesc);
1729 422 : pfree(state->bs_dtuple);
1730 422 : pfree(state);
1731 422 : }
1732 :
1733 : /*
1734 : * On the given BRIN index, summarize the heap page range that corresponds
1735 : * to the heap block number given.
1736 : *
1737 : * This routine can run in parallel with insertions into the heap. To avoid
1738 : * missing those values from the summary tuple, we first insert a placeholder
1739 : * index tuple into the index, then execute the heap scan; transactions
1740 : * concurrent with the scan update the placeholder tuple. After the scan, we
1741 : * union the placeholder tuple with the one computed by this routine. The
1742 : * update of the index value happens in a loop, so that if somebody updates
1743 : * the placeholder tuple after we read it, we detect the case and try again.
1744 : * This ensures that the concurrently inserted tuples are not lost.
1745 : *
1746 : * A further corner case is this routine being asked to summarize the partial
1747 : * range at the end of the table. heapNumBlocks is the (possibly outdated)
1748 : * table size; if we notice that the requested range lies beyond that size,
1749 : * we re-compute the table size after inserting the placeholder tuple, to
1750 : * avoid missing pages that were appended recently.
1751 : */
1752 : static void
1753 2934 : summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
1754 : BlockNumber heapBlk, BlockNumber heapNumBlks)
1755 : {
1756 : Buffer phbuf;
1757 : BrinTuple *phtup;
1758 : Size phsz;
1759 : OffsetNumber offset;
1760 : BlockNumber scanNumBlks;
1761 :
1762 : /*
1763 : * Insert the placeholder tuple. brin_doinsert fills in phbuf, which we keep using below and release at the end of this function.
1764 : */
1765 2934 : phbuf = InvalidBuffer;
1766 2934 : phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
1767 2934 : offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
1768 : state->bs_rmAccess, &phbuf,
1769 : heapBlk, phtup, phsz);
1770 :
1771 : /*
1772 : * Compute range end. We hold ShareUpdateExclusive lock on table, so it
1773 : * cannot shrink concurrently (but it can grow).
1774 : */
1775 : Assert(heapBlk % state->bs_pagesPerRange == 0);
1776 2934 : if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
1777 : {
1778 : /*
1779 : * If we're asked to scan what we believe to be the final range on the
1780 : * table (i.e. a range that might be partial) we need to recompute our
1781 : * idea of what the latest page is after inserting the placeholder
1782 : * tuple. Anyone that grows the table later will update the
1783 : * placeholder tuple, so it doesn't matter that we won't scan these
1784 : * pages ourselves. Careful: the table might have been extended
1785 : * beyond the current range, so clamp our result.
1786 : *
1787 : * Fortunately, this should occur infrequently.
1788 : */
1789 24 : scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
1790 : state->bs_pagesPerRange);
1791 : }
1792 : else
1793 : {
1794 : /* Easy case: range is known to be complete */
1795 2910 : scanNumBlks = state->bs_pagesPerRange;
1796 : }
1797 :
1798 : /*
1799 : * Execute the partial heap scan covering the heap blocks in the specified
1800 : * page range, summarizing the heap tuples in it. This scan stops just
1801 : * short of brinbuildCallback creating the new index entry.
1802 : *
1803 : * Note that it is critical we use the "any visible" mode of
1804 : * table_index_build_range_scan here: otherwise, we would miss tuples
1805 : * inserted by transactions that are still in progress, among other corner
1806 : * cases. NOTE(review): the literal "true" passed below is presumably the anyvisible flag -- confirm against the table AM API.
1807 : */
1808 2934 : state->bs_currRangeStart = heapBlk;
1809 2934 : table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
1810 : heapBlk, scanNumBlks,
1811 : brinbuildCallback, state, NULL);
1812 :
1813 : /*
1814 : * Now we update the values obtained by the scan with the placeholder
1815 : * tuple. We do this in a loop which only terminates if we're able to
1816 : * update the placeholder tuple successfully; if we are not, this means
1817 : * somebody else modified the placeholder tuple after we read it.
1818 : */
1819 : for (;;)
1820 0 : {
1821 : BrinTuple *newtup;
1822 : Size newsize;
1823 : bool didupdate;
1824 : bool samepage;
1825 :
1826 2934 : CHECK_FOR_INTERRUPTS();
1827 :
1828 : /*
1829 : * Form the on-disk summary tuple from the scan results and try to replace the placeholder tuple with it.
1830 : */
1831 2934 : newtup = brin_form_tuple(state->bs_bdesc,
1832 : heapBlk, state->bs_dtuple, &newsize);
1833 2934 : samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
1834 : didupdate =
1835 2934 : brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
1836 : state->bs_rmAccess, heapBlk, phbuf, offset,
1837 : phtup, phsz, newtup, newsize, samepage);
1838 2934 : brin_free_tuple(phtup);
1839 2934 : brin_free_tuple(newtup);
1840 :
1841 : /* If the update succeeded, we're done. */
1842 2934 : if (didupdate)
1843 2934 : break;
1844 :
1845 : /*
1846 : * If the update didn't work, it might be because somebody updated the
1847 : * placeholder tuple concurrently. Extract the new version, union it
1848 : * with the values we have from the scan, and start over. (There are
1849 : * other reasons for the update to fail, but it's simple to treat them
1850 : * the same.)
1851 : */
1852 0 : phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
1853 : &offset, &phsz, BUFFER_LOCK_SHARE);
1854 : /* the placeholder tuple must exist; we copy it below before dropping the buffer lock */
1855 0 : if (phtup == NULL)
1856 0 : elog(ERROR, "missing placeholder tuple");
1857 0 : phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
1858 0 : LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
1859 :
1860 : /* merge it into the tuple from the heap scan */
1861 0 : union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
1862 : }
1863 :
1864 2934 : ReleaseBuffer(phbuf);
1865 2934 : }
1866 :
1867 : /*
1868 : * Summarize page ranges that are not already summarized. If pageRange is
1869 : * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
1870 : * page range containing the given heap page number is scanned.
1871 : * If include_partial is true, then the partial range at the end of the table
1872 : * is summarized, otherwise not.
1873 : *
1874 : * For each new index tuple inserted, *numSummarized (if not NULL) is
1875 : * incremented; for each existing tuple, *numExisting (if not NULL) is
1876 : * incremented. Both counters are only added to here, never reset.
1877 : */
1878 : static void
1879 218 : brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
1880 : bool include_partial, double *numSummarized, double *numExisting)
1881 : {
1882 : BrinRevmap *revmap;
1883 218 : BrinBuildState *state = NULL;
1884 218 : IndexInfo *indexInfo = NULL;
1885 : BlockNumber heapNumBlocks;
1886 : BlockNumber pagesPerRange;
1887 : Buffer buf;
1888 : BlockNumber startBlk;
1889 :
1890 218 : revmap = brinRevmapInitialize(index, &pagesPerRange);
1891 :
1892 : /* determine range of pages to process; startBlk is rounded down to a range boundary */
1893 218 : heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
1894 218 : if (pageRange == BRIN_ALL_BLOCKRANGES)
1895 144 : startBlk = 0;
1896 : else
1897 : {
1898 74 : startBlk = (pageRange / pagesPerRange) * pagesPerRange;
1899 74 : heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
1900 : }
1901 218 : if (startBlk > heapNumBlocks)
1902 : {
1903 : /* Nothing to do if start point is beyond end of table */
1904 0 : brinRevmapTerminate(revmap);
1905 0 : return;
1906 : }
1907 :
1908 : /*
1909 : * Scan the revmap to find unsummarized items.
1910 : */
1911 218 : buf = InvalidBuffer;
1912 18944 : for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
1913 : {
1914 : BrinTuple *tup;
1915 : OffsetNumber off;
1916 :
1917 : /*
1918 : * Unless requested to summarize even a partial range, go away now if
1919 : * we think the next range is partial. Caller would pass true when it
1920 : * is typically run once bulk data loading is done
1921 : * (brin_summarize_new_values), and false when it is typically the
1922 : * result of an arbitrarily-scheduled maintenance command (vacuuming).
1923 : */
1924 18792 : if (!include_partial &&
1925 2050 : (startBlk + pagesPerRange > heapNumBlocks))
1926 66 : break;
1927 :
1928 18726 : CHECK_FOR_INTERRUPTS();
1929 :
1930 18726 : tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
1931 : BUFFER_LOCK_SHARE);
1932 18726 : if (tup == NULL)
1933 : {
1934 : /* no revmap entry for this heap range. Summarize it. */
1935 2934 : if (state == NULL)
1936 : {
1937 : /* first time through: the build state and IndexInfo are created lazily */
1938 : Assert(!indexInfo);
1939 78 : state = initialize_brin_buildstate(index, revmap,
1940 : pagesPerRange,
1941 : InvalidBlockNumber);
1942 78 : indexInfo = BuildIndexInfo(index);
1943 : }
1944 2934 : summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
1945 :
1946 : /* and re-initialize state for the next range */
1947 2934 : brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1948 :
1949 2934 : if (numSummarized)
1950 2934 : *numSummarized += 1.0;
1951 : }
1952 : else
1953 : {
1954 15792 : if (numExisting)
1955 1892 : *numExisting += 1.0;
1956 15792 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1957 : }
1958 : }
1959 :
1960 218 : if (BufferIsValid(buf))
1961 152 : ReleaseBuffer(buf);
1962 :
1963 : /* free resources; state and indexInfo exist only if at least one range was summarized */
1964 218 : brinRevmapTerminate(revmap);
1965 218 : if (state)
1966 : {
1967 78 : terminate_brin_buildstate(state);
1968 78 : pfree(indexInfo);
1969 : }
1970 : }
1971 :
1972 : /*
1973 : * Given a deformed tuple in the build state, convert it into the on-disk
1974 : * format and insert it into the index, making the revmap point to it. Also increments bs_numtuples; the on-disk tuple formed here is freed before returning.
1975 : */
1976 : static void
1977 2632 : form_and_insert_tuple(BrinBuildState *state)
1978 : {
1979 : BrinTuple *tup;
1980 : Size size;
1981 :
1982 2632 : tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
1983 : state->bs_dtuple, &size);
1984 2632 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
1985 : &state->bs_currentInsertBuf, state->bs_currRangeStart,
1986 : tup, size);
1987 2632 : state->bs_numtuples++;
1988 :
1989 2632 : pfree(tup);
1990 2632 : }
1991 :
1992 : /*
1993 : * Given a deformed tuple in the build state, convert it into the on-disk
1994 : * format and write it to a (shared) tuplesort (the leader will insert it
1995 : * into the index later). Empty ranges are skipped and are not counted in bs_numtuples.
1996 : */
1997 : static void
1998 58 : form_and_spill_tuple(BrinBuildState *state)
1999 : {
2000 : BrinTuple *tup;
2001 : Size size;
2002 :
2003 : /* don't insert empty tuples in parallel build */
2004 58 : if (state->bs_dtuple->bt_empty_range)
2005 18 : return;
2006 :
2007 40 : tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
2008 : state->bs_dtuple, &size);
2009 :
2010 : /* write the BRIN tuple to the tuplesort */
2011 40 : tuplesort_putbrintuple(state->bs_sortstate, tup, size);
2012 :
2013 40 : state->bs_numtuples++;
2014 :
2015 40 : pfree(tup);
2016 : }
2017 :
2018 : /*
2019 : * Given two deformed tuples, adjust the first one so that it's consistent
2020 : * with the summary values in both. "b" is an on-disk tuple that is deformed here inside a private memory context, which is deleted before returning; the result is written into "a".
2021 : */
2022 : static void
2023 0 : union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
2024 : {
2025 : int keyno;
2026 : BrinMemTuple *db;
2027 : MemoryContext cxt;
2028 : MemoryContext oldcxt;
2029 :
2030 : /* Use our own memory context to avoid retail pfree; we switch back right after deforming "b" */
2031 0 : cxt = AllocSetContextCreate(CurrentMemoryContext,
2032 : "brin union",
2033 : ALLOCSET_DEFAULT_SIZES);
2034 0 : oldcxt = MemoryContextSwitchTo(cxt);
2035 0 : db = brin_deform_tuple(bdesc, b, NULL);
2036 0 : MemoryContextSwitchTo(oldcxt);
2037 :
2038 : /*
2039 : * Check if the ranges are empty.
2040 : *
2041 : * If at least one of them is empty, we don't need to call per-key union
2042 : * functions at all. If "b" is empty, we just use "a" as the result (it
2043 : * might be empty too, but that's fine). If "a" is empty but "b" is not,
2044 : * we use "b" as the result (but we have to copy the data into "a" first).
2045 : *
2046 : * Only when both ranges are non-empty, we actually do the per-key merge.
2047 : */
2048 :
2049 : /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
2050 0 : if (db->bt_empty_range)
2051 : {
2052 : /* skip the per-key merge */
2053 0 : MemoryContextDelete(cxt);
2054 0 : return;
2055 : }
2056 :
2057 : /*
2058 : * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
2059 : * But we need to copy the data from "b" to "a" first, because that's how
2060 : * we pass result out.
2061 : *
2062 : * We have to copy all the global/per-key flags etc. too.
2063 : */
2064 0 : if (a->bt_empty_range)
2065 : {
2066 0 : for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2067 : {
2068 : int i;
2069 0 : BrinValues *col_a = &a->bt_columns[keyno];
2070 0 : BrinValues *col_b = &db->bt_columns[keyno];
2071 0 : BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2072 :
2073 0 : col_a->bv_allnulls = col_b->bv_allnulls;
2074 0 : col_a->bv_hasnulls = col_b->bv_hasnulls;
2075 :
2076 : /* If "b" has no data, we're done. */
2077 0 : if (col_b->bv_allnulls)
2078 0 : continue;
2079 :
2080 0 : for (i = 0; i < opcinfo->oi_nstored; i++)
2081 0 : col_a->bv_values[i] =
2082 0 : datumCopy(col_b->bv_values[i],
2083 0 : opcinfo->oi_typcache[i]->typbyval,
2084 0 : opcinfo->oi_typcache[i]->typlen);
2085 : }
2086 :
2087 : /* "a" started empty, but "b" was not empty, so remember that */
2088 0 : a->bt_empty_range = false;
2089 :
2090 : /* skip the per-key merge */
2091 0 : MemoryContextDelete(cxt);
2092 0 : return;
2093 : }
2094 :
2095 : /* Now we know neither range is empty. */
2096 0 : for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2097 : {
2098 : FmgrInfo *unionFn;
2099 0 : BrinValues *col_a = &a->bt_columns[keyno];
2100 0 : BrinValues *col_b = &db->bt_columns[keyno];
2101 0 : BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
2102 :
2103 0 : if (opcinfo->oi_regular_nulls)
2104 : {
2105 : /* Does the "b" summary represent any NULL values? */
2106 0 : bool b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);
2107 :
2108 : /* Adjust "hasnulls". */
2109 0 : if (!col_a->bv_allnulls && b_has_nulls)
2110 0 : col_a->bv_hasnulls = true;
2111 :
2112 : /* If there are no values in B, there's nothing left to do. */
2113 0 : if (col_b->bv_allnulls)
2114 0 : continue;
2115 :
2116 : /*
2117 : * Adjust "allnulls". If A doesn't have values, just copy the
2118 : * values from B into A, and we're done. We cannot run the
2119 : * operators in this case, because values in A might contain
2120 : * garbage. Note we already established that B contains values.
2121 : *
2122 : * Also adjust "hasnulls" in order not to forget the summary
2123 : * represents NULL values. This is not redundant with the earlier
2124 : * update, because that only happens when allnulls=false.
2125 : */
2126 0 : if (col_a->bv_allnulls)
2127 : {
2128 : int i;
2129 :
2130 0 : col_a->bv_allnulls = false;
2131 0 : col_a->bv_hasnulls = true;
2132 :
2133 0 : for (i = 0; i < opcinfo->oi_nstored; i++)
2134 0 : col_a->bv_values[i] =
2135 0 : datumCopy(col_b->bv_values[i],
2136 0 : opcinfo->oi_typcache[i]->typbyval,
2137 0 : opcinfo->oi_typcache[i]->typlen);
2138 :
2139 0 : continue;
2140 : }
2141 : }
2142 :
2143 0 : unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
2144 : BRIN_PROCNUM_UNION);
2145 0 : FunctionCall3Coll(unionFn,
2146 0 : bdesc->bd_index->rd_indcollation[keyno],
2147 : PointerGetDatum(bdesc),
2148 : PointerGetDatum(col_a),
2149 : PointerGetDatum(col_b));
2150 : }
2151 :
2152 0 : MemoryContextDelete(cxt);
2153 : }
2154 :
2155 : /*
2156 : * brin_vacuum_scan
2157 : * Do a complete scan of the index during VACUUM.
2158 : *
2159 : * This routine scans the complete index looking for uncataloged index pages,
2160 : * i.e. those that might have been lost due to a crash after index extension
2161 : * and such. Every block of the main fork is read using the given buffer access strategy and handed to brin_page_cleanup.
2162 : */
2163 : static void
2164 86 : brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
2165 : {
2166 : BlockNumber nblocks;
2167 : BlockNumber blkno;
2168 :
2169 : /*
2170 : * Scan the index in physical order, and clean up any possible mess in
2171 : * each page. The block count is read once, before the loop starts.
2172 : */
2173 86 : nblocks = RelationGetNumberOfBlocks(idxrel);
2174 458 : for (blkno = 0; blkno < nblocks; blkno++)
2175 : {
2176 : Buffer buf;
2177 :
2178 372 : CHECK_FOR_INTERRUPTS();
2179 :
2180 372 : buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
2181 : RBM_NORMAL, strategy);
2182 :
2183 372 : brin_page_cleanup(idxrel, buf);
2184 :
2185 372 : ReleaseBuffer(buf);
2186 : }
2187 :
2188 : /*
2189 : * Update all upper pages in the index's FSM, as well. This ensures not
2190 : * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
2191 : * but also that any pre-existing damage or out-of-dateness is repaired.
2192 : */
2193 86 : FreeSpaceMapVacuum(idxrel);
2194 86 : }
2195 :
2196 : static bool
2197 784072 : add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
2198 : const Datum *values, const bool *nulls)
2199 : {
2200 : int keyno;
2201 :
2202 : /* If the range starts empty, we're certainly going to modify it. */
2203 784072 : bool modified = dtup->bt_empty_range;
2204 :
2205 : /*
2206 : * Compare the key values of the new tuple to the stored index values; our
2207 : * deformed tuple will get updated if the new tuple doesn't fit the
2208 : * original range (note this means we can't break out of the loop early).
2209 : * Make a note of whether this happens, so that we know to insert the
2210 : * modified tuple later; that flag is this function's return value.
2211 : */
2212 1847696 : for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
2213 : {
2214 : Datum result;
2215 : BrinValues *bval;
2216 : FmgrInfo *addValue;
2217 : bool has_nulls;
2218 :
2219 1063624 : bval = &dtup->bt_columns[keyno];
2220 :
2221 : /*
2222 : * Does the range have actual NULL values? Either of the flags can be
2223 : * set, but we ignore the state before adding first row.
2224 : *
2225 : * We have to remember this, because we'll modify the flags and we
2226 : * need to know if the range started as empty.
2227 : */
2228 2090692 : has_nulls = ((!dtup->bt_empty_range) &&
2229 1027068 : (bval->bv_hasnulls || bval->bv_allnulls));
2230 :
2231 : /*
2232 : * If the value we're adding is NULL, handle it locally. Otherwise
2233 : * call the BRIN_PROCNUM_ADDVALUE procedure.
2234 : */
2235 1063624 : if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
2236 : {
2237 : /*
2238 : * If the new value is null, we record that we saw it if it's the
2239 : * first one; otherwise, there's nothing to do.
2240 : */
2241 18616 : if (!bval->bv_hasnulls)
2242 : {
2243 3568 : bval->bv_hasnulls = true;
2244 3568 : modified = true;
2245 : }
2246 :
2247 18616 : continue;
2248 : }
2249 :
2250 1045008 : addValue = index_getprocinfo(idxRel, keyno + 1,
2251 : BRIN_PROCNUM_ADDVALUE);
2252 1045008 : result = FunctionCall4Coll(addValue,
2253 1045008 : idxRel->rd_indcollation[keyno],
2254 : PointerGetDatum(bdesc),
2255 : PointerGetDatum(bval),
2256 1045008 : values[keyno],
2257 1045008 : nulls[keyno]);
2258 : /* if that returned true, we need to insert the updated tuple */
2259 1045008 : modified |= DatumGetBool(result);
2260 :
2261 : /*
2262 : * If the range had actual NULL values (i.e. did not start empty),
2263 : * make sure we don't forget about the NULL values. Either the
2264 : * allnulls flag is still set to true, or (if the opclass cleared it)
2265 : * we need to set hasnulls=true.
2266 : *
2267 : * XXX This can only happen when the opclass modified the tuple, so
2268 : * the modified flag should be set.
2269 : */
2270 1045008 : if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
2271 : {
2272 : Assert(modified);
2273 4 : bval->bv_hasnulls = true;
2274 : }
2275 : }
2276 :
2277 : /*
2278 : * After updating summaries for all the keys, mark it as not empty.
2279 : *
2280 : * If we're actually changing the flag value (i.e. tuple started as
2281 : * empty), we should have modified the tuple. So we should not see empty
2282 : * range that was not modified.
2283 : */
2284 : Assert(!dtup->bt_empty_range || modified);
2285 784072 : dtup->bt_empty_range = false;
2286 :
2287 784072 : return modified;
2288 : }
2289 :
2290 : static bool
2291 189936 : check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
2292 : {
2293 : int keyno;
2294 :
2295 : /*
2296 : * First check if there are any IS [NOT] NULL scan keys, and if we're
2297 : * violating them. We return false as soon as one key rules the range out, and true if none does.
2298 : */
2299 191172 : for (keyno = 0; keyno < nnullkeys; keyno++)
2300 : {
2301 2232 : ScanKey key = nullkeys[keyno];
2302 :
2303 : Assert(key->sk_attno == bval->bv_attno);
2304 :
2305 : /* Handle only IS NULL/IS NOT NULL tests; keys without SK_ISNULL are skipped */
2306 2232 : if (!(key->sk_flags & SK_ISNULL))
2307 0 : continue;
2308 :
2309 2232 : if (key->sk_flags & SK_SEARCHNULL)
2310 : {
2311 : /* IS NULL scan key, but range has no NULLs */
2312 1116 : if (!bval->bv_allnulls && !bval->bv_hasnulls)
2313 978 : return false;
2314 : }
2315 1116 : else if (key->sk_flags & SK_SEARCHNOTNULL)
2316 : {
2317 : /*
2318 : * For IS NOT NULL, we can only skip ranges that are known to have
2319 : * only nulls.
2320 : */
2321 1116 : if (bval->bv_allnulls)
2322 18 : return false;
2323 : }
2324 : else
2325 : {
2326 : /*
2327 : * Neither IS NULL nor IS NOT NULL was used; assume all indexable
2328 : * operators are strict and thus return false with NULL value in
2329 : * the scan key.
2330 : */
2331 0 : return false;
2332 : }
2333 : }
2334 :
2335 188940 : return true;
2336 : }
2337 :
2338 : /*
2339 : * Create parallel context, and launch workers for leader.
2340 : *
2341 : * buildstate argument should be initialized (with the exception of the
2342 : * tuplesort states, which may later be created based on shared
2343 : * state initially set up here).
2344 : *
2345 : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
2346 : *
2347 : * request is the target number of parallel worker processes to launch.
2348 : *
2349 : * Sets buildstate's BrinLeader, which caller must use to shut down parallel
2350 : * mode by passing it to _brin_end_parallel() at the very end of its index
2351 : * build. If not even a single worker process can be launched, this is
2352 : * never set, and caller should proceed with a serial index build.
2353 : */
2354 : static void
2355 10 : _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
2356 : bool isconcurrent, int request)
2357 : {
2358 : ParallelContext *pcxt;
2359 : int scantuplesortstates;
2360 : Snapshot snapshot;
2361 : Size estbrinshared;
2362 : Size estsort;
2363 : BrinShared *brinshared;
2364 : Sharedsort *sharedsort;
2365 10 : BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader));
2366 : WalUsage *walusage;
2367 : BufferUsage *bufferusage;
2368 10 : bool leaderparticipates = true;
2369 : int querylen;
2370 :
2371 : #ifdef DISABLE_LEADER_PARTICIPATION
2372 : leaderparticipates = false;
2373 : #endif
2374 :
2375 : /*
2376 : * Enter parallel mode, and create context for parallel build of brin
2377 : * index
2378 : */
2379 10 : EnterParallelMode();
2380 : Assert(request > 0);
2381 10 : pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
2382 : request);
2383 :
2384 10 : scantuplesortstates = leaderparticipates ? request + 1 : request;
2385 :
2386 : /*
2387 : * Prepare for scan of the base relation. In a normal index build, we use
2388 : * SnapshotAny because we must retrieve all tuples and do our own time
2389 : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
2390 : * concurrent build, we take a regular MVCC snapshot and index whatever's
2391 : * live according to that.
2392 : */
2393 10 : if (!isconcurrent)
2394 10 : snapshot = SnapshotAny;
2395 : else
2396 0 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
2397 :
2398 : /*
2399 : * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace, and for the PARALLEL_KEY_TUPLESORT shared sort state.
2400 : */
2401 10 : estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
2402 10 : shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
2403 10 : estsort = tuplesort_estimate_shared(scantuplesortstates);
2404 10 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
2405 :
2406 10 : shm_toc_estimate_keys(&pcxt->estimator, 2);
2407 :
2408 : /*
2409 : * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
2410 : * and PARALLEL_KEY_BUFFER_USAGE.
2411 : *
2412 : * If there are no extensions loaded that care, we could skip this. We
2413 : * have no way of knowing whether anyone's looking at pgWalUsage or
2414 : * pgBufferUsage, so do it unconditionally.
2415 : */
2416 10 : shm_toc_estimate_chunk(&pcxt->estimator,
2417 : mul_size(sizeof(WalUsage), pcxt->nworkers));
2418 10 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2419 10 : shm_toc_estimate_chunk(&pcxt->estimator,
2420 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
2421 10 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2422 :
2423 : /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
2424 10 : if (debug_query_string)
2425 : {
2426 10 : querylen = strlen(debug_query_string);
2427 10 : shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
2428 10 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2429 : }
2430 : else
2431 0 : querylen = 0; /* keep compiler quiet */
2432 :
2433 : /* Everyone's had a chance to ask for space, so now create the DSM */
2434 10 : InitializeParallelDSM(pcxt);
2435 :
2436 : /* If no DSM segment was available, back out (do serial build) */
2437 10 : if (pcxt->seg == NULL)
2438 : {
2439 0 : if (IsMVCCSnapshot(snapshot))
2440 0 : UnregisterSnapshot(snapshot);
2441 0 : DestroyParallelContext(pcxt);
2442 0 : ExitParallelMode();
2443 0 : return;
2444 : }
2445 :
2446 : /* Store shared build state, for which we reserved space */
2447 10 : brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
2448 : /* Initialize immutable state */
2449 10 : brinshared->heaprelid = RelationGetRelid(heap);
2450 10 : brinshared->indexrelid = RelationGetRelid(index);
2451 10 : brinshared->isconcurrent = isconcurrent;
2452 10 : brinshared->scantuplesortstates = scantuplesortstates;
2453 10 : brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
2454 10 : brinshared->queryid = pgstat_get_my_query_id();
2455 10 : ConditionVariableInit(&brinshared->workersdonecv);
2456 10 : SpinLockInit(&brinshared->mutex);
2457 :
2458 : /* Initialize mutable state */
2459 10 : brinshared->nparticipantsdone = 0;
2460 10 : brinshared->reltuples = 0.0;
2461 10 : brinshared->indtuples = 0.0;
2462 :
2463 10 : table_parallelscan_initialize(heap,
2464 : ParallelTableScanFromBrinShared(brinshared),
2465 : snapshot);
2466 :
2467 : /*
2468 : * Store shared tuplesort-private state, for which we reserved space.
2469 : * Then, initialize opaque state using tuplesort routine.
2470 : */
2471 10 : sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
2472 10 : tuplesort_initialize_shared(sharedsort, scantuplesortstates,
2473 : pcxt->seg);
2474 :
2475 : /*
2476 : * Register the shared build state and the shared tuplesort state in the
2477 : * TOC under their respective parallel keys.
2478 : */
2479 10 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
2480 10 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
2481 :
2482 : /* Store query string for workers */
2483 10 : if (debug_query_string)
2484 : {
2485 : char *sharedquery;
2486 :
2487 10 : sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
2488 10 : memcpy(sharedquery, debug_query_string, querylen + 1);
2489 10 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
2490 : }
2491 :
2492 : /*
2493 : * Allocate space for each worker's WalUsage and BufferUsage; no need to
2494 : * initialize.
2495 : */
2496 10 : walusage = shm_toc_allocate(pcxt->toc,
2497 10 : mul_size(sizeof(WalUsage), pcxt->nworkers));
2498 10 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
2499 10 : bufferusage = shm_toc_allocate(pcxt->toc,
2500 10 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
2501 10 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
2502 :
2503 : /* Launch workers, saving status for leader/caller */
2504 10 : LaunchParallelWorkers(pcxt);
2505 10 : brinleader->pcxt = pcxt;
2506 10 : brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
2507 10 : if (leaderparticipates)
2508 10 : brinleader->nparticipanttuplesorts++;
2509 10 : brinleader->brinshared = brinshared;
2510 10 : brinleader->sharedsort = sharedsort;
2511 10 : brinleader->snapshot = snapshot;
2512 10 : brinleader->walusage = walusage;
2513 10 : brinleader->bufferusage = bufferusage;
2514 :
2515 : /* If no workers were successfully launched, back out (do serial build) */
2516 10 : if (pcxt->nworkers_launched == 0)
2517 : {
2518 2 : _brin_end_parallel(brinleader, NULL);
2519 2 : return;
2520 : }
2521 :
2522 : /* Save leader state now that it's clear build will be parallel */
2523 8 : buildstate->bs_leader = brinleader;
2524 :
2525 : /* Join heap scan ourselves */
2526 8 : if (leaderparticipates)
2527 8 : _brin_leader_participate_as_worker(buildstate, heap, index);
2528 :
2529 : /*
2530 : * Caller needs to wait for all launched workers when we return. Make
2531 : * sure that the failure-to-start case will not hang forever.
2532 : */
2533 8 : WaitForParallelWorkersToAttach(pcxt);
2534 : }
2535 :
2536 : /*
2537 : * Shut down workers, destroy parallel context, and end parallel mode. The "state" argument is not used by this function.
2538 : */
2539 : static void
2540 10 : _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
2541 : {
2542 : int i;
2543 :
2544 : /* Shutdown worker processes */
2545 10 : WaitForParallelWorkersToFinish(brinleader->pcxt);
2546 :
2547 : /*
2548 : * Next, accumulate buffer and WAL usage. (This must wait for the workers to finish,
2549 : * or we might get incomplete data.)
2550 : */
2551 22 : for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
2552 12 : InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
2553 :
2554 : /* Free last reference to MVCC snapshot, if one was used */
2555 10 : if (IsMVCCSnapshot(brinleader->snapshot))
2556 0 : UnregisterSnapshot(brinleader->snapshot);
2557 10 : DestroyParallelContext(brinleader->pcxt);
2558 10 : ExitParallelMode();
2559 10 : }
2560 :
2561 : /*
2562 : * Within leader, wait for end of heap scan.
2563 : *
2564 : * When called, parallel heap scan started by _brin_begin_parallel() will
2565 : * already be underway within worker processes (when leader participates
2566 : * as a worker, we should end up here just as workers are finishing).
2567 : *
2568 : * Returns the total number of heap tuples scanned. As a side effect, the shared reltuples and indtuples counters are copied into the leader's build state.
2569 : */
2570 : static double
2571 8 : _brin_parallel_heapscan(BrinBuildState *state)
2572 : {
2573 8 : BrinShared *brinshared = state->bs_leader->brinshared;
2574 : int nparticipanttuplesorts;
2575 :
2576 8 : nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
2577 : for (;;)
2578 : {
2579 20 : SpinLockAcquire(&brinshared->mutex);
2580 20 : if (brinshared->nparticipantsdone == nparticipanttuplesorts)
2581 : {
2582 : /* copy the data into leader state, while still holding the spinlock */
2583 8 : state->bs_reltuples = brinshared->reltuples;
2584 8 : state->bs_numtuples = brinshared->indtuples;
2585 :
2586 8 : SpinLockRelease(&brinshared->mutex);
2587 8 : break;
2588 : }
2589 12 : SpinLockRelease(&brinshared->mutex);
2590 :
2591 12 : ConditionVariableSleep(&brinshared->workersdonecv,
2592 : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
2593 : }
2594 :
2595 8 : ConditionVariableCancelSleep();
2596 :
2597 8 : return state->bs_reltuples;
2598 : }
2599 :
2600 : /*
2601 : * Within leader, wait for end of heap scan and merge per-worker results.
2602 : *
2603 : * After waiting for all workers to finish, merge the per-worker results into
2604 : * the complete index. The results from each worker are sorted by block number
2605 : * (start of the page range). While combining the per-worker results we merge
2606 : * summaries for the same page range, and also fill-in empty summaries for
2607 : * ranges without any tuples.
2608 : *
2609 : * Returns the total number of heap tuples scanned.
2610 : */
2611 : static double
2612 8 : _brin_parallel_merge(BrinBuildState *state)
2613 : {
2614 : BrinTuple *btup;
2615 8 : BrinMemTuple *memtuple = NULL;
2616 : Size tuplen;
2617 8 : BlockNumber prevblkno = InvalidBlockNumber;
2618 : MemoryContext rangeCxt,
2619 : oldCxt;
2620 : double reltuples;
2621 :
2622 : /* wait for workers to scan table and produce partial results */
2623 8 : reltuples = _brin_parallel_heapscan(state);
2624 :
2625 : /* do the actual sort in the leader */
2626 8 : tuplesort_performsort(state->bs_sortstate);
2627 :
2628 : /*
2629 : * Initialize BrinMemTuple we'll use to union summaries from workers (in
2630 : * case they happened to produce parts of the same page range).
2631 : */
2632 8 : memtuple = brin_new_memtuple(state->bs_bdesc);
2633 :
2634 : /*
2635 : * Create a memory context we'll reset to combine results for a single
2636 : * page range (received from the workers). We don't expect a huge number of
2637 : * overlaps under regular circumstances (because for large tables the
2638 : * chunk size is likely larger than the BRIN page range), but it can
2639 : * happen, and the union functions may do all kinds of stuff. So we better
2640 : * reset the context once in a while.
2641 : */
2642 8 : rangeCxt = AllocSetContextCreate(CurrentMemoryContext,
2643 : "brin union",
2644 : ALLOCSET_DEFAULT_SIZES);
2645 8 : oldCxt = MemoryContextSwitchTo(rangeCxt);
2646 :
2647 : /*
2648 : * Read the BRIN tuples from the shared tuplesort, sorted by block number.
2649 : * That probably gives us an index that is cheaper to scan, thanks to
2650 : * mostly getting data from the same index page as before.
2651 : */
2652 48 : while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
2653 : {
2654 : /* Ranges should be multiples of pages_per_range for the index. */
2655 : Assert(btup->bt_blkno % state->bs_leader->brinshared->pagesPerRange == 0);
2656 :
2657 : /*
2658 : * Do we need to union summaries for the same page range?
2659 : *
2660 : * If this is the first brin tuple we read, then just deform it into
2661 : * the memtuple, and continue with the next one from tuplesort. We
2662 : * however may need to insert empty summaries into the index.
2663 : *
2664 : * If it's the same block as the last we saw, we simply union the brin
2665 : * tuple into it, and we're done - we don't even need to insert empty
2666 : * ranges, because that was done earlier when we saw the first brin
2667 : * tuple (for this range).
2668 : *
2669 : * Finally, if it's not the first brin tuple, and it's not the same
2670 : * page range, we need to do the insert and then deform the tuple into
2671 : * the memtuple. Then we'll insert empty ranges before the new brin
2672 : * tuple, if needed.
2673 : */
2674 40 : if (prevblkno == InvalidBlockNumber)
2675 : {
2676 : /* First brin tuples, just deform into memtuple. */
2677 2 : memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2678 :
2679 : /* continue to insert empty pages before thisblock */
2680 : }
2681 38 : else if (memtuple->bt_blkno == btup->bt_blkno)
2682 : {
2683 : /*
2684 : * Not the first brin tuple, but same page range as the previous
2685 : * one, so we can merge it into the memtuple.
2686 : */
2687 0 : union_tuples(state->bs_bdesc, memtuple, btup);
2688 0 : continue;
2689 : }
2690 : else
2691 : {
2692 : BrinTuple *tmp;
2693 : Size len;
2694 :
2695 : /*
2696 : * We got brin tuple for a different page range, so form a brin
2697 : * tuple from the memtuple, insert it, and re-init the memtuple
2698 : * from the new brin tuple.
2699 : */
2700 38 : tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2701 : memtuple, &len);
2702 :
2703 38 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2704 : &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2705 :
2706 : /*
2707 : * Reset the per-output-range context. This frees all the memory
2708 : * possibly allocated by the union functions, and also the BRIN
2709 : * tuple we just formed and inserted.
2710 : */
2711 38 : MemoryContextReset(rangeCxt);
2712 :
2713 38 : memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2714 :
2715 : /* continue to insert empty pages before thisblock */
2716 : }
2717 :
2718 : /* Fill empty ranges for all ranges missing in the tuplesort. */
2719 40 : brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);
2720 :
2721 40 : prevblkno = btup->bt_blkno;
2722 : }
2723 :
2724 8 : tuplesort_end(state->bs_sortstate);
2725 :
2726 : /* Fill the BRIN tuple for the last page range with data. */
2727 8 : if (prevblkno != InvalidBlockNumber)
2728 : {
2729 : BrinTuple *tmp;
2730 : Size len;
2731 :
2732 2 : tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2733 : memtuple, &len);
2734 :
2735 2 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2736 : &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2737 :
2738 2 : pfree(tmp);
2739 : }
2740 :
2741 : /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
2742 8 : brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);
2743 :
2744 : /*
2745 : * Switch back to the original memory context, and destroy the one we
2746 : * created to isolate the union_tuple calls.
2747 : */
2748 8 : MemoryContextSwitchTo(oldCxt);
2749 8 : MemoryContextDelete(rangeCxt);
2750 :
2751 8 : return reltuples;
2752 : }
2753 :
2754 : /*
2755 : * Returns size of shared memory required to store state for a parallel
2756 : * brin index build based on the snapshot its parallel scan will use.
2757 : */
2758 : static Size
2759 10 : _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
2760 : {
2761 : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
2762 10 : return add_size(BUFFERALIGN(sizeof(BrinShared)),
2763 : table_parallelscan_estimate(heap, snapshot));
2764 : }
2765 :
2766 : /*
2767 : * Within leader, participate as a parallel worker.
2768 : */
2769 : static void
2770 8 : _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
2771 : {
2772 8 : BrinLeader *brinleader = buildstate->bs_leader;
2773 : int sortmem;
2774 :
2775 : /*
2776 : * Might as well use reliable figure when doling out maintenance_work_mem
2777 : * (when requested number of workers were not launched, this will be
2778 : * somewhat higher than it is for other workers).
2779 : */
2780 8 : sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;
2781 :
2782 : /* Perform work common to all participants */
2783 8 : _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
2784 : brinleader->sharedsort, heap, index, sortmem, true);
2785 8 : }
2786 :
2787 : /*
2788 : * Perform a worker's portion of a parallel sort.
2789 : *
2790 : * This generates a tuplesort for the worker portion of the table.
2791 : *
2792 : * sortmem is the amount of working memory to use within each worker,
2793 : * expressed in KBs.
2794 : *
2795 : * When this returns, workers are done, and need only release resources.
2796 : */
2797 : static void
2798 20 : _brin_parallel_scan_and_build(BrinBuildState *state,
2799 : BrinShared *brinshared, Sharedsort *sharedsort,
2800 : Relation heap, Relation index,
2801 : int sortmem, bool progress)
2802 : {
2803 : SortCoordinate coordinate;
2804 : TableScanDesc scan;
2805 : double reltuples;
2806 : IndexInfo *indexInfo;
2807 :
2808 : /* Initialize local tuplesort coordination state */
2809 20 : coordinate = palloc0(sizeof(SortCoordinateData));
2810 20 : coordinate->isWorker = true;
2811 20 : coordinate->nParticipants = -1;
2812 20 : coordinate->sharedsort = sharedsort;
2813 :
2814 : /* Begin "partial" tuplesort */
2815 20 : state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,
2816 : TUPLESORT_NONE);
2817 :
2818 : /* Join parallel scan */
2819 20 : indexInfo = BuildIndexInfo(index);
2820 20 : indexInfo->ii_Concurrent = brinshared->isconcurrent;
2821 :
2822 20 : scan = table_beginscan_parallel(heap,
2823 : ParallelTableScanFromBrinShared(brinshared));
2824 :
2825 20 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
2826 : brinbuildCallbackParallel, state, scan);
2827 :
2828 : /* insert the last item */
2829 20 : form_and_spill_tuple(state);
2830 :
2831 : /* sort the BRIN ranges built by this worker */
2832 20 : tuplesort_performsort(state->bs_sortstate);
2833 :
2834 20 : state->bs_reltuples += reltuples;
2835 :
2836 : /*
2837 : * Done. Record ambuild statistics.
2838 : */
2839 20 : SpinLockAcquire(&brinshared->mutex);
2840 20 : brinshared->nparticipantsdone++;
2841 20 : brinshared->reltuples += state->bs_reltuples;
2842 20 : brinshared->indtuples += state->bs_numtuples;
2843 20 : SpinLockRelease(&brinshared->mutex);
2844 :
2845 : /* Notify leader */
2846 20 : ConditionVariableSignal(&brinshared->workersdonecv);
2847 :
2848 20 : tuplesort_end(state->bs_sortstate);
2849 20 : }
2850 :
2851 : /*
2852 : * Perform work within a launched parallel process.
2853 : */
2854 : void
2855 12 : _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
2856 : {
2857 : char *sharedquery;
2858 : BrinShared *brinshared;
2859 : Sharedsort *sharedsort;
2860 : BrinBuildState *buildstate;
2861 : Relation heapRel;
2862 : Relation indexRel;
2863 : LOCKMODE heapLockmode;
2864 : LOCKMODE indexLockmode;
2865 : WalUsage *walusage;
2866 : BufferUsage *bufferusage;
2867 : int sortmem;
2868 :
2869 : /*
2870 : * The only possible status flag that can be set to the parallel worker is
2871 : * PROC_IN_SAFE_IC.
2872 : */
2873 : Assert((MyProc->statusFlags == 0) ||
2874 : (MyProc->statusFlags == PROC_IN_SAFE_IC));
2875 :
2876 : /* Set debug_query_string for individual workers first */
2877 12 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
2878 12 : debug_query_string = sharedquery;
2879 :
2880 : /* Report the query string from leader */
2881 12 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
2882 :
2883 : /* Look up brin shared state */
2884 12 : brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
2885 :
2886 : /* Open relations using lock modes known to be obtained by index.c */
2887 12 : if (!brinshared->isconcurrent)
2888 : {
2889 12 : heapLockmode = ShareLock;
2890 12 : indexLockmode = AccessExclusiveLock;
2891 : }
2892 : else
2893 : {
2894 0 : heapLockmode = ShareUpdateExclusiveLock;
2895 0 : indexLockmode = RowExclusiveLock;
2896 : }
2897 :
2898 : /* Track query ID */
2899 12 : pgstat_report_query_id(brinshared->queryid, false);
2900 :
2901 : /* Open relations within worker */
2902 12 : heapRel = table_open(brinshared->heaprelid, heapLockmode);
2903 12 : indexRel = index_open(brinshared->indexrelid, indexLockmode);
2904 :
2905 12 : buildstate = initialize_brin_buildstate(indexRel, NULL,
2906 : brinshared->pagesPerRange,
2907 : InvalidBlockNumber);
2908 :
2909 : /* Look up shared state private to tuplesort.c */
2910 12 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2911 12 : tuplesort_attach_shared(sharedsort, seg);
2912 :
2913 : /* Prepare to track buffer usage during parallel execution */
2914 12 : InstrStartParallelQuery();
2915 :
2916 : /*
2917 : * Might as well use reliable figure when doling out maintenance_work_mem
2918 : * (when requested number of workers were not launched, this will be
2919 : * somewhat higher than it is for other workers).
2920 : */
2921 12 : sortmem = maintenance_work_mem / brinshared->scantuplesortstates;
2922 :
2923 12 : _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
2924 : heapRel, indexRel, sortmem, false);
2925 :
2926 : /* Report WAL/buffer usage during parallel execution */
2927 12 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2928 12 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2929 12 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
2930 12 : &walusage[ParallelWorkerNumber]);
2931 :
2932 12 : index_close(indexRel, indexLockmode);
2933 12 : table_close(heapRel, heapLockmode);
2934 12 : }
2935 :
2936 : /*
2937 : * brin_build_empty_tuple
2938 : * Maybe initialize a BRIN tuple representing empty range.
2939 : *
2940 : * Returns a BRIN tuple representing an empty page range starting at the
2941 : * specified block number. The empty tuple is initialized only once, when it's
2942 : * needed for the first time, stored in the memory context bs_context to ensure
2943 : * proper life span, and reused on following calls. All empty tuples are
2944 : * exactly the same except for the bt_blkno field, which is set to the value
2945 : * in blkno parameter.
2946 : */
2947 : static void
2948 20 : brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
2949 : {
2950 : /* First time an empty tuple is requested? If yes, initialize it. */
2951 20 : if (state->bs_emptyTuple == NULL)
2952 : {
2953 : MemoryContext oldcxt;
2954 10 : BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);
2955 :
2956 : /* Allocate the tuple in context for the whole index build. */
2957 10 : oldcxt = MemoryContextSwitchTo(state->bs_context);
2958 :
2959 10 : state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
2960 : &state->bs_emptyTupleLen);
2961 :
2962 10 : MemoryContextSwitchTo(oldcxt);
2963 : }
2964 : else
2965 : {
2966 : /* If we already have an empty tuple, just update the block. */
2967 10 : state->bs_emptyTuple->bt_blkno = blkno;
2968 : }
2969 20 : }
2970 :
2971 : /*
2972 : * brin_fill_empty_ranges
2973 : * Add BRIN index tuples representing empty page ranges.
2974 : *
2975 : * prevRange/nextRange determine for which page ranges to add empty summaries.
2976 : * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
2977 : * (prevRange < blkno < nextRange) will be added to the index.
2978 : *
2979 : * If prevRange is InvalidBlockNumber, this means there was no previous page
2980 : * range (i.e. the first empty range to add is for blkno=0).
2981 : *
2982 : * The empty tuple is built only once, and then reused for all future calls.
2983 : */
2984 : static void
2985 384 : brin_fill_empty_ranges(BrinBuildState *state,
2986 : BlockNumber prevRange, BlockNumber nextRange)
2987 : {
2988 : BlockNumber blkno;
2989 :
2990 : /*
2991 : * If we already summarized some ranges, we need to start with the next
2992 : * one. Otherwise start from the first range of the table.
2993 : */
2994 384 : blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
2995 :
2996 : /* Generate empty ranges until we hit the next non-empty range. */
2997 404 : while (blkno < nextRange)
2998 : {
2999 : /* Did we already build the empty tuple? If not, do it now. */
3000 20 : brin_build_empty_tuple(state, blkno);
3001 :
3002 20 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
3003 : &state->bs_currentInsertBuf,
3004 : blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
3005 :
3006 : /* try next page range */
3007 20 : blkno += state->bs_pagesPerRange;
3008 : }
3009 384 : }
|