Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "access/stratnum.h"
24 : #include "commands/progress.h"
25 : #include "commands/vacuum.h"
26 : #include "nodes/execnodes.h"
27 : #include "pgstat.h"
28 : #include "storage/bulk_write.h"
29 : #include "storage/condition_variable.h"
30 : #include "storage/indexfsm.h"
31 : #include "storage/ipc.h"
32 : #include "storage/lmgr.h"
33 : #include "storage/read_stream.h"
34 : #include "utils/datum.h"
35 : #include "utils/fmgrprotos.h"
36 : #include "utils/index_selfuncs.h"
37 : #include "utils/memutils.h"
38 :
39 :
40 : /*
41 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42 : *
43 : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44 : * scan to advance it via another call to _bt_first.
45 : *
46 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47 : * a new page; others must wait.
48 : *
49 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50 : * to a new page; some process can start doing that.
51 : *
52 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53 : */
54 : typedef enum
55 : {
56 : BTPARALLEL_NOT_INITIALIZED,
57 : BTPARALLEL_NEED_PRIMSCAN,
58 : BTPARALLEL_ADVANCING,
59 : BTPARALLEL_IDLE,
60 : BTPARALLEL_DONE,
61 : } BTPS_State;
62 :
63 : /*
64 : * BTParallelScanDescData contains btree-specific shared information required
65 : * for a parallel scan.
66 : */
67 : typedef struct BTParallelScanDescData
68 : {
69 : BlockNumber btps_nextScanPage; /* next page to be scanned */
70 : BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into
71 : * btps_nextScanPage */
72 : BTPS_State btps_pageStatus; /* indicates whether next page is
73 : * available for scan. see above for
74 : * possible states of parallel scan. */
75 : LWLock btps_lock; /* protects shared parallel state */
76 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
77 :
78 : /*
79 : * btps_arrElems is used when scans need to schedule another primitive
80 : * index scan with one or more SAOP arrays. Holds BTArrayKeyInfo.cur_elem
81 : * offsets for each = scan key associated with a ScalarArrayOp array.
82 : */
83 : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
84 :
85 : /*
86 : * Additional space (at the end of the struct) is used when scans need to
87 : * schedule another primitive index scan with one or more skip arrays.
88 : * Holds a flattened datum representation for each = scan key associated
89 : * with a skip array.
90 : */
91 : } BTParallelScanDescData;
92 :
93 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
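
/*
 * Editor's sketch, not part of nbtree.c: a minimal illustration of how a
 * backend consults the BTPS_State machine described above.  The LWLock
 * calls are the real APIs used elsewhere in this file; the helper name
 * sketch_try_claim_scan is hypothetical, and the NEED_PRIMSCAN state is
 * deliberately simplified (only callers arriving from _bt_first may act on
 * it).  _bt_parallel_seize(), later in this file, is the real version.
 */
static bool
sketch_try_claim_scan(BTParallelScanDesc btscan, bool *scan_done)
{
	bool		claimed = false;

	*scan_done = false;

	LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
	switch (btscan->btps_pageStatus)
	{
		case BTPARALLEL_DONE:
			/* scan finished (or failed); nothing left to claim */
			*scan_done = true;
			break;
		case BTPARALLEL_ADVANCING:
			/* another backend owns the scan; caller sleeps on btps_cv */
			break;
		default:
			/* NOT_INITIALIZED, IDLE, or NEED_PRIMSCAN: take ownership */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			claimed = true;
			break;
	}
	LWLockRelease(&btscan->btps_lock);

	return claimed;
}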
94 :
95 :
96 : static bool _bt_start_prim_scan(IndexScanDesc scan);
97 : static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
98 : BTScanOpaque so);
99 : static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
100 : BTScanOpaque so);
101 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
102 : IndexBulkDeleteCallback callback, void *callback_state,
103 : BTCycleId cycleid);
104 : static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
105 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
106 : IndexTuple posting,
107 : OffsetNumber updatedoffset,
108 : int *nremaining);
109 :
110 :
111 : /*
112 : * Btree handler function: return IndexAmRoutine with access method parameters
113 : * and callbacks.
114 : */
115 : Datum
116 3551206 : bthandler(PG_FUNCTION_ARGS)
117 : {
118 3551206 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
119 :
120 3551206 : amroutine->amstrategies = BTMaxStrategyNumber;
121 3551206 : amroutine->amsupport = BTNProcs;
122 3551206 : amroutine->amoptsprocnum = BTOPTIONS_PROC;
123 3551206 : amroutine->amcanorder = true;
124 3551206 : amroutine->amcanorderbyop = false;
125 3551206 : amroutine->amcanhash = false;
126 3551206 : amroutine->amconsistentequality = true;
127 3551206 : amroutine->amconsistentordering = true;
128 3551206 : amroutine->amcanbackward = true;
129 3551206 : amroutine->amcanunique = true;
130 3551206 : amroutine->amcanmulticol = true;
131 3551206 : amroutine->amoptionalkey = true;
132 3551206 : amroutine->amsearcharray = true;
133 3551206 : amroutine->amsearchnulls = true;
134 3551206 : amroutine->amstorage = false;
135 3551206 : amroutine->amclusterable = true;
136 3551206 : amroutine->ampredlocks = true;
137 3551206 : amroutine->amcanparallel = true;
138 3551206 : amroutine->amcanbuildparallel = true;
139 3551206 : amroutine->amcaninclude = true;
140 3551206 : amroutine->amusemaintenanceworkmem = false;
141 3551206 : amroutine->amsummarizing = false;
142 3551206 : amroutine->amparallelvacuumoptions =
143 : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
144 3551206 : amroutine->amkeytype = InvalidOid;
145 :
146 3551206 : amroutine->ambuild = btbuild;
147 3551206 : amroutine->ambuildempty = btbuildempty;
148 3551206 : amroutine->aminsert = btinsert;
149 3551206 : amroutine->aminsertcleanup = NULL;
150 3551206 : amroutine->ambulkdelete = btbulkdelete;
151 3551206 : amroutine->amvacuumcleanup = btvacuumcleanup;
152 3551206 : amroutine->amcanreturn = btcanreturn;
153 3551206 : amroutine->amcostestimate = btcostestimate;
154 3551206 : amroutine->amgettreeheight = btgettreeheight;
155 3551206 : amroutine->amoptions = btoptions;
156 3551206 : amroutine->amproperty = btproperty;
157 3551206 : amroutine->ambuildphasename = btbuildphasename;
158 3551206 : amroutine->amvalidate = btvalidate;
159 3551206 : amroutine->amadjustmembers = btadjustmembers;
160 3551206 : amroutine->ambeginscan = btbeginscan;
161 3551206 : amroutine->amrescan = btrescan;
162 3551206 : amroutine->amgettuple = btgettuple;
163 3551206 : amroutine->amgetbitmap = btgetbitmap;
164 3551206 : amroutine->amendscan = btendscan;
165 3551206 : amroutine->ammarkpos = btmarkpos;
166 3551206 : amroutine->amrestrpos = btrestrpos;
167 3551206 : amroutine->amestimateparallelscan = btestimateparallelscan;
168 3551206 : amroutine->aminitparallelscan = btinitparallelscan;
169 3551206 : amroutine->amparallelrescan = btparallelrescan;
170 3551206 : amroutine->amtranslatestrategy = bttranslatestrategy;
171 3551206 : amroutine->amtranslatecmptype = bttranslatecmptype;
172 :
173 3551206 : PG_RETURN_POINTER(amroutine);
174 : }
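
/*
 * Editor's sketch, not part of nbtree.c: how callers typically obtain the
 * IndexAmRoutine that bthandler() builds.  Core code looks up the access
 * method's handler in pg_am and invokes it through GetIndexAmRoutineByAmId()
 * (declared in access/amapi.h); BTREE_AM_OID comes from catalog/pg_am.h.
 * The helper name below is hypothetical.
 */
static bool
sketch_btree_supports_backward_scans(void)
{
	IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(BTREE_AM_OID, false);
	bool		can_backward = amroutine->amcanbackward;

	pfree(amroutine);
	return can_backward;
}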
175 :
176 : /*
177 : * btbuildempty() -- build an empty btree index in the initialization fork
178 : */
179 : void
180 188 : btbuildempty(Relation index)
181 : {
182 188 : bool allequalimage = _bt_allequalimage(index, false);
183 : BulkWriteState *bulkstate;
184 : BulkWriteBuffer metabuf;
185 :
186 188 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
187 :
188 : /* Construct metapage. */
189 188 : metabuf = smgr_bulk_get_buf(bulkstate);
190 188 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
191 188 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
192 :
193 188 : smgr_bulk_finish(bulkstate);
194 188 : }
195 :
196 : /*
197 : * btinsert() -- insert an index tuple into a btree.
198 : *
199 : * Descend the tree recursively, find the appropriate location for our
200 : * new tuple, and put it there.
201 : */
202 : bool
203 7264012 : btinsert(Relation rel, Datum *values, bool *isnull,
204 : ItemPointer ht_ctid, Relation heapRel,
205 : IndexUniqueCheck checkUnique,
206 : bool indexUnchanged,
207 : IndexInfo *indexInfo)
208 : {
209 : bool result;
210 : IndexTuple itup;
211 :
212 : /* generate an index tuple */
213 7264012 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
214 7264012 : itup->t_tid = *ht_ctid;
215 :
216 7264012 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
217 :
218 7263476 : pfree(itup);
219 :
220 7263476 : return result;
221 : }
222 :
223 : /*
224 : * btgettuple() -- Get the next tuple in the scan.
225 : */
226 : bool
227 35822728 : btgettuple(IndexScanDesc scan, ScanDirection dir)
228 : {
229 35822728 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
230 : bool res;
231 :
232 : Assert(scan->heapRelation != NULL);
233 :
234 : /* btree indexes are never lossy */
235 35822728 : scan->xs_recheck = false;
236 :
237 : /* Each loop iteration performs another primitive index scan */
238 : do
239 : {
240 : /*
241 : * If we've already initialized this scan, we can just advance it in
242 : * the appropriate direction. If we haven't done so yet, we call
243 : * _bt_first() to get the first item in the scan.
244 : */
245 35839640 : if (!BTScanPosIsValid(so->currPos))
246 16386116 : res = _bt_first(scan, dir);
247 : else
248 : {
249 : /*
250 : * Check to see if we should kill the previously-fetched tuple.
251 : */
252 19453524 : if (scan->kill_prior_tuple)
253 : {
254 : /*
255 : * Yes, remember it for later. (We'll deal with all such
256 : * tuples at once right before leaving the index page.) The
257 : * test for numKilled overrun is not just paranoia: if the
258 : * caller reverses direction in the indexscan then the same
259 : * item might get entered multiple times. It's not worth
260 : * trying to optimize that, so we don't detect it, but instead
261 : * just forget any excess entries.
262 : */
263 475588 : if (so->killedItems == NULL)
264 169436 : so->killedItems = (int *)
265 169436 : palloc(MaxTIDsPerBTreePage * sizeof(int));
266 475588 : if (so->numKilled < MaxTIDsPerBTreePage)
267 475588 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
268 : }
269 :
270 : /*
271 : * Now continue the scan.
272 : */
273 19453524 : res = _bt_next(scan, dir);
274 : }
275 :
276 : /* If we have a tuple, return it ... */
277 35839636 : if (res)
278 28716380 : break;
279 : /* ... otherwise see if we need another primitive index scan */
280 7123256 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
281 :
282 35822724 : return res;
283 : }
284 :
285 : /*
286 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
287 : */
288 : int64
289 13132 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
290 : {
291 13132 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
292 13132 : int64 ntids = 0;
293 : ItemPointer heapTid;
294 :
295 : Assert(scan->heapRelation == NULL);
296 :
297 : /* Each loop iteration performs another primitive index scan */
298 : do
299 : {
300 : /* Fetch the first page & tuple */
301 13764 : if (_bt_first(scan, ForwardScanDirection))
302 : {
303 : /* Save tuple ID, and continue scanning */
304 9726 : heapTid = &scan->xs_heaptid;
305 9726 : tbm_add_tuples(tbm, heapTid, 1, false);
306 9726 : ntids++;
307 :
308 : for (;;)
309 : {
310 : /*
311 : * Advance to next tuple within page. This is the same as the
312 : * easy case in _bt_next().
313 : */
314 1986314 : if (++so->currPos.itemIndex > so->currPos.lastItem)
315 : {
316 : /* let _bt_next do the heavy lifting */
317 15296 : if (!_bt_next(scan, ForwardScanDirection))
318 9726 : break;
319 : }
320 :
321 : /* Save tuple ID, and continue scanning */
322 1976588 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
323 1976588 : tbm_add_tuples(tbm, heapTid, 1, false);
324 1976588 : ntids++;
325 : }
326 : }
327 : /* Now see if we need another primitive index scan */
328 13764 : } while (so->numArrayKeys && _bt_start_prim_scan(scan));
329 :
330 13132 : return ntids;
331 : }
332 :
333 : /*
334 : * btbeginscan() -- start a scan on a btree index
335 : */
336 : IndexScanDesc
337 15747236 : btbeginscan(Relation rel, int nkeys, int norderbys)
338 : {
339 : IndexScanDesc scan;
340 : BTScanOpaque so;
341 :
342 : /* no order by operators allowed */
343 : Assert(norderbys == 0);
344 :
345 : /* get the scan */
346 15747236 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
347 :
348 : /* allocate private workspace */
349 15747236 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
350 15747236 : BTScanPosInvalidate(so->currPos);
351 15747236 : BTScanPosInvalidate(so->markPos);
352 15747236 : if (scan->numberOfKeys > 0)
353 15734146 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
354 : else
355 13090 : so->keyData = NULL;
356 :
357 15747236 : so->skipScan = false;
358 15747236 : so->needPrimScan = false;
359 15747236 : so->scanBehind = false;
360 15747236 : so->oppositeDirCheck = false;
361 15747236 : so->arrayKeys = NULL;
362 15747236 : so->orderProcs = NULL;
363 15747236 : so->arrayContext = NULL;
364 :
365 15747236 : so->killedItems = NULL; /* until needed */
366 15747236 : so->numKilled = 0;
367 :
368 : /*
369 : * We don't know yet whether the scan will be index-only, so we do not
370 : * allocate the tuple workspace arrays until btrescan. However, we set up
371 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
372 : */
373 15747236 : so->currTuples = so->markTuples = NULL;
374 :
375 15747236 : scan->xs_itupdesc = RelationGetDescr(rel);
376 :
377 15747236 : scan->opaque = so;
378 :
379 15747236 : return scan;
380 : }
381 :
382 : /*
383 : * btrescan() -- rescan an index relation
384 : */
385 : void
386 16383664 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
387 : ScanKey orderbys, int norderbys)
388 : {
389 16383664 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
390 :
391 : /* we aren't holding any read locks, but gotta drop the pins */
392 16383664 : if (BTScanPosIsValid(so->currPos))
393 : {
394 : /* Before leaving current page, deal with any killed items */
395 88876 : if (so->numKilled > 0)
396 1350 : _bt_killitems(scan);
397 88876 : BTScanPosUnpinIfPinned(so->currPos);
398 88876 : BTScanPosInvalidate(so->currPos);
399 : }
400 :
401 : /*
402 : * We prefer to eagerly drop leaf page pins before btgettuple returns.
403 : * This avoids making VACUUM wait to acquire a cleanup lock on the page.
404 : *
405 : * We cannot safely drop leaf page pins during index-only scans due to a
406 : * race condition involving VACUUM setting pages all-visible in the VM.
407 : * It's also unsafe for plain index scans that use a non-MVCC snapshot.
408 : *
409 : * When we drop pins eagerly, the mechanism that marks so->killedItems[]
410 : * index tuples LP_DEAD has to deal with concurrent TID recycling races.
411 : * The scheme used to detect unsafe TID recycling won't work when scanning
412 : * unlogged relations (since it involves saving an affected page's LSN).
413 : * Opt out of eager pin dropping during unlogged relation scans for now
414 : * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
415 : *
416 : * Also opt out of dropping leaf page pins eagerly during bitmap scans.
417 : * Pins cannot be held for more than an instant during bitmap scans either
418 : * way, so we might as well avoid wasting cycles on acquiring page LSNs.
419 : *
420 : * See nbtree/README section on making concurrent TID recycling safe.
421 : *
422 : * Note: so->dropPin should never change across rescans.
423 : */
424 48795512 : so->dropPin = (!scan->xs_want_itup &&
425 16028184 : IsMVCCSnapshot(scan->xs_snapshot) &&
426 47245500 : RelationNeedsWAL(scan->indexRelation) &&
427 14833652 : scan->heapRelation != NULL);
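
	/*
	 * Editor's note, summarizing the rule above: so->dropPin ends up true
	 * only when all four conditions hold -- a plain (not index-only) scan,
	 * an MVCC snapshot, a WAL-logged (not unlogged/temp) index, and a heap
	 * relation (bitmap scans pass heapRelation == NULL).
	 */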
428 :
429 16383664 : so->markItemIndex = -1;
430 16383664 : so->needPrimScan = false;
431 16383664 : so->scanBehind = false;
432 16383664 : so->oppositeDirCheck = false;
433 16383664 : BTScanPosUnpinIfPinned(so->markPos);
434 16383664 : BTScanPosInvalidate(so->markPos);
435 :
436 : /*
437 : * Allocate tuple workspace arrays, if needed for an index-only scan and
438 : * not already done in a previous rescan call. To save on palloc
439 : * overhead, both workspaces are allocated as one palloc block; only this
440 : * function and btendscan know that.
441 : *
442 : * NOTE: this data structure also makes it safe to return data from a
443 : * "name" column, even though btree name_ops uses an underlying storage
444 : * datatype of cstring. The risk there is that "name" is supposed to be
445 : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
446 : * However, since we only return data out of tuples sitting in the
447 : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
448 : * data out of the markTuples array --- running off the end of memory for
449 : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
450 : * adding special-case treatment for name_ops elsewhere.
451 : */
452 16383664 : if (scan->xs_want_itup && so->currTuples == NULL)
453 : {
454 135620 : so->currTuples = (char *) palloc(BLCKSZ * 2);
455 135620 : so->markTuples = so->currTuples + BLCKSZ;
456 : }
457 :
458 : /*
459 : * Reset the scan keys
460 : */
461 16383664 : if (scankey && scan->numberOfKeys > 0)
462 16370352 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
463 16383664 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
464 16383664 : so->numArrayKeys = 0; /* ditto */
465 16383664 : }
466 :
467 : /*
468 : * btendscan() -- close down a scan
469 : */
470 : void
471 15745612 : btendscan(IndexScanDesc scan)
472 : {
473 15745612 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
474 :
475 : /* we aren't holding any read locks, but gotta drop the pins */
476 15745612 : if (BTScanPosIsValid(so->currPos))
477 : {
478 : /* Before leaving current page, deal with any killed items */
479 9172824 : if (so->numKilled > 0)
480 88592 : _bt_killitems(scan);
481 9172824 : BTScanPosUnpinIfPinned(so->currPos);
482 : }
483 :
484 15745612 : so->markItemIndex = -1;
485 15745612 : BTScanPosUnpinIfPinned(so->markPos);
486 :
487 : /* No need to invalidate positions, the RAM is about to be freed. */
488 :
489 : /* Release storage */
490 15745612 : if (so->keyData != NULL)
491 15732552 : pfree(so->keyData);
492 : /* so->arrayKeys and so->orderProcs are in arrayContext */
493 15745612 : if (so->arrayContext != NULL)
494 4838 : MemoryContextDelete(so->arrayContext);
495 15745612 : if (so->killedItems != NULL)
496 169368 : pfree(so->killedItems);
497 15745612 : if (so->currTuples != NULL)
498 135576 : pfree(so->currTuples);
499 : /* so->markTuples should not be pfree'd, see btrescan */
500 15745612 : pfree(so);
501 15745612 : }
502 :
503 : /*
504 : * btmarkpos() -- save current scan position
505 : */
506 : void
507 130148 : btmarkpos(IndexScanDesc scan)
508 : {
509 130148 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
510 :
511 : /* There may be an old mark with a pin (but no lock). */
512 130148 : BTScanPosUnpinIfPinned(so->markPos);
513 :
514 : /*
515 : * Just record the current itemIndex. If we later step to next page
516 : * before releasing the marked position, _bt_steppage makes a full copy of
517 : * the currPos struct in markPos. If (as often happens) the mark is moved
518 : * before we leave the page, we don't have to do that work.
519 : */
520 130148 : if (BTScanPosIsValid(so->currPos))
521 130148 : so->markItemIndex = so->currPos.itemIndex;
522 : else
523 : {
524 0 : BTScanPosInvalidate(so->markPos);
525 0 : so->markItemIndex = -1;
526 : }
527 130148 : }
528 :
529 : /*
530 : * btrestrpos() -- restore scan to last saved position
531 : */
532 : void
533 54042 : btrestrpos(IndexScanDesc scan)
534 : {
535 54042 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
536 :
537 54042 : if (so->markItemIndex >= 0)
538 : {
539 : /*
540 : * The scan has never moved to a new page since the last mark. Just
541 : * restore the itemIndex.
542 : *
543 : * NB: In this case we can't count on anything in so->markPos to be
544 : * accurate.
545 : */
546 53934 : so->currPos.itemIndex = so->markItemIndex;
547 : }
548 : else
549 : {
550 : /*
551 : * The scan moved to a new page after last mark or restore, and we are
552 : * now restoring to the marked page. We aren't holding any read
553 : * locks, but if we're still holding the pin for the current position,
554 : * we must drop it.
555 : */
556 108 : if (BTScanPosIsValid(so->currPos))
557 : {
558 : /* Before leaving current page, deal with any killed items */
559 108 : if (so->numKilled > 0)
560 0 : _bt_killitems(scan);
561 108 : BTScanPosUnpinIfPinned(so->currPos);
562 : }
563 :
564 108 : if (BTScanPosIsValid(so->markPos))
565 : {
566 : /* bump pin on mark buffer for assignment to current buffer */
567 108 : if (BTScanPosIsPinned(so->markPos))
568 0 : IncrBufferRefCount(so->markPos.buf);
569 108 : memcpy(&so->currPos, &so->markPos,
570 : offsetof(BTScanPosData, items[1]) +
571 108 : so->markPos.lastItem * sizeof(BTScanPosItem));
572 108 : if (so->currTuples)
573 0 : memcpy(so->currTuples, so->markTuples,
574 0 : so->markPos.nextTupleOffset);
575 : /* Reset the scan's array keys (see _bt_steppage for why) */
576 108 : if (so->numArrayKeys)
577 : {
578 0 : _bt_start_array_keys(scan, so->currPos.dir);
579 0 : so->needPrimScan = false;
580 : }
581 : }
582 : else
583 0 : BTScanPosInvalidate(so->currPos);
584 : }
585 54042 : }
586 :
587 : /*
588 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
589 : */
590 : Size
591 64 : btestimateparallelscan(Relation rel, int nkeys, int norderbys)
592 : {
593 64 : int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
594 : Size estnbtreeshared,
595 : genericattrspace;
596 :
597 : /*
598 : * Pessimistically assume that every input scan key will be output with
599 : * its own SAOP array
600 : */
601 64 : estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
602 : sizeof(int) * nkeys;
603 :
604 : /* Single column indexes cannot possibly use a skip array */
605 64 : if (nkeyatts == 1)
606 46 : return estnbtreeshared;
607 :
608 : /*
609 : * Pessimistically assume that all attributes prior to the least
610 : * significant attribute require a skip array (and an associated key)
611 : */
612 18 : genericattrspace = datumEstimateSpace((Datum) 0, false, true,
613 : sizeof(Datum));
614 36 : for (int attnum = 1; attnum < nkeyatts; attnum++)
615 : {
616 : CompactAttribute *attr;
617 :
618 : /*
619 : * We make the conservative assumption that every index column will
620 : * also require a skip array.
621 : *
622 : * Every skip array must have space to store its scan key's sk_flags.
623 : */
624 18 : estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
625 :
626 : /* Consider space required to store a datum of opclass input type */
627 18 : attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
628 18 : if (attr->attbyval)
629 18 : {
630 : /* This index attribute stores pass-by-value datums */
631 18 : Size estfixed = datumEstimateSpace((Datum) 0, false,
632 18 : true, attr->attlen);
633 :
634 18 : estnbtreeshared = add_size(estnbtreeshared, estfixed);
635 18 : continue;
636 : }
637 :
638 : /*
639 : * This index attribute stores pass-by-reference datums.
640 : *
641 : * Assume that serializing this array will use just as much space as a
642 : * pass-by-value datum, in addition to space for the largest possible
643 : * whole index tuple (this is not just a per-datum portion of the
644 : * largest possible tuple because that'd be almost as large anyway).
645 : *
646 : * This is quite conservative, but it's not clear how we could do much
647 : * better. The executor requires an up-front storage request size
648 : * that reliably covers the scan's high watermark memory usage. We
649 : * can't be sure of the real high watermark until the scan is over.
650 : */
651 0 : estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
652 0 : estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
653 : }
654 :
655 18 : return estnbtreeshared;
656 : }
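
/*
 * Editor's worked example, not part of nbtree.c: for a two-column index
 * scanned with two input scan keys, where the leading attribute is
 * pass-by-value, the estimate above amounts to
 *
 *     offsetof(BTParallelScanDescData, btps_arrElems)      fixed header
 *   + 2 * sizeof(int)                                      SAOP cur_elem slots
 *   + sizeof(int)                                          skip array sk_flags
 *   + datumEstimateSpace((Datum) 0, false, true, attlen)   fixed-width datum
 *
 * i.e. a few dozen bytes on a typical 64-bit build.  A pass-by-reference
 * leading attribute instead reserves genericattrspace plus BTMaxItemSize,
 * which is why multi-column text indexes request far more shared space.
 */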
657 :
658 : /*
659 : * _bt_start_prim_scan() -- start scheduled primitive index scan?
660 : *
661 : * Returns true if _bt_checkkeys scheduled another primitive index scan, just
662 : * as the last one ended. Otherwise returns false, indicating that the array
663 : * keys are now fully exhausted.
664 : *
665 : * Only call here during scans with one or more equality type array scan keys,
666 : * after _bt_first or _bt_next return false.
667 : */
668 : static bool
669 88772 : _bt_start_prim_scan(IndexScanDesc scan)
670 : {
671 88772 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
672 :
673 : Assert(so->numArrayKeys);
674 :
675 88772 : so->scanBehind = so->oppositeDirCheck = false; /* reset */
676 :
677 : /*
678 : * Array keys are advanced within _bt_checkkeys when the scan reaches the
679 : * leaf level (more precisely, they're advanced when the scan reaches the
680 : * end of each distinct set of array elements). This process avoids
681 : * repeat access to leaf pages (across multiple primitive index scans) by
682 : * advancing the scan's array keys when it allows the primitive index scan
683 : * to find nearby matching tuples (or when it eliminates ranges of array
684 : * key space that can't possibly be satisfied by any index tuple).
685 : *
686 : * _bt_checkkeys sets a simple flag variable to schedule another primitive
687 : * index scan. The flag tells us what to do.
688 : *
689 : * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
690 : * various cases where that won't happen. For example, if the index is
691 : * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
692 : * We also don't expect a call to _bt_checkkeys during searches for a
693 : * non-existent value that happens to be lower/higher than any existing
694 : * value in the index.
695 : *
696 : * We don't require special handling for these cases -- we don't need to
697 : * be explicitly instructed to _not_ perform another primitive index scan.
698 : * It's up to code under the control of _bt_first to always set the flag
699 : * when another primitive index scan will be required.
700 : *
701 : * This works correctly, even with the tricky cases listed above, which
702 : * all involve access to leaf pages "near the boundaries of the key space"
703 : * (whether it's from a leftmost/rightmost page, or an imaginary empty
704 : * leaf root page). If _bt_checkkeys cannot be reached by a primitive
705 : * index scan for one set of array keys, then it also won't be reached for
706 : * any later set ("later" in terms of the direction that we scan the index
707 : * and advance the arrays). The array keys won't have advanced in these
708 : * cases, but that's the correct behavior (even _bt_advance_array_keys
709 : * won't always advance the arrays at the point they become "exhausted").
710 : */
711 88772 : if (so->needPrimScan)
712 : {
713 : /*
714 : * Flag was set -- must call _bt_first again, which will reset the
715 : * scan's needPrimScan flag
716 : */
717 17544 : return true;
718 : }
719 :
720 : /* The top-level index scan ran out of tuples in this scan direction */
721 71228 : if (scan->parallel_scan != NULL)
722 30 : _bt_parallel_done(scan);
723 :
724 71228 : return false;
725 : }
726 :
727 : /*
728 : * _bt_parallel_serialize_arrays() -- Serialize parallel array state.
729 : *
730 : * Caller must have exclusively locked btscan->btps_lock when called.
731 : */
732 : static void
733 36 : _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan,
734 : BTScanOpaque so)
735 : {
736 : char *datumshared;
737 :
738 : /* Space for serialized datums begins immediately after btps_arrElems[] */
739 36 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
740 72 : for (int i = 0; i < so->numArrayKeys; i++)
741 : {
742 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
743 36 : ScanKey skey = &so->keyData[array->scan_key];
744 :
745 36 : if (array->num_elems != -1)
746 : {
747 : /* Save SAOP array's cur_elem (no need to copy key/datum) */
748 : Assert(!(skey->sk_flags & SK_BT_SKIP));
749 36 : btscan->btps_arrElems[i] = array->cur_elem;
750 36 : continue;
751 : }
752 :
753 : /* Save all mutable state associated with skip array's key */
754 : Assert(skey->sk_flags & SK_BT_SKIP);
755 0 : memcpy(datumshared, &skey->sk_flags, sizeof(int));
756 0 : datumshared += sizeof(int);
757 :
758 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
759 : {
760 : /* No sk_argument datum to serialize */
761 : Assert(skey->sk_argument == 0);
762 0 : continue;
763 : }
764 :
765 0 : datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
766 0 : array->attbyval, array->attlen, &datumshared);
767 : }
768 36 : }
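
/*
 * Editor's sketch of the shared-memory layout written above, not part of
 * nbtree.c.  For a scan with numArrayKeys = 3 where keys 0 and 2 are SAOP
 * arrays and key 1 is a skip array on a pass-by-value attribute:
 *
 *   btps_arrElems[0]   int     cur_elem for SAOP key 0
 *   btps_arrElems[1]   int     unused (skip arrays don't use this slot)
 *   btps_arrElems[2]   int     cur_elem for SAOP key 2
 *   datumshared -----> int     sk_flags for skip array key 1
 *                      ...     sk_argument in datumSerialize() format,
 *                              omitted when SK_BT_MINVAL/SK_BT_MAXVAL is set
 *
 * _bt_parallel_restore_arrays() below walks the exact same layout, in the
 * same key order.
 */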
769 :
770 : /*
771 : * _bt_parallel_restore_arrays() -- Restore serialized parallel array state.
772 : *
773 : * Caller must have exclusively locked btscan->btps_lock when called.
774 : */
775 : static void
776 36 : _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan,
777 : BTScanOpaque so)
778 : {
779 : char *datumshared;
780 :
781 : /* Space for serialized datums begins immediately after btps_arrElems[] */
782 36 : datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
783 72 : for (int i = 0; i < so->numArrayKeys; i++)
784 : {
785 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
786 36 : ScanKey skey = &so->keyData[array->scan_key];
787 : bool isnull;
788 :
789 36 : if (array->num_elems != -1)
790 : {
791 : /* Restore SAOP array using its saved cur_elem */
792 : Assert(!(skey->sk_flags & SK_BT_SKIP));
793 36 : array->cur_elem = btscan->btps_arrElems[i];
794 36 : skey->sk_argument = array->elem_values[array->cur_elem];
795 36 : continue;
796 : }
797 :
798 : /* Restore skip array by restoring its key directly */
799 0 : if (!array->attbyval && skey->sk_argument)
800 0 : pfree(DatumGetPointer(skey->sk_argument));
801 0 : skey->sk_argument = (Datum) 0;
802 0 : memcpy(&skey->sk_flags, datumshared, sizeof(int));
803 0 : datumshared += sizeof(int);
804 :
805 : Assert(skey->sk_flags & SK_BT_SKIP);
806 :
807 0 : if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
808 : {
809 : /* No sk_argument datum to restore */
810 0 : continue;
811 : }
812 :
813 0 : skey->sk_argument = datumRestore(&datumshared, &isnull);
814 0 : if (isnull)
815 : {
816 : Assert(skey->sk_argument == 0);
817 : Assert(skey->sk_flags & SK_SEARCHNULL);
818 : Assert(skey->sk_flags & SK_ISNULL);
819 : }
820 : }
821 36 : }
822 :
823 : /*
824 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
825 : */
826 : void
827 64 : btinitparallelscan(void *target)
828 : {
829 64 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
830 :
831 64 : LWLockInitialize(&bt_target->btps_lock,
832 : LWTRANCHE_PARALLEL_BTREE_SCAN);
833 64 : bt_target->btps_nextScanPage = InvalidBlockNumber;
834 64 : bt_target->btps_lastCurrPage = InvalidBlockNumber;
835 64 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
836 64 : ConditionVariableInit(&bt_target->btps_cv);
837 64 : }
838 :
839 : /*
840 : * btparallelrescan() -- reset parallel scan
841 : */
842 : void
843 24 : btparallelrescan(IndexScanDesc scan)
844 : {
845 : BTParallelScanDesc btscan;
846 24 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
847 :
848 : Assert(parallel_scan);
849 :
850 24 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
851 : parallel_scan->ps_offset_am);
852 :
853 : /*
854 : * In theory, we don't need to acquire the LWLock here, because there
855 : * shouldn't be any other workers running at this point, but we do so for
856 : * consistency.
857 : */
858 24 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
859 24 : btscan->btps_nextScanPage = InvalidBlockNumber;
860 24 : btscan->btps_lastCurrPage = InvalidBlockNumber;
861 24 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
862 24 : LWLockRelease(&btscan->btps_lock);
863 24 : }
864 :
865 : /*
866 : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
867 : * page. Other scans must wait until we call _bt_parallel_release()
868 : * or _bt_parallel_done().
869 : *
870 : * The return value is true if we successfully seized the scan and false
871 : * if we did not. The latter case occurs when no pages remain, or when
872 : * another primitive index scan is scheduled that caller's backend cannot
873 : * start just yet (only backends that call from _bt_first are capable of
874 : * starting primitive index scans, which they indicate by passing first=true).
875 : *
876 : * If the return value is true, *next_scan_page returns the next page of the
877 : * scan, and *last_curr_page returns the page that *next_scan_page came from.
878 : * An invalid *next_scan_page means the scan hasn't yet started, or that
879 : * caller needs to start the next primitive index scan (if it's the latter
880 : * case we'll set so.needPrimScan).
881 : *
882 : * Callers should ignore the value of *next_scan_page and *last_curr_page if
883 : * the return value is false.
884 : */
885 : bool
886 1658 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page,
887 : BlockNumber *last_curr_page, bool first)
888 : {
889 1658 : Relation rel = scan->indexRelation;
890 1658 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
891 1658 : bool exit_loop = false,
892 1658 : status = true,
893 1658 : endscan = false;
894 1658 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
895 : BTParallelScanDesc btscan;
896 :
897 1658 : *next_scan_page = InvalidBlockNumber;
898 1658 : *last_curr_page = InvalidBlockNumber;
899 :
900 : /*
901 : * Reset so->currPos, and initialize moreLeft/moreRight such that the next
902 : * call to _bt_readnextpage treats this backend similarly to a serial
903 : * backend that steps from *last_curr_page to *next_scan_page (unless this
904 : * backend's so->currPos is initialized by _bt_readfirstpage before then).
905 : */
906 1658 : BTScanPosInvalidate(so->currPos);
907 1658 : so->currPos.moreLeft = so->currPos.moreRight = true;
908 :
909 1658 : if (first)
910 : {
911 : /*
912 : * Initialize array related state when called from _bt_first, assuming
913 : * that this will be the first primitive index scan for the scan
914 : */
915 446 : so->needPrimScan = false;
916 446 : so->scanBehind = false;
917 446 : so->oppositeDirCheck = false;
918 : }
919 : else
920 : {
921 : /*
922 : * Don't attempt to seize the scan when it requires another primitive
923 : * index scan, since caller's backend cannot start it right now
924 : */
925 1212 : if (so->needPrimScan)
926 0 : return false;
927 : }
928 :
929 1658 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
930 : parallel_scan->ps_offset_am);
931 :
932 : while (1)
933 : {
934 1682 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
935 :
936 1682 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
937 : {
938 : /* We're done with this parallel index scan */
939 312 : status = false;
940 : }
941 1370 : else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
942 1222 : btscan->btps_nextScanPage == P_NONE)
943 : {
944 : /* End this parallel index scan */
945 10 : status = false;
946 10 : endscan = true;
947 : }
948 1360 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
949 : {
950 : Assert(so->numArrayKeys);
951 :
952 36 : if (first)
953 : {
954 : /* Can start scheduled primitive scan right away, so do so */
955 36 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
956 :
957 : /* Restore scan's array keys from serialized values */
958 36 : _bt_parallel_restore_arrays(rel, btscan, so);
959 36 : exit_loop = true;
960 : }
961 : else
962 : {
963 : /*
964 : * Don't attempt to seize the scan when it requires another
965 : * primitive index scan, since caller's backend cannot start
966 : * it right now
967 : */
968 0 : status = false;
969 : }
970 :
971 : /*
972 : * Either way, update backend local state to indicate that a
973 : * pending primitive scan is required
974 : */
975 36 : so->needPrimScan = true;
976 36 : so->scanBehind = false;
977 36 : so->oppositeDirCheck = false;
978 : }
979 1324 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
980 : {
981 : /*
982 : * We have successfully seized control of the scan for the purpose
983 : * of advancing it to a new page!
984 : */
985 1300 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
986 : Assert(btscan->btps_nextScanPage != P_NONE);
987 1300 : *next_scan_page = btscan->btps_nextScanPage;
988 1300 : *last_curr_page = btscan->btps_lastCurrPage;
989 1300 : exit_loop = true;
990 : }
991 1682 : LWLockRelease(&btscan->btps_lock);
992 1682 : if (exit_loop || !status)
993 : break;
994 24 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
995 : }
996 1658 : ConditionVariableCancelSleep();
997 :
998 : /* When the scan has reached the rightmost (or leftmost) page, end it */
999 1658 : if (endscan)
1000 10 : _bt_parallel_done(scan);
1001 :
1002 1658 : return status;
1003 : }
1004 :
1005 : /*
1006 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
1007 : * new page. We now have the new value btps_nextScanPage; another backend
1008 : * can now begin advancing the scan.
1009 : *
1010 : * Callers whose scan uses array keys must save their curr_page argument so
1011 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
1012 : * determine that another primitive index scan is required.
1013 : *
1014 : * If caller's next_scan_page is P_NONE, the scan has reached the index's
1015 : * rightmost/leftmost page. This is treated as reaching the end of the scan
1016 : * within _bt_parallel_seize.
1017 : *
1018 : * Note: unlike the serial case, parallel scans don't need to remember both
1019 : * sibling links. next_scan_page is whichever link is next given the scan's
1020 : * direction. That's all we'll ever need, since the direction of a parallel
1021 : * scan can never change.
1022 : */
1023 : void
1024 1336 : _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page,
1025 : BlockNumber curr_page)
1026 : {
1027 1336 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1028 : BTParallelScanDesc btscan;
1029 :
1030 : Assert(BlockNumberIsValid(next_scan_page));
1031 :
1032 1336 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1033 : parallel_scan->ps_offset_am);
1034 :
1035 1336 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1036 1336 : btscan->btps_nextScanPage = next_scan_page;
1037 1336 : btscan->btps_lastCurrPage = curr_page;
1038 1336 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
1039 1336 : LWLockRelease(&btscan->btps_lock);
1040 1336 : ConditionVariableSignal(&btscan->btps_cv);
1041 1336 : }
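
/*
 * Editor's sketch, not part of nbtree.c: the seize/release protocol from a
 * page-reading backend's point of view.  Buffer access, scan direction, and
 * the NEED_PRIMSCAN path are elided; _bt_readnextpage() in nbtsearch.c is
 * the real caller, and the helper name here is hypothetical.
 */
static void
sketch_parallel_read_loop(IndexScanDesc scan)
{
	BlockNumber next_scan_page,
				last_curr_page;

	while (_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
	{
		BlockNumber next_sibling = P_NONE;	/* P_NONE if rightmost page */

		/*
		 * This backend now owns the scan: read the page at next_scan_page,
		 * record its sibling link in next_sibling, and remember
		 * last_curr_page in case _bt_parallel_primscan_schedule() is needed
		 * later.  Then hand the scan back so another worker can advance it.
		 */

		/* ... _bt_getbuf()/_bt_readpage() elided ... */

		_bt_parallel_release(scan, next_sibling, next_scan_page);
	}

	/*
	 * A false return from _bt_parallel_seize() means either that the scan
	 * is done in this direction, or that a new primitive index scan must be
	 * started by a caller reaching here from _bt_first (first=true).
	 */
}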
1042 :
1043 : /*
1044 : * _bt_parallel_done() -- Mark the parallel scan as complete.
1045 : *
1046 : * When there are no pages left to scan, this function should be called to
1047 : * notify other workers. Otherwise, they might wait forever for the scan to
1048 : * advance to the next page.
1049 : */
1050 : void
1051 7136738 : _bt_parallel_done(IndexScanDesc scan)
1052 : {
1053 7136738 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1054 7136738 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1055 : BTParallelScanDesc btscan;
1056 7136738 : bool status_changed = false;
1057 :
1058 : Assert(!BTScanPosIsValid(so->currPos));
1059 :
1060 : /* Do nothing, for non-parallel scans */
1061 7136738 : if (parallel_scan == NULL)
1062 7136580 : return;
1063 :
1064 : /*
1065 : * Should not mark parallel scan done when there's still a pending
1066 : * primitive index scan
1067 : */
1068 158 : if (so->needPrimScan)
1069 36 : return;
1070 :
1071 122 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1072 : parallel_scan->ps_offset_am);
1073 :
1074 : /*
1075 : * Mark the parallel scan as done, unless some other process did so
1076 : * already
1077 : */
1078 122 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1079 : Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1080 122 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1081 : {
1082 88 : btscan->btps_pageStatus = BTPARALLEL_DONE;
1083 88 : status_changed = true;
1084 : }
1085 122 : LWLockRelease(&btscan->btps_lock);
1086 :
1087 : /* wake up all the workers associated with this parallel scan */
1088 122 : if (status_changed)
1089 88 : ConditionVariableBroadcast(&btscan->btps_cv);
1090 : }
1091 :
1092 : /*
1093 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
1094 : *
1095 : * Caller passes the curr_page most recently passed to _bt_parallel_release
1096 : * by its backend. Caller successfully schedules the next primitive index scan
1097 : * if the shared parallel state hasn't been seized since caller's backend last
1098 : * advanced the scan.
1099 : */
1100 : void
1101 36 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
1102 : {
1103 36 : Relation rel = scan->indexRelation;
1104 36 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1105 36 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1106 : BTParallelScanDesc btscan;
1107 :
1108 : Assert(so->numArrayKeys);
1109 :
1110 36 : btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1111 : parallel_scan->ps_offset_am);
1112 :
1113 36 : LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1114 36 : if (btscan->btps_lastCurrPage == curr_page &&
1115 36 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
1116 : {
1117 36 : btscan->btps_nextScanPage = InvalidBlockNumber;
1118 36 : btscan->btps_lastCurrPage = InvalidBlockNumber;
1119 36 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1120 :
1121 : /* Serialize scan's current array keys */
1122 36 : _bt_parallel_serialize_arrays(rel, btscan, so);
1123 : }
1124 36 : LWLockRelease(&btscan->btps_lock);
1125 36 : }
1126 :
1127 : /*
1128 : * Bulk deletion of all index entries pointing to a set of heap tuples.
1129 : * The set of target tuples is specified via a callback routine that tells
1130 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
1131 : *
1132 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1133 : */
1134 : IndexBulkDeleteResult *
1135 2894 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1136 : IndexBulkDeleteCallback callback, void *callback_state)
1137 : {
1138 2894 : Relation rel = info->index;
1139 : BTCycleId cycleid;
1140 :
1141 : /* allocate stats if first time through, else re-use existing struct */
1142 2894 : if (stats == NULL)
1143 2890 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1144 :
1145 : /* Establish the vacuum cycle ID to use for this scan */
1146 : /* The ENSURE stuff ensures we clean up shared memory on failure */
1147 2894 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1148 : {
1149 2894 : cycleid = _bt_start_vacuum(rel);
1150 :
1151 2894 : btvacuumscan(info, stats, callback, callback_state, cycleid);
1152 : }
1153 2894 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1154 2894 : _bt_end_vacuum(rel);
1155 :
1156 2894 : return stats;
1157 : }
1158 :
1159 : /*
1160 : * Post-VACUUM cleanup.
1161 : *
1162 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
1163 : */
1164 : IndexBulkDeleteResult *
1165 272970 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1166 : {
1167 : BlockNumber num_delpages;
1168 :
1169 : /* No-op in ANALYZE ONLY mode */
1170 272970 : if (info->analyze_only)
1171 17908 : return stats;
1172 :
1173 : /*
1174 : * If btbulkdelete was called, we need not do anything (we just maintain
1175 : * the information used within _bt_vacuum_needs_cleanup() by calling
1176 : * _bt_set_cleanup_info() below).
1177 : *
1178 : * If btbulkdelete was _not_ called, then we have a choice to make: we
1179 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
1180 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
1181 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1182 : * now.
1183 : */
1184 255062 : if (stats == NULL)
1185 : {
1186 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1187 252778 : if (!_bt_vacuum_needs_cleanup(info->index))
1188 252762 : return NULL;
1189 :
1190 : /*
1191 : * Since we aren't going to actually delete any leaf items, there's no
1192 : * need to go through all the vacuum-cycle-ID pushups here.
1193 : *
1194 : * Posting list tuples are a source of inaccuracy for cleanup-only
1195 : * scans. btvacuumscan() will assume that the number of index tuples
1196 : * from each page can be used as num_index_tuples, even though
1197 : * num_index_tuples is supposed to represent the number of TIDs in the
1198 : * index. This naive approach can underestimate the number of tuples
1199 : * in the index significantly.
1200 : *
1201 : * We handle the problem by making num_index_tuples an estimate in
1202 : * the cleanup-only case.
1203 : */
1204 16 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1205 16 : btvacuumscan(info, stats, NULL, NULL, 0);
1206 16 : stats->estimated_count = true;
1207 : }
1208 :
1209 : /*
1210 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1211 : *
1212 : * num_delpages is the number of deleted pages now in the index that were
1213 : * not safe to place in the FSM to be recycled just yet. num_delpages is
1214 : * greater than 0 only when _bt_pagedel() actually deleted pages during
1215 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1216 : * have failed to place any newly deleted pages in the FSM just moments
1217 : * ago. (Actually, there are edge cases where recycling of the current
1218 : * VACUUM's newly deleted pages does not even become safe by the time the
1219 : * next VACUUM comes around. See nbtree/README.)
1220 : */
1221 : Assert(stats->pages_deleted >= stats->pages_free);
1222 2300 : num_delpages = stats->pages_deleted - stats->pages_free;
1223 2300 : _bt_set_cleanup_info(info->index, num_delpages);
1224 :
1225 : /*
1226 : * It's quite possible for us to be fooled by concurrent page splits into
1227 : * double-counting some index tuples, so disbelieve any total that exceeds
1228 : * the underlying heap's count ... if we know that accurately. Otherwise
1229 : * this might just make matters worse.
1230 : */
1231 2300 : if (!info->estimated_count)
1232 : {
1233 2220 : if (stats->num_index_tuples > info->num_heap_tuples)
1234 30 : stats->num_index_tuples = info->num_heap_tuples;
1235 : }
1236 :
1237 2300 : return stats;
1238 : }
1239 :
1240 : /*
1241 : * btvacuumscan --- scan the index for VACUUMing purposes
1242 : *
1243 : * This combines the functions of looking for leaf tuples that are deletable
1244 : * according to the vacuum callback, looking for empty pages that can be
1245 : * deleted, and looking for old deleted pages that can be recycled. Both
1246 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
1247 : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
1248 : *
1249 : * The caller is responsible for initially allocating/zeroing a stats struct
1250 : * and for obtaining a vacuum cycle ID if necessary.
1251 : */
1252 : static void
1253 2910 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1254 : IndexBulkDeleteCallback callback, void *callback_state,
1255 : BTCycleId cycleid)
1256 : {
1257 2910 : Relation rel = info->index;
1258 : BTVacState vstate;
1259 : BlockNumber num_pages;
1260 : bool needLock;
1261 : BlockRangeReadStreamPrivate p;
1262 2910 : ReadStream *stream = NULL;
1263 :
1264 : /*
1265 : * Reset fields that track information about the entire index now. This
1266 : * avoids double-counting in the case where a single VACUUM command
1267 : * requires multiple scans of the index.
1268 : *
1269 : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1270 : * since they track information about the VACUUM command, and so must last
1271 : * across each call to btvacuumscan().
1272 : *
1273 : * (Note that pages_free is treated as state about the whole index, not
1274 : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1275 : * calls are idempotent, and get repeated for the same deleted pages in
1276 : * some scenarios. The point for us is to track the number of recyclable
1277 : * pages in the index at the end of the VACUUM command.)
1278 : */
1279 2910 : stats->num_pages = 0;
1280 2910 : stats->num_index_tuples = 0;
1281 2910 : stats->pages_deleted = 0;
1282 2910 : stats->pages_free = 0;
1283 :
1284 : /* Set up info to pass down to btvacuumpage */
1285 2910 : vstate.info = info;
1286 2910 : vstate.stats = stats;
1287 2910 : vstate.callback = callback;
1288 2910 : vstate.callback_state = callback_state;
1289 2910 : vstate.cycleid = cycleid;
1290 :
1291 : /* Create a temporary memory context to run _bt_pagedel in */
1292 2910 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1293 : "_bt_pagedel",
1294 : ALLOCSET_DEFAULT_SIZES);
1295 :
1296 : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1297 2910 : vstate.bufsize = 0;
1298 2910 : vstate.maxbufsize = 0;
1299 2910 : vstate.pendingpages = NULL;
1300 2910 : vstate.npendingpages = 0;
1301 : /* Consider applying _bt_pendingfsm_finalize optimization */
1302 2910 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1303 :
1304 : /*
1305 : * The outer loop iterates over all index pages except the metapage, in
1306 : * physical order (we hope the kernel will cooperate in providing
1307 : * read-ahead for speed). It is critical that we visit all leaf pages,
1308 : * including ones added after we start the scan, else we might fail to
1309 : * delete some deletable tuples. Hence, we must repeatedly check the
1310 : * relation length. We must acquire the relation-extension lock while
1311 : * doing so to avoid a race condition: if someone else is extending the
1312 : * relation, there is a window where bufmgr/smgr have created a new
1313 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1314 : * we manage to scan such a page here, we'll improperly assume it can be
1315 : * recycled. Taking the lock synchronizes things enough to prevent a
1316 : * problem: either num_pages won't include the new page, or _bt_getbuf
1317 : * already has write lock on the buffer and it will be fully initialized
1318 : * before we can examine it. Also, we need not worry if a page is added
1319 : * immediately after we look; the page splitting code already has
1320 : * write-lock on the left page before it adds a right page, so we must
1321 : * already have processed any tuples due to be moved into such a page.
1322 : *
1323 : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1324 : * think the use of the extension lock is still required.
1325 : *
1326 : * We can skip locking for new or temp relations, however, since no one
1327 : * else could be accessing them.
1328 : */
1329 2910 : needLock = !RELATION_IS_LOCAL(rel);
1330 :
1331 2910 : p.current_blocknum = BTREE_METAPAGE + 1;
1332 :
1333 : /*
1334 : * It is safe to use batchmode as block_range_read_stream_cb takes no
1335 : * locks.
1336 : */
1337 2910 : stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1338 : READ_STREAM_FULL |
1339 : READ_STREAM_USE_BATCHING,
1340 : info->strategy,
1341 : rel,
1342 : MAIN_FORKNUM,
1343 : block_range_read_stream_cb,
1344 : &p,
1345 : 0);
1346 : for (;;)
1347 : {
1348 : /* Get the current relation length */
1349 5528 : if (needLock)
1350 5524 : LockRelationForExtension(rel, ExclusiveLock);
1351 5528 : num_pages = RelationGetNumberOfBlocks(rel);
1352 5528 : if (needLock)
1353 5524 : UnlockRelationForExtension(rel, ExclusiveLock);
1354 :
1355 5528 : if (info->report_progress)
1356 916 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1357 : num_pages);
1358 :
1359 : /* Quit if we've scanned the whole relation */
1360 5528 : if (p.current_blocknum >= num_pages)
1361 2910 : break;
1362 :
1363 2618 : p.last_exclusive = num_pages;
1364 :
1365 : /* Iterate over pages, then loop back to recheck relation length */
1366 : while (true)
1367 25090 : {
1368 : BlockNumber current_block;
1369 : Buffer buf;
1370 :
1371 : /* call vacuum_delay_point while not holding any buffer lock */
1372 27708 : vacuum_delay_point(false);
1373 :
1374 27708 : buf = read_stream_next_buffer(stream, NULL);
1375 :
1376 27708 : if (!BufferIsValid(buf))
1377 2618 : break;
1378 :
1379 25090 : current_block = btvacuumpage(&vstate, buf);
1380 :
1381 25090 : if (info->report_progress)
1382 952 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1383 : current_block);
1384 : }
1385 :
1386 : /*
1387 : * We have to reset the read stream to use it again. After returning
1388 : * InvalidBuffer, the read stream API won't invoke our callback again
1389 : * until the stream has been reset.
1390 : */
1391 2618 : read_stream_reset(stream);
1392 : }
1393 :
1394 2910 : read_stream_end(stream);
1395 :
1396 : /* Set statistics num_pages field to final size of index */
1397 2910 : stats->num_pages = num_pages;
1398 :
1399 2910 : MemoryContextDelete(vstate.pagedelcontext);
1400 :
1401 : /*
1402 : * If there were any calls to _bt_pagedel() during scan of the index then
1403 : * see if any of the resulting pages can be placed in the FSM now. When
1404 : * it's not safe we'll have to leave it up to a future VACUUM operation.
1405 : *
1406 : * Finally, if we placed any pages in the FSM (either just now or during
1407 : * the scan), forcibly update the upper-level FSM pages to ensure that
1408 : * searchers can find them.
1409 : */
1410 2910 : _bt_pendingfsm_finalize(rel, &vstate);
1411 2910 : if (stats->pages_free > 0)
1412 50 : IndexFreeSpaceMapVacuum(rel);
1413 2910 : }
1414 :
1415 : /*
1416 : * btvacuumpage --- VACUUM one page
1417 : *
1418 : * This processes a single page for btvacuumscan(). In some cases we must
1419 : * backtrack to re-examine and VACUUM pages that were on buf's page during
1420 : * a previous call here. This is how we handle page splits (that happened
1421 : * after our cycleid was acquired) whose right half page happened to reuse
1422 : * a block that we might have processed at some point before it was
1423 : * recycled (i.e. before the page split).
1424 : *
1425 : * Returns BlockNumber of a scanned page (not backtracked).
1426 : */
1427 : static BlockNumber
1428 25090 : btvacuumpage(BTVacState *vstate, Buffer buf)
1429 : {
1430 25090 : IndexVacuumInfo *info = vstate->info;
1431 25090 : IndexBulkDeleteResult *stats = vstate->stats;
1432 25090 : IndexBulkDeleteCallback callback = vstate->callback;
1433 25090 : void *callback_state = vstate->callback_state;
1434 25090 : Relation rel = info->index;
1435 25090 : Relation heaprel = info->heaprel;
1436 : bool attempt_pagedel;
1437 : BlockNumber blkno,
1438 : backtrack_to;
1439 25090 : BlockNumber scanblkno = BufferGetBlockNumber(buf);
1440 : Page page;
1441 : BTPageOpaque opaque;
1442 :
1443 25090 : blkno = scanblkno;
1444 :
1445 25090 : backtrack:
1446 :
1447 25090 : attempt_pagedel = false;
1448 25090 : backtrack_to = P_NONE;
1449 :
1450 25090 : _bt_lockbuf(rel, buf, BT_READ);
1451 25090 : page = BufferGetPage(buf);
1452 25090 : opaque = NULL;
1453 25090 : if (!PageIsNew(page))
1454 : {
1455 25090 : _bt_checkpage(rel, buf);
1456 25090 : opaque = BTPageGetOpaque(page);
1457 : }
1458 :
1459 : Assert(blkno <= scanblkno);
1460 25090 : if (blkno != scanblkno)
1461 : {
1462 : /*
1463 : * We're backtracking.
1464 : *
1465 : * We followed a right link to a sibling leaf page (a page that
1466 : * happens to be from a block located before scanblkno). The only
1467 : * case we want to do anything with is a live leaf page having the
1468 : * current vacuum cycle ID.
1469 : *
1470 : * The page had better be in a state that's consistent with what we
1471 : * expect. Check for conditions that imply corruption in passing. It
1472 : * can't be half-dead because only an interrupted VACUUM process can
1473 : * leave pages in that state, so we'd definitely have dealt with it
1474 : * back when the page was the scanblkno page (half-dead pages are
1475 : * always marked fully deleted by _bt_pagedel(), barring corruption).
1476 : */
1477 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1478 : {
1479 : Assert(false);
1480 0 : ereport(LOG,
1481 : (errcode(ERRCODE_INDEX_CORRUPTED),
1482 : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1483 : blkno, scanblkno, RelationGetRelationName(rel))));
1484 0 : _bt_relbuf(rel, buf);
1485 0 : return scanblkno;
1486 : }
1487 :
1488 : /*
1489 : * We may have already processed the page in an earlier call, when the
1490 : * page was scanblkno. This happens when the leaf page split occurred
1491 : * after the scan began, but before the right sibling page became the
1492 : * scanblkno.
1493 : *
1494 : * Page may also have been deleted by current btvacuumpage() call,
1495 : * since _bt_pagedel() sometimes deletes the right sibling page of
1496 : * scanblkno in passing (it does so after we decided where to
1497 : * backtrack to). We don't need to process this page as a deleted
1498 : * page a second time now (in fact, it would be wrong to count it as a
1499 : * deleted page in the bulk delete statistics a second time).
1500 : */
1501 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1502 : {
1503 : /* Done with current scanblkno (and all lower split pages) */
1504 0 : _bt_relbuf(rel, buf);
1505 0 : return scanblkno;
1506 : }
1507 : }
1508 :
1509 25090 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1510 : {
1511 : /* Okay to recycle this page (which could be leaf or internal) */
1512 252 : RecordFreeIndexPage(rel, blkno);
1513 252 : stats->pages_deleted++;
1514 252 : stats->pages_free++;
1515 : }
1516 24838 : else if (P_ISDELETED(opaque))
1517 : {
1518 : /*
1519 : * Already deleted page (which could be leaf or internal). Can't
1520 : * recycle yet.
1521 : */
1522 200 : stats->pages_deleted++;
1523 : }
1524 24638 : else if (P_ISHALFDEAD(opaque))
1525 : {
1526 : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1527 0 : attempt_pagedel = true;
1528 :
1529 : /*
1530 : * _bt_pagedel() will increment both pages_newly_deleted and
1531 : * pages_deleted stats in all cases (barring corruption)
1532 : */
1533 : }
1534 24638 : else if (P_ISLEAF(opaque))
1535 : {
1536 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1537 : int ndeletable;
1538 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1539 : int nupdatable;
1540 : OffsetNumber offnum,
1541 : minoff,
1542 : maxoff;
1543 : int nhtidsdead,
1544 : nhtidslive;
1545 :
1546 : /*
1547 : * Trade in the initial read lock for a full cleanup lock on this
1548 : * page. We must get such a lock on every leaf page over the course
1549 : * of the vacuum scan, whether or not it actually contains any
1550 : * deletable tuples --- see nbtree/README.
1551 : */
1552 22976 : _bt_upgradelockbufcleanup(rel, buf);
1553 :
1554 : /*
1555 : * Check whether we need to backtrack to earlier pages. What we are
1556 : * concerned about is a page split that happened since we started the
1557 : * vacuum scan. If the split moved tuples on the right half of the
1558 : * split (i.e. the tuples that sort high) to a block that we already
1559 : * passed over, then we might have missed the tuples. We need to
1560 : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1561 : * or deleting scanblkno page below!)
1562 : */
1563 22976 : if (vstate->cycleid != 0 &&
1564 22768 : opaque->btpo_cycleid == vstate->cycleid &&
1565 8 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1566 6 : !P_RIGHTMOST(opaque) &&
1567 4 : opaque->btpo_next < scanblkno)
1568 0 : backtrack_to = opaque->btpo_next;
1569 :
1570 22976 : ndeletable = 0;
1571 22976 : nupdatable = 0;
1572 22976 : minoff = P_FIRSTDATAKEY(opaque);
1573 22976 : maxoff = PageGetMaxOffsetNumber(page);
1574 22976 : nhtidsdead = 0;
1575 22976 : nhtidslive = 0;
1576 22976 : if (callback)
1577 : {
1578 : /* btbulkdelete callback tells us what to delete (or update) */
1579 22768 : for (offnum = minoff;
1580 4285872 : offnum <= maxoff;
1581 4263104 : offnum = OffsetNumberNext(offnum))
1582 : {
1583 : IndexTuple itup;
1584 :
1585 4263104 : itup = (IndexTuple) PageGetItem(page,
1586 4263104 : PageGetItemId(page, offnum));
1587 :
1588 : Assert(!BTreeTupleIsPivot(itup));
1589 4263104 : if (!BTreeTupleIsPosting(itup))
1590 : {
1591 : /* Regular tuple, standard table TID representation */
1592 4135362 : if (callback(&itup->t_tid, callback_state))
1593 : {
1594 1424576 : deletable[ndeletable++] = offnum;
1595 1424576 : nhtidsdead++;
1596 : }
1597 : else
1598 2710786 : nhtidslive++;
1599 : }
1600 : else
1601 : {
1602 : BTVacuumPosting vacposting;
1603 : int nremaining;
1604 :
1605 : /* Posting list tuple */
1606 127742 : vacposting = btreevacuumposting(vstate, itup, offnum,
1607 : &nremaining);
1608 127742 : if (vacposting == NULL)
1609 : {
1610 : /*
1611 : * All table TIDs from the posting tuple remain, so no
1612 : * delete or update required
1613 : */
1614 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1615 : }
1616 72356 : else if (nremaining > 0)
1617 : {
1618 :
1619 : /*
1620 : * Store metadata about posting list tuple in
1621 : * updatable array for entire page. Existing tuple
1622 : * will be updated during the later call to
1623 : * _bt_delitems_vacuum().
1624 : */
1625 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1626 39240 : updatable[nupdatable++] = vacposting;
1627 39240 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1628 : }
1629 : else
1630 : {
1631 : /*
1632 : * All table TIDs from the posting list must be
1633 : * deleted. We'll delete the index tuple completely
1634 : * (no update required).
1635 : */
1636 : Assert(nremaining == 0);
1637 33116 : deletable[ndeletable++] = offnum;
1638 33116 : nhtidsdead += BTreeTupleGetNPosting(itup);
1639 33116 : pfree(vacposting);
1640 : }
1641 :
1642 127742 : nhtidslive += nremaining;
1643 : }
1644 : }
1645 : }
1646 :
1647 : /*
1648 : * Apply any needed deletes or updates. We issue just one
1649 : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1650 : */
1651 22976 : if (ndeletable > 0 || nupdatable > 0)
1652 : {
1653 : Assert(nhtidsdead >= ndeletable + nupdatable);
1654 13792 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1655 : nupdatable);
1656 :
1657 13792 : stats->tuples_removed += nhtidsdead;
1658 : /* must recompute maxoff */
1659 13792 : maxoff = PageGetMaxOffsetNumber(page);
1660 :
1661 : /* can't leak memory here */
1662 53032 : for (int i = 0; i < nupdatable; i++)
1663 39240 : pfree(updatable[i]);
1664 : }
1665 : else
1666 : {
1667 : /*
1668 : * If the leaf page has been split during this vacuum cycle, it
1669 : * seems worth expending a write to clear btpo_cycleid even if we
1670 : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1671 : * takes care of this.) This ensures we won't process the page
1672 : * again.
1673 : *
1674 : * We treat this like a hint-bit update because there's no need to
1675 : * WAL-log it.
1676 : */
1677 : Assert(nhtidsdead == 0);
1678 9184 : if (vstate->cycleid != 0 &&
1679 8976 : opaque->btpo_cycleid == vstate->cycleid)
1680 : {
1681 4 : opaque->btpo_cycleid = 0;
1682 4 : MarkBufferDirtyHint(buf, true);
1683 : }
1684 : }
1685 :
1686 : /*
1687 : * If the leaf page is now empty, try to delete it; else count the
1688 : * live tuples (live table TIDs in posting lists are counted as
1689 : * separate live tuples). We don't delete when backtracking, though,
1690 : * since that would require teaching _bt_pagedel() about backtracking
1691 : * (doesn't seem worth adding more complexity to deal with that).
1692 : *
1693 : * We don't count the number of live TIDs during cleanup-only calls to
1694 : * btvacuumscan (i.e. when callback is not set). We count the number
1695 : * of index tuples directly instead. This avoids the expense of
1696 : * directly examining all of the tuples on each page. VACUUM will
1697 : * treat num_index_tuples as an estimate in cleanup-only case, so it
1698 : * doesn't matter that this underestimates num_index_tuples
1699 : * significantly in some cases.
1700 : */
1701 22976 : if (minoff > maxoff)
1702 5632 : attempt_pagedel = (blkno == scanblkno);
1703 17344 : else if (callback)
1704 17144 : stats->num_index_tuples += nhtidslive;
1705 : else
1706 200 : stats->num_index_tuples += maxoff - minoff + 1;
1707 :
1708 : Assert(!attempt_pagedel || nhtidslive == 0);
1709 : }
1710 :
1711 25090 : if (attempt_pagedel)
1712 : {
1713 : MemoryContext oldcontext;
1714 :
1715 : /* Run pagedel in a temp context to avoid memory leakage */
1716 5632 : MemoryContextReset(vstate->pagedelcontext);
1717 5632 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1718 :
1719 : /*
1720 : * _bt_pagedel maintains the bulk delete stats on our behalf;
1721 : * pages_newly_deleted and pages_deleted are likely to be incremented
1722 : * during call
1723 : */
1724 : Assert(blkno == scanblkno);
1725 5632 : _bt_pagedel(rel, buf, vstate);
1726 :
1727 5632 : MemoryContextSwitchTo(oldcontext);
1728 : /* pagedel released buffer, so we shouldn't */
1729 : }
1730 : else
1731 19458 : _bt_relbuf(rel, buf);
1732 :
1733 25090 : if (backtrack_to != P_NONE)
1734 : {
1735 0 : blkno = backtrack_to;
1736 :
1737 : /* check for vacuum delay while not holding any buffer lock */
1738 0 : vacuum_delay_point(false);
1739 :
1740 : /*
1741 : * We can't use _bt_getbuf() here because it always applies
1742 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1743 : * recycle all-zero pages, not fail. Also, we want to use a
1744 : * nondefault buffer access strategy.
1745 : */
1746 0 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1747 : info->strategy);
1748 0 : goto backtrack;
1749 : }
1750 :
1751 25090 : return scanblkno;
1752 : }
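/*
 * Illustrative scenario (a sketch, not part of the original file): suppose
 * btvacuumscan() has already processed block 4 and is now handing block 10
 * to btvacuumpage().  After the VACUUM cycleid was acquired, but before the
 * scan reached block 10, the leaf at block 10 split and its new right
 * sibling reused block 4 -- a lower-numbered, previously recycled block
 * that the scan had already passed over.  When btvacuumpage() processes
 * block 10 it finds btpo_cycleid == vstate->cycleid, BTP_SPLIT_END unset,
 * and btpo_next == 4 < scanblkno == 10, so it records backtrack_to = 4.
 * Once block 10 itself has been vacuumed and released, it reads block 4 and
 * jumps to the backtrack: label to vacuum the tuples that the split moved
 * there.  The function still returns 10 (the scanblkno), never 4.
 */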
1753 :
1754 : /*
1755 : * btreevacuumposting --- determine TIDs still needed in posting list
1756 : *
1757 : * Returns metadata describing how to build replacement tuple without the TIDs
1758 : * that VACUUM needs to delete. Returned value is NULL in the common case
1759 : * where no changes are needed to caller's posting list tuple (we avoid
1760 : * allocating memory here as an optimization).
1761 : *
1762 : * The number of TIDs that should remain in the posting list tuple is set for
1763 : * caller in *nremaining.
1764 : */
1765 : static BTVacuumPosting
1766 127742 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1767 : OffsetNumber updatedoffset, int *nremaining)
1768 : {
1769 127742 : int live = 0;
1770 127742 : int nitem = BTreeTupleGetNPosting(posting);
1771 127742 : ItemPointer items = BTreeTupleGetPosting(posting);
1772 127742 : BTVacuumPosting vacposting = NULL;
1773 :
1774 729802 : for (int i = 0; i < nitem; i++)
1775 : {
1776 602060 : if (!vstate->callback(items + i, vstate->callback_state))
1777 : {
1778 : /* Live table TID */
1779 362446 : live++;
1780 : }
1781 239614 : else if (vacposting == NULL)
1782 : {
1783 : /*
1784 : * First dead table TID encountered.
1785 : *
1786 : * It's now clear that we need to delete one or more dead table
1787 : * TIDs, so start maintaining metadata describing how to update
1788 : * existing posting list tuple.
1789 : */
1790 72356 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1791 : nitem * sizeof(uint16));
1792 :
1793 72356 : vacposting->itup = posting;
1794 72356 : vacposting->updatedoffset = updatedoffset;
1795 72356 : vacposting->ndeletedtids = 0;
1796 72356 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1797 : }
1798 : else
1799 : {
1800 : /* Second or subsequent dead table TID */
1801 167258 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1802 : }
1803 : }
1804 :
1805 127742 : *nremaining = live;
1806 127742 : return vacposting;
1807 : }
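/*
 * A minimal, self-contained sketch of the selection logic in
 * btreevacuumposting(), offered for illustration only; it is not part of
 * nbtree.c.  The names below (ModelVacuumPosting, model_tid_is_dead,
 * model_vacuumposting) are hypothetical stand-ins for BTVacuumPostingData,
 * the bulk-delete callback, and the function above, and plain array
 * indexes stand in for posting-list TIDs.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* hypothetical model of BTVacuumPostingData: which TID indexes to delete */
typedef struct ModelVacuumPosting
{
    int         ndeletedtids;
    uint16_t    deletetids[];   /* sized for at most nitem entries */
} ModelVacuumPosting;

/* stand-in for the bulk-delete callback: pretend TIDs 1 and 3 are dead */
static bool
model_tid_is_dead(int i)
{
    return i == 1 || i == 3;
}

/* mirrors btreevacuumposting(): returns NULL when every TID remains live */
static ModelVacuumPosting *
model_vacuumposting(int nitem, int *nremaining)
{
    int         live = 0;
    ModelVacuumPosting *vacposting = NULL;

    for (int i = 0; i < nitem; i++)
    {
        if (!model_tid_is_dead(i))
            live++;             /* live TID: keep it */
        else
        {
            /* allocate lazily, only once the first dead TID is seen */
            if (vacposting == NULL)
            {
                vacposting = malloc(sizeof(ModelVacuumPosting) +
                                    nitem * sizeof(uint16_t));
                vacposting->ndeletedtids = 0;
            }
            vacposting->deletetids[vacposting->ndeletedtids++] = i;
        }
    }

    *nremaining = live;
    return vacposting;
}

int
main(void)
{
    int         nremaining;
    ModelVacuumPosting *vp = model_vacuumposting(5, &nremaining);

    /* prints: remaining=3 ndeletedtids=2 (deletetids = {1, 3}) */
    printf("remaining=%d ndeletedtids=%d\n",
           nremaining, vp ? vp->ndeletedtids : 0);
    free(vp);
    return 0;
}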
1808 :
1809 : /*
1810 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1811 : *
1812 : * btrees always do, so this is trivial.
1813 : */
1814 : bool
1815 1119136 : btcanreturn(Relation index, int attno)
1816 : {
1817 1119136 : return true;
1818 : }
1819 :
1820 : /*
1821 : * btgettreeheight() -- Compute tree height for use by btcostestimate().
1822 : */
1823 : int
1824 715268 : btgettreeheight(Relation rel)
1825 : {
1826 715268 : return _bt_getrootheight(rel);
1827 : }
1828 :
1829 : CompareType
1830 0 : bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
1831 : {
1832 0 : switch (strategy)
1833 : {
1834 0 : case BTLessStrategyNumber:
1835 0 : return COMPARE_LT;
1836 0 : case BTLessEqualStrategyNumber:
1837 0 : return COMPARE_LE;
1838 0 : case BTEqualStrategyNumber:
1839 0 : return COMPARE_EQ;
1840 0 : case BTGreaterEqualStrategyNumber:
1841 0 : return COMPARE_GE;
1842 0 : case BTGreaterStrategyNumber:
1843 0 : return COMPARE_GT;
1844 0 : default:
1845 0 : return COMPARE_INVALID;
1846 : }
1847 : }
1848 :
1849 : StrategyNumber
1850 0 : bttranslatecmptype(CompareType cmptype, Oid opfamily)
1851 : {
1852 0 : switch (cmptype)
1853 : {
1854 0 : case COMPARE_LT:
1855 0 : return BTLessStrategyNumber;
1856 0 : case COMPARE_LE:
1857 0 : return BTLessEqualStrategyNumber;
1858 0 : case COMPARE_EQ:
1859 0 : return BTEqualStrategyNumber;
1860 0 : case COMPARE_GE:
1861 0 : return BTGreaterEqualStrategyNumber;
1862 0 : case COMPARE_GT:
1863 0 : return BTGreaterStrategyNumber;
1864 0 : default:
1865 0 : return InvalidStrategy;
1866 : }
1867 : }
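/*
 * Note (illustrative, not part of the original file): for the five btree
 * strategies the two routines above are inverses of each other, e.g.
 * bttranslatecmptype(bttranslatestrategy(BTLessStrategyNumber, opfamily),
 * opfamily) returns BTLessStrategyNumber.  Both ignore their opfamily
 * argument, since every btree operator family uses the same five strategy
 * numbers.
 */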
|