1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtree.c
4 : * Implementation of Lehman and Yao's btree management algorithm for
5 : * Postgres.
6 : *
7 : * NOTES
8 : * This file contains only the public interface routines.
9 : *
10 : *
11 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
12 : * Portions Copyright (c) 1994, Regents of the University of California
13 : *
14 : * IDENTIFICATION
15 : * src/backend/access/nbtree/nbtree.c
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 : #include "postgres.h"
20 :
21 : #include "access/nbtree.h"
22 : #include "access/relscan.h"
23 : #include "access/xloginsert.h"
24 : #include "commands/progress.h"
25 : #include "commands/vacuum.h"
26 : #include "miscadmin.h"
27 : #include "nodes/execnodes.h"
28 : #include "pgstat.h"
29 : #include "storage/bulk_write.h"
30 : #include "storage/condition_variable.h"
31 : #include "storage/indexfsm.h"
32 : #include "storage/ipc.h"
33 : #include "storage/lmgr.h"
34 : #include "storage/smgr.h"
35 : #include "utils/fmgrprotos.h"
36 : #include "utils/index_selfuncs.h"
37 : #include "utils/memutils.h"
38 :
39 :
40 : /*
41 : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42 : *
43 : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44 : * scan to advance it via another call to _bt_first.
45 : *
46 : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47 : * a new page; others must wait.
48 : *
49 : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50 : * to a new page; some process can start doing that.
51 : *
52 : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53 : */
54 : typedef enum
55 : {
56 : BTPARALLEL_NOT_INITIALIZED,
57 : BTPARALLEL_NEED_PRIMSCAN,
58 : BTPARALLEL_ADVANCING,
59 : BTPARALLEL_IDLE,
60 : BTPARALLEL_DONE,
61 : } BTPS_State;
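
/*
 * A sketch of the legal state transitions, as implemented by
 * _bt_parallel_seize, _bt_parallel_release, _bt_parallel_done, and
 * _bt_parallel_primscan_schedule below:
 *
 *   NOT_INITIALIZED --seize--------------> ADVANCING
 *   ADVANCING       --release------------> IDLE
 *   IDLE            --seize--------------> ADVANCING
 *   IDLE            --primscan_schedule--> NEED_PRIMSCAN
 *   NEED_PRIMSCAN   --seize(first=true)--> ADVANCING
 *   any state       --done---------------> DONE (terminal)
 */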
62 :
63 : /*
64 : * BTParallelScanDescData contains btree-specific shared information required
65 : * for a parallel scan.
66 : */
67 : typedef struct BTParallelScanDescData
68 : {
69 : BlockNumber btps_scanPage; /* latest or next page to be scanned */
70 : BTPS_State btps_pageStatus; /* indicates whether next page is
71 : * available for scan. see above for
72 : * possible states of parallel scan. */
73 : slock_t btps_mutex; /* protects above variables, btps_arrElems */
74 : ConditionVariable btps_cv; /* used to synchronize parallel scan */
75 :
76 : /*
77 : * btps_arrElems is used when scans need to schedule another primitive
78 : * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
79 : */
80 : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
81 : } BTParallelScanDescData;
82 :
83 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
84 :
85 :
86 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
87 : IndexBulkDeleteCallback callback, void *callback_state,
88 : BTCycleId cycleid);
89 : static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
90 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
91 : IndexTuple posting,
92 : OffsetNumber updatedoffset,
93 : int *nremaining);
94 :
95 :
96 : /*
97 : * Btree handler function: return IndexAmRoutine with access method parameters
98 : * and callbacks.
99 : */
100 : Datum
101 2281804 : bthandler(PG_FUNCTION_ARGS)
102 : {
103 2281804 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
104 :
105 2281804 : amroutine->amstrategies = BTMaxStrategyNumber;
106 2281804 : amroutine->amsupport = BTNProcs;
107 2281804 : amroutine->amoptsprocnum = BTOPTIONS_PROC;
108 2281804 : amroutine->amcanorder = true;
109 2281804 : amroutine->amcanorderbyop = false;
110 2281804 : amroutine->amcanbackward = true;
111 2281804 : amroutine->amcanunique = true;
112 2281804 : amroutine->amcanmulticol = true;
113 2281804 : amroutine->amoptionalkey = true;
114 2281804 : amroutine->amsearcharray = true;
115 2281804 : amroutine->amsearchnulls = true;
116 2281804 : amroutine->amstorage = false;
117 2281804 : amroutine->amclusterable = true;
118 2281804 : amroutine->ampredlocks = true;
119 2281804 : amroutine->amcanparallel = true;
120 2281804 : amroutine->amcanbuildparallel = true;
121 2281804 : amroutine->amcaninclude = true;
122 2281804 : amroutine->amusemaintenanceworkmem = false;
123 2281804 : amroutine->amsummarizing = false;
124 2281804 : amroutine->amparallelvacuumoptions =
125 : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
126 2281804 : amroutine->amkeytype = InvalidOid;
127 :
128 2281804 : amroutine->ambuild = btbuild;
129 2281804 : amroutine->ambuildempty = btbuildempty;
130 2281804 : amroutine->aminsert = btinsert;
131 2281804 : amroutine->aminsertcleanup = NULL;
132 2281804 : amroutine->ambulkdelete = btbulkdelete;
133 2281804 : amroutine->amvacuumcleanup = btvacuumcleanup;
134 2281804 : amroutine->amcanreturn = btcanreturn;
135 2281804 : amroutine->amcostestimate = btcostestimate;
136 2281804 : amroutine->amoptions = btoptions;
137 2281804 : amroutine->amproperty = btproperty;
138 2281804 : amroutine->ambuildphasename = btbuildphasename;
139 2281804 : amroutine->amvalidate = btvalidate;
140 2281804 : amroutine->amadjustmembers = btadjustmembers;
141 2281804 : amroutine->ambeginscan = btbeginscan;
142 2281804 : amroutine->amrescan = btrescan;
143 2281804 : amroutine->amgettuple = btgettuple;
144 2281804 : amroutine->amgetbitmap = btgetbitmap;
145 2281804 : amroutine->amendscan = btendscan;
146 2281804 : amroutine->ammarkpos = btmarkpos;
147 2281804 : amroutine->amrestrpos = btrestrpos;
148 2281804 : amroutine->amestimateparallelscan = btestimateparallelscan;
149 2281804 : amroutine->aminitparallelscan = btinitparallelscan;
150 2281804 : amroutine->amparallelrescan = btparallelrescan;
151 :
152 2281804 : PG_RETURN_POINTER(amroutine);
153 : }
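
/*
 * Example (not part of nbtree.c): core code never calls bthandler()
 * directly.  It looks up pg_am.amhandler and invokes it through the fmgr,
 * as GetIndexAmRoutine() in src/backend/access/index/amapi.c does.  A
 * from-memory sketch of that consumer, not the authoritative text:
 */
IndexAmRoutine *
GetIndexAmRoutine(Oid amhandler)
{
    Datum       datum = OidFunctionCall0(amhandler);
    IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);

    if (routine == NULL || !IsA(routine, IndexAmRoutine))
        elog(ERROR, "index access method handler function %u did not return an IndexAmRoutine struct",
             amhandler);

    return routine;
}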
154 :
155 : /*
156 : * btbuildempty() -- build an empty btree index in the initialization fork
157 : */
158 : void
159 158 : btbuildempty(Relation index)
160 : {
161 158 : bool allequalimage = _bt_allequalimage(index, false);
162 : BulkWriteState *bulkstate;
163 : BulkWriteBuffer metabuf;
164 :
165 158 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
166 :
167 : /* Construct metapage. */
168 158 : metabuf = smgr_bulk_get_buf(bulkstate);
169 158 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
170 158 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
171 :
172 158 : smgr_bulk_finish(bulkstate);
173 158 : }
174 :
175 : /*
176 : * btinsert() -- insert an index tuple into a btree.
177 : *
178 : * Descend the tree recursively, find the appropriate location for our
179 : * new tuple, and put it there.
180 : */
181 : bool
182 6276018 : btinsert(Relation rel, Datum *values, bool *isnull,
183 : ItemPointer ht_ctid, Relation heapRel,
184 : IndexUniqueCheck checkUnique,
185 : bool indexUnchanged,
186 : IndexInfo *indexInfo)
187 : {
188 : bool result;
189 : IndexTuple itup;
190 :
191 : /* generate an index tuple */
192 6276018 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
193 6276018 : itup->t_tid = *ht_ctid;
194 :
195 6276018 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
196 :
197 6275494 : pfree(itup);
198 :
199 6275494 : return result;
200 : }
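
/*
 * Example (not part of nbtree.c): callers reach btinsert() through the
 * generic index_insert() wrapper in indexam.c.  A minimal sketch of one
 * insertion, assuming indexRel/heapRel are open and locked, indexInfo came
 * from BuildIndexInfo(), the first key column is int4, and tid points at
 * the new heap tuple:
 */
static void
example_btinsert(Relation heapRel, Relation indexRel,
                 IndexInfo *indexInfo, ItemPointer tid)
{
    Datum       values[INDEX_MAX_KEYS];
    bool        isnull[INDEX_MAX_KEYS];

    values[0] = Int32GetDatum(42);
    isnull[0] = false;

    (void) index_insert(indexRel, values, isnull, tid, heapRel,
                        UNIQUE_CHECK_YES,   /* enforce unique constraint */
                        false,              /* indexUnchanged hint */
                        indexInfo);
}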
201 :
202 : /*
203 : * btgettuple() -- Get the next tuple in the scan.
204 : */
205 : bool
206 26570902 : btgettuple(IndexScanDesc scan, ScanDirection dir)
207 : {
208 26570902 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
209 : bool res;
210 :
211 : /* btree indexes are never lossy */
212 26570902 : scan->xs_recheck = false;
213 :
214 : /* Each loop iteration performs another primitive index scan */
215 : do
216 : {
217 : /*
218 : * If we've already initialized this scan, we can just advance it in
219 : * the appropriate direction. If we haven't done so yet, we call
220 : * _bt_first() to get the first item in the scan.
221 : */
222 26571036 : if (!BTScanPosIsValid(so->currPos))
223 11357178 : res = _bt_first(scan, dir);
224 : else
225 : {
226 : /*
227 : * Check to see if we should kill the previously-fetched tuple.
228 : */
229 15213858 : if (scan->kill_prior_tuple)
230 : {
231 : /*
232 : * Yes, remember it for later. (We'll deal with all such
233 : * tuples at once right before leaving the index page.) The
234 : * test for numKilled overrun is not just paranoia: if the
235 : * caller reverses direction in the indexscan then the same
236 : * item might get entered multiple times. It's not worth
237 : * trying to optimize that, so we don't detect it, but instead
238 : * just forget any excess entries.
239 : */
240 474502 : if (so->killedItems == NULL)
241 135860 : so->killedItems = (int *)
242 135860 : palloc(MaxTIDsPerBTreePage * sizeof(int));
243 474502 : if (so->numKilled < MaxTIDsPerBTreePage)
244 474502 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
245 : }
246 :
247 : /*
248 : * Now continue the scan.
249 : */
250 15213858 : res = _bt_next(scan, dir);
251 : }
252 :
253 : /* If we have a tuple, return it ... */
254 26571036 : if (res)
255 21041562 : break;
256 : /* ... otherwise see if we need another primitive index scan */
257 5529474 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
258 :
259 26570902 : return res;
260 : }
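
/*
 * Example (not part of nbtree.c): btgettuple() is normally driven through
 * the indexam.c wrappers.  A minimal caller-side sketch, assuming heapRel
 * and indexRel are open and locked and the first index column is int4:
 */
static void
example_btgettuple(Relation heapRel, Relation indexRel)
{
    ScanKeyData skey;
    IndexScanDesc scan;
    ItemPointer tid;

    /* qual: first index column = 42 */
    ScanKeyInit(&skey, 1, BTEqualStrategyNumber, F_INT4EQ,
                Int32GetDatum(42));

    scan = index_beginscan(heapRel, indexRel, GetActiveSnapshot(), 1, 0);
    index_rescan(scan, &skey, 1, NULL, 0);  /* -> btrescan() */

    while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
    {
        /* each iteration advanced via btgettuple(); tid is a match */
    }

    index_endscan(scan);                    /* -> btendscan() */
}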
261 :
262 : /*
263 : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
264 : */
265 : int64
266 13564 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
267 : {
268 13564 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
269 13564 : int64 ntids = 0;
270 : ItemPointer heapTid;
271 :
272 : /* Each loop iteration performs another primitive index scan */
273 : do
274 : {
275 : /* Fetch the first page & tuple */
276 13990 : if (_bt_first(scan, ForwardScanDirection))
277 : {
278 : /* Save tuple ID, and continue scanning */
279 8864 : heapTid = &scan->xs_heaptid;
280 8864 : tbm_add_tuples(tbm, heapTid, 1, false);
281 8864 : ntids++;
282 :
283 : for (;;)
284 : {
285 : /*
286 : * Advance to next tuple within page. This is the same as the
287 : * easy case in _bt_next().
288 : */
289 1797618 : if (++so->currPos.itemIndex > so->currPos.lastItem)
290 : {
291 : /* let _bt_next do the heavy lifting */
292 13518 : if (!_bt_next(scan, ForwardScanDirection))
293 8864 : break;
294 : }
295 :
296 : /* Save tuple ID, and continue scanning */
297 1788754 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
298 1788754 : tbm_add_tuples(tbm, heapTid, 1, false);
299 1788754 : ntids++;
300 : }
301 : }
302 : /* Now see if we need another primitive index scan */
303 13990 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
304 :
305 13564 : return ntids;
306 : }
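
/*
 * Example (not part of nbtree.c): for bitmap scans the executor hands the
 * whole job to the AM in one call, roughly as nodeBitmapIndexscan.c does.
 * A sketch; the caller, not this function, would free the bitmap:
 */
static int64
example_btgetbitmap(IndexScanDesc scan, TIDBitmap **result)
{
    TIDBitmap  *tbm = tbm_create(work_mem * 1024L, NULL);

    *result = tbm;
    return index_getbitmap(scan, tbm);      /* -> btgetbitmap() */
}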
307 :
308 : /*
309 : * btbeginscan() -- start a scan on a btree index
310 : */
311 : IndexScanDesc
312 10915938 : btbeginscan(Relation rel, int nkeys, int norderbys)
313 : {
314 : IndexScanDesc scan;
315 : BTScanOpaque so;
316 :
317 : /* no order by operators allowed */
318 : Assert(norderbys == 0);
319 :
320 : /* get the scan */
321 10915938 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
322 :
323 : /* allocate private workspace */
324 10915938 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
325 10915938 : BTScanPosInvalidate(so->currPos);
326 10915938 : BTScanPosInvalidate(so->markPos);
327 10915938 : if (scan->numberOfKeys > 0)
328 10905342 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
329 : else
330 10596 : so->keyData = NULL;
331 :
332 10915938 : so->needPrimScan = false;
333 10915938 : so->scanBehind = false;
334 10915938 : so->arrayKeys = NULL;
335 10915938 : so->orderProcs = NULL;
336 10915938 : so->arrayContext = NULL;
337 :
338 10915938 : so->killedItems = NULL; /* until needed */
339 10915938 : so->numKilled = 0;
340 :
341 : /*
342 : * We don't know yet whether the scan will be index-only, so we do not
343 : * allocate the tuple workspace arrays until btrescan. However, we set up
344 : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
345 : */
346 10915938 : so->currTuples = so->markTuples = NULL;
347 :
348 10915938 : scan->xs_itupdesc = RelationGetDescr(rel);
349 :
350 10915938 : scan->opaque = so;
351 :
352 10915938 : return scan;
353 : }
354 :
355 : /*
356 : * btrescan() -- rescan an index relation
357 : */
358 : void
359 11371576 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
360 : ScanKey orderbys, int norderbys)
361 : {
362 11371576 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
363 :
364 : /* we aren't holding any read locks, but gotta drop the pins */
365 11371576 : if (BTScanPosIsValid(so->currPos))
366 : {
367 : /* Before leaving current page, deal with any killed items */
368 76778 : if (so->numKilled > 0)
369 738 : _bt_killitems(scan);
370 76778 : BTScanPosUnpinIfPinned(so->currPos);
371 76778 : BTScanPosInvalidate(so->currPos);
372 : }
373 :
374 11371576 : so->markItemIndex = -1;
375 11371576 : so->needPrimScan = false;
376 11371576 : so->scanBehind = false;
377 11371576 : BTScanPosUnpinIfPinned(so->markPos);
378 11371576 : BTScanPosInvalidate(so->markPos);
379 :
380 : /*
381 : * Allocate tuple workspace arrays, if needed for an index-only scan and
382 : * not already done in a previous rescan call. To save on palloc
383 : * overhead, both workspaces are allocated as one palloc block; only this
384 : * function and btendscan know that.
385 : *
386 : * NOTE: this data structure also makes it safe to return data from a
387 : * "name" column, even though btree name_ops uses an underlying storage
388 : * datatype of cstring. The risk there is that "name" is supposed to be
389 : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
390 : * However, since we only return data out of tuples sitting in the
391 : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
392 : * data out of the markTuples array --- running off the end of memory for
393 : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
394 : * adding special-case treatment for name_ops elsewhere.
395 : */
396 11371576 : if (scan->xs_want_itup && so->currTuples == NULL)
397 : {
398 112992 : so->currTuples = (char *) palloc(BLCKSZ * 2);
399 112992 : so->markTuples = so->currTuples + BLCKSZ;
400 : }
401 :
402 : /*
403 : * Reset the scan keys
404 : */
405 11371576 : if (scankey && scan->numberOfKeys > 0)
406 11360890 : memmove(scan->keyData,
407 : scankey,
408 11360890 : scan->numberOfKeys * sizeof(ScanKeyData));
409 11371576 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
410 11371576 : so->numArrayKeys = 0; /* ditto */
411 11371576 : }
412 :
413 : /*
414 : * btendscan() -- close down a scan
415 : */
416 : void
417 10914552 : btendscan(IndexScanDesc scan)
418 : {
419 10914552 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
420 :
421 : /* we aren't holding any read locks, but gotta drop the pins */
422 10914552 : if (BTScanPosIsValid(so->currPos))
423 : {
424 : /* Before leaving current page, deal with any killed items */
425 5749960 : if (so->numKilled > 0)
426 66766 : _bt_killitems(scan);
427 5749960 : BTScanPosUnpinIfPinned(so->currPos);
428 : }
429 :
430 10914552 : so->markItemIndex = -1;
431 10914552 : BTScanPosUnpinIfPinned(so->markPos);
432 :
433 : /* No need to invalidate positions, the RAM is about to be freed. */
434 :
435 : /* Release storage */
436 10914552 : if (so->keyData != NULL)
437 10903998 : pfree(so->keyData);
438 : /* so->arrayKeys and so->orderProcs are in arrayContext */
439 10914552 : if (so->arrayContext != NULL)
440 832 : MemoryContextDelete(so->arrayContext);
441 10914552 : if (so->killedItems != NULL)
442 135820 : pfree(so->killedItems);
443 10914552 : if (so->currTuples != NULL)
444 112936 : pfree(so->currTuples);
445 : /* so->markTuples should not be pfree'd, see btrescan */
446 10914552 : pfree(so);
447 10914552 : }
448 :
449 : /*
450 : * btmarkpos() -- save current scan position
451 : */
452 : void
453 130042 : btmarkpos(IndexScanDesc scan)
454 : {
455 130042 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
456 :
457 : /* There may be an old mark with a pin (but no lock). */
458 130042 : BTScanPosUnpinIfPinned(so->markPos);
459 :
460 : /*
461 : * Just record the current itemIndex. If we later step to next page
462 : * before releasing the marked position, _bt_steppage makes a full copy of
463 : * the currPos struct in markPos. If (as often happens) the mark is moved
464 : * before we leave the page, we don't have to do that work.
465 : */
466 130042 : if (BTScanPosIsValid(so->currPos))
467 130042 : so->markItemIndex = so->currPos.itemIndex;
468 : else
469 : {
470 0 : BTScanPosInvalidate(so->markPos);
471 0 : so->markItemIndex = -1;
472 : }
473 130042 : }
474 :
475 : /*
476 : * btrestrpos() -- restore scan to last saved position
477 : */
478 : void
479 54018 : btrestrpos(IndexScanDesc scan)
480 : {
481 54018 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
482 :
483 54018 : if (so->markItemIndex >= 0)
484 : {
485 : /*
486 : * The scan has never moved to a new page since the last mark. Just
487 : * restore the itemIndex.
488 : *
489 : * NB: In this case we can't count on anything in so->markPos to be
490 : * accurate.
491 : */
492 53910 : so->currPos.itemIndex = so->markItemIndex;
493 : }
494 : else
495 : {
496 : /*
497 : * The scan moved to a new page after last mark or restore, and we are
498 : * now restoring to the marked page. We aren't holding any read
499 : * locks, but if we're still holding the pin for the current position,
500 : * we must drop it.
501 : */
502 108 : if (BTScanPosIsValid(so->currPos))
503 : {
504 : /* Before leaving current page, deal with any killed items */
505 108 : if (so->numKilled > 0)
506 0 : _bt_killitems(scan);
507 108 : BTScanPosUnpinIfPinned(so->currPos);
508 : }
509 :
510 108 : if (BTScanPosIsValid(so->markPos))
511 : {
512 : /* bump pin on mark buffer for assignment to current buffer */
513 108 : if (BTScanPosIsPinned(so->markPos))
514 0 : IncrBufferRefCount(so->markPos.buf);
515 108 : memcpy(&so->currPos, &so->markPos,
516 : offsetof(BTScanPosData, items[1]) +
517 108 : so->markPos.lastItem * sizeof(BTScanPosItem));
518 108 : if (so->currTuples)
519 0 : memcpy(so->currTuples, so->markTuples,
520 0 : so->markPos.nextTupleOffset);
521 : /* Reset the scan's array keys (see _bt_steppage for why) */
522 108 : if (so->numArrayKeys)
523 : {
524 0 : _bt_start_array_keys(scan, so->currPos.dir);
525 0 : so->needPrimScan = false;
526 : }
527 : }
528 : else
529 0 : BTScanPosInvalidate(so->currPos);
530 : }
531 54018 : }
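
/*
 * Example (not part of nbtree.c): btmarkpos()/btrestrpos() are exercised
 * through index_markpos() and index_restrpos(), most prominently by merge
 * join when the inner scan must be rewound over duplicate keys.  A sketch
 * of that calling pattern:
 */
static void
example_mark_restore(IndexScanDesc scan)
{
    /* positioned on the first tuple of a run of duplicate keys */
    index_markpos(scan);        /* -> btmarkpos() */

    /* ... advance with index_getnext_tid() over the duplicates ... */

    /* next outer tuple has an equal key: rewind and rescan the run */
    index_restrpos(scan);       /* -> btrestrpos() */
}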
532 :
533 : /*
534 : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
535 : */
536 : Size
537 58 : btestimateparallelscan(int nkeys, int norderbys)
538 : {
539 : /* Pessimistically assume all input scankeys will be output with arrays */
540 58 : return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
541 : }
542 :
543 : /*
544 : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
545 : */
546 : void
547 58 : btinitparallelscan(void *target)
548 : {
549 58 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
550 :
551 58 : SpinLockInit(&bt_target->btps_mutex);
552 58 : bt_target->btps_scanPage = InvalidBlockNumber;
553 58 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
554 58 : ConditionVariableInit(&bt_target->btps_cv);
555 58 : }
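
/*
 * Example (not part of nbtree.c): these two callbacks plug into the
 * generic parallel setup in indexam.c, which appends the AM-specific
 * chunk after the ParallelIndexScanDescData header and serialized
 * snapshot.  A sketch of roughly what index_parallelscan_estimate and
 * index_parallelscan_initialize do with them:
 */
static Size
example_parallel_setup(ParallelIndexScanDesc target, Snapshot snapshot,
                       int nkeys, int norderbys)
{
    Size        offset = MAXALIGN(add_size(offsetof(ParallelIndexScanDescData,
                                                    ps_snapshot_data),
                                           EstimateSnapshotSpace(snapshot)));
    Size        nbytes = add_size(offset,
                                  btestimateparallelscan(nkeys, norderbys));

    /* the btree-specific part lives at ps_offset within the DSM chunk */
    target->ps_offset = offset;
    btinitparallelscan(OffsetToPointer(target, offset));

    return nbytes;
}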
556 :
557 : /*
558 : * btparallelrescan() -- reset parallel scan
559 : */
560 : void
561 24 : btparallelrescan(IndexScanDesc scan)
562 : {
563 : BTParallelScanDesc btscan;
564 24 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
565 :
566 : Assert(parallel_scan);
567 :
568 24 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
569 : parallel_scan->ps_offset);
570 :
571 : /*
572 : * In theory, we don't need to acquire the spinlock here, because there
573 : * shouldn't be any other workers running at this point, but we do so for
574 : * consistency.
575 : */
576 24 : SpinLockAcquire(&btscan->btps_mutex);
577 24 : btscan->btps_scanPage = InvalidBlockNumber;
578 24 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
579 24 : SpinLockRelease(&btscan->btps_mutex);
580 24 : }
581 :
582 : /*
583 : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
584 : * page. Other scans must wait until we call _bt_parallel_release()
585 : * or _bt_parallel_done().
586 : *
587 : * The return value is true if we successfully seized the scan and false
588 : * if we did not. The latter case occurs if no pages remain.
589 : *
590 : * If the return value is true, *pageno returns the next or current page
591 : * of the scan (depending on the scan direction). An invalid block number
592 : * means the scan hasn't yet started, or that caller needs to start the next
593 : * primitive index scan (in the latter case we'll set so->needPrimScan).
594 : * The first time a participating process reaches the last page, it will return
595 : * true and set *pageno to P_NONE; after that, further attempts to seize the
596 : * scan will return false.
597 : *
598 : * Callers should ignore the value of pageno if the return value is false.
599 : *
600 : * Callers that are in a position to start a new primitive index scan must
601 : * pass first=true (all other callers pass first=false). We just return false
602 : * for first=false callers that require another primitive index scan.
603 : */
604 : bool
605 1764 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
606 : {
607 1764 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
608 1764 : bool exit_loop = false;
609 1764 : bool status = true;
610 1764 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
611 : BTParallelScanDesc btscan;
612 :
613 1764 : *pageno = P_NONE;
614 :
615 1764 : if (first)
616 : {
617 : /*
618 : * Initialize array related state when called from _bt_first, assuming
619 : * that this will either be the first primitive index scan for the
620 : * scan, or a previous explicitly scheduled primitive scan.
621 : *
622 : * Note: so->needPrimScan is only set when a scheduled primitive index
623 : * scan is set to be performed in caller's worker process. It should
624 : * not be set here by us for the first primitive scan, nor should we
625 : * ever set it for a parallel scan that has no array keys.
626 : */
627 434 : so->needPrimScan = false;
628 434 : so->scanBehind = false;
629 : }
630 : else
631 : {
632 : /*
633 : * Don't attempt to seize the scan when the backend requires another
634 : * primitive index scan, unless we're in a position to start it now
635 : */
636 1330 : if (so->needPrimScan)
637 36 : return false;
638 : }
639 :
640 1728 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
641 : parallel_scan->ps_offset);
642 :
643 : while (1)
644 : {
645 1728 : SpinLockAcquire(&btscan->btps_mutex);
646 :
647 1728 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
648 : {
649 : /* We're done with this parallel index scan */
650 316 : status = false;
651 : }
652 1412 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
653 : {
654 : Assert(so->numArrayKeys);
655 :
656 : /*
657 : * If we can start another primitive scan right away, do so.
658 : * Otherwise just wait.
659 : */
660 36 : if (first)
661 : {
662 36 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
663 72 : for (int i = 0; i < so->numArrayKeys; i++)
664 : {
665 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
666 36 : ScanKey skey = &so->keyData[array->scan_key];
667 :
668 36 : array->cur_elem = btscan->btps_arrElems[i];
669 36 : skey->sk_argument = array->elem_values[array->cur_elem];
670 : }
671 36 : so->needPrimScan = true;
672 36 : so->scanBehind = false;
673 36 : *pageno = InvalidBlockNumber;
674 36 : exit_loop = true;
675 : }
676 : }
677 1376 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
678 : {
679 : /*
680 : * We have successfully seized control of the scan for the purpose
681 : * of advancing it to a new page!
682 : */
683 1376 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
684 1376 : *pageno = btscan->btps_scanPage;
685 1376 : exit_loop = true;
686 : }
687 1728 : SpinLockRelease(&btscan->btps_mutex);
688 1728 : if (exit_loop || !status)
689 : break;
690 0 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
691 : }
692 1728 : ConditionVariableCancelSleep();
693 :
694 1728 : return status;
695 : }
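
/*
 * Example (not part of nbtree.c): seen from one worker, the protocol built
 * from these routines is seize, then read, then hand back (or finish).  A
 * simplified sketch of the pattern used by _bt_first and the page-stepping
 * code; next_blkno is illustrative, the real callers derive it from the
 * page's right link, calling _bt_parallel_done() when none remains:
 */
static bool
example_seize_protocol(IndexScanDesc scan, BlockNumber next_blkno)
{
    BlockNumber blkno;

    if (!_bt_parallel_seize(scan, &blkno, false))
        return false;       /* scan done, or so->needPrimScan was set */

    /*
     * We own the advance: read page blkno, then hand the scan back before
     * doing the per-tuple work on that page.
     */
    _bt_parallel_release(scan, next_blkno);

    /* ... match the tuples from blkno against the scan keys ... */

    return true;
}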
696 :
697 : /*
698 : * _bt_parallel_release() -- Complete the process of advancing the scan to a
699 : * new page. We now have the new value btps_scanPage; some other backend
700 : * can now begin advancing the scan.
701 : *
702 : * Callers whose scan uses array keys must save their scan_page argument so
703 : * that it can be passed to _bt_parallel_primscan_schedule, should caller
704 : * determine that another primitive index scan is required. If that happens,
705 : * scan_page won't be scanned by any backend (unless the next primitive index
706 : * scan lands on scan_page).
707 : */
708 : void
709 1330 : _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
710 : {
711 1330 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
712 : BTParallelScanDesc btscan;
713 :
714 1330 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
715 : parallel_scan->ps_offset);
716 :
717 1330 : SpinLockAcquire(&btscan->btps_mutex);
718 1330 : btscan->btps_scanPage = scan_page;
719 1330 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
720 1330 : SpinLockRelease(&btscan->btps_mutex);
721 1330 : ConditionVariableSignal(&btscan->btps_cv);
722 1330 : }
723 :
724 : /*
725 : * _bt_parallel_done() -- Mark the parallel scan as complete.
726 : *
727 : * When there are no pages left to scan, this function should be called to
728 : * notify other workers. Otherwise, they might wait forever for the scan to
729 : * advance to the next page.
730 : */
731 : void
732 5544720 : _bt_parallel_done(IndexScanDesc scan)
733 : {
734 5544720 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
735 : BTParallelScanDesc btscan;
736 5544720 : bool status_changed = false;
737 :
738 : /* Do nothing, for non-parallel scans */
739 5544720 : if (parallel_scan == NULL)
740 5544608 : return;
741 :
742 112 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
743 : parallel_scan->ps_offset);
744 :
745 : /*
746 : * Mark the parallel scan as done, unless some other process did so
747 : * already
748 : */
749 112 : SpinLockAcquire(&btscan->btps_mutex);
750 112 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
751 : {
752 82 : btscan->btps_pageStatus = BTPARALLEL_DONE;
753 82 : status_changed = true;
754 : }
755 112 : SpinLockRelease(&btscan->btps_mutex);
756 :
757 : /* wake up all the workers associated with this parallel scan */
758 112 : if (status_changed)
759 82 : ConditionVariableBroadcast(&btscan->btps_cv);
760 : }
761 :
762 : /*
763 : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
764 : *
765 : * Caller passes the block number most recently passed to _bt_parallel_release
766 : * by its backend. Caller successfully schedules the next primitive index scan
767 : * if the shared parallel state hasn't been seized since caller's backend last
768 : * advanced the scan.
769 : */
770 : void
771 36 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
772 : {
773 36 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
774 36 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
775 : BTParallelScanDesc btscan;
776 :
777 : Assert(so->numArrayKeys);
778 :
779 36 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
780 : parallel_scan->ps_offset);
781 :
782 36 : SpinLockAcquire(&btscan->btps_mutex);
783 36 : if (btscan->btps_scanPage == prev_scan_page &&
784 36 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
785 : {
786 36 : btscan->btps_scanPage = InvalidBlockNumber;
787 36 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
788 :
789 : /* Serialize scan's current array keys */
790 72 : for (int i = 0; i < so->numArrayKeys; i++)
791 : {
792 36 : BTArrayKeyInfo *array = &so->arrayKeys[i];
793 :
794 36 : btscan->btps_arrElems[i] = array->cur_elem;
795 : }
796 : }
797 36 : SpinLockRelease(&btscan->btps_mutex);
798 36 : }
799 :
800 : /*
801 : * Bulk deletion of all index entries pointing to a set of heap tuples.
802 : * The set of target tuples is specified via a callback routine that tells
803 : * whether any given heap tuple (identified by ItemPointer) is being deleted.
804 : *
805 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
806 : */
807 : IndexBulkDeleteResult *
808 2170 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
809 : IndexBulkDeleteCallback callback, void *callback_state)
810 : {
811 2170 : Relation rel = info->index;
812 : BTCycleId cycleid;
813 :
814 : /* allocate stats if first time through, else re-use existing struct */
815 2170 : if (stats == NULL)
816 2170 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
817 :
818 : /* Establish the vacuum cycle ID to use for this scan */
819 : /* The ENSURE stuff ensures we clean up shared memory on failure */
820 2170 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
821 : {
822 2170 : cycleid = _bt_start_vacuum(rel);
823 :
824 2170 : btvacuumscan(info, stats, callback, callback_state, cycleid);
825 : }
826 2170 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
827 2170 : _bt_end_vacuum(rel);
828 :
829 2170 : return stats;
830 : }
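
/*
 * Example (not part of nbtree.c): VACUUM reaches btbulkdelete() through
 * the index_bulk_delete() wrapper; the callback is vacuum's TID-is-dead
 * test.  A heavily hedged sketch patterned on vacuum.c; the helper name
 * and dead-items representation are hypothetical:
 */
static bool
example_tid_is_dead(ItemPointer itemptr, void *state)
{
    /* hypothetical lookup: was this heap TID collected as dead? */
    return example_dead_items_lookup(state, itemptr);
}

static IndexBulkDeleteResult *
example_bulkdel(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat,
                void *dead_items)
{
    /* one physical index scan, deleting entries whose heap TIDs are dead */
    return index_bulk_delete(ivinfo, istat, example_tid_is_dead, dead_items);
}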
831 :
832 : /*
833 : * Post-VACUUM cleanup.
834 : *
835 : * Result: a palloc'd struct containing statistical info for VACUUM displays.
836 : */
837 : IndexBulkDeleteResult *
838 41044 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
839 : {
840 : BlockNumber num_delpages;
841 :
842 : /* No-op in ANALYZE ONLY mode */
843 41044 : if (info->analyze_only)
844 14800 : return stats;
845 :
846 : /*
847 : * If btbulkdelete was called, we need not do anything (we just maintain
848 : * the information used within _bt_vacuum_needs_cleanup() by calling
849 : * _bt_set_cleanup_info() below).
850 : *
851 : * If btbulkdelete was _not_ called, then we have a choice to make: we
852 : * must decide whether or not a btvacuumscan() call is needed now (i.e.
853 : * whether the ongoing VACUUM operation can entirely avoid a physical scan
854 : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
855 : * now.
856 : */
857 26244 : if (stats == NULL)
858 : {
859 : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
860 24670 : if (!_bt_vacuum_needs_cleanup(info->index))
861 24660 : return NULL;
862 :
863 : /*
864 : * Since we aren't going to actually delete any leaf items, there's no
865 : * need to go through all the vacuum-cycle-ID pushups here.
866 : *
867 : * Posting list tuples are a source of inaccuracy for cleanup-only
868 : * scans. btvacuumscan() will assume that the number of index tuples
869 : * from each page can be used as num_index_tuples, even though
870 : * num_index_tuples is supposed to represent the number of TIDs in the
871 : * index. This naive approach can underestimate the number of tuples
872 : * in the index significantly.
873 : *
874 : * We handle the problem by making num_index_tuples an estimate in
875 : * cleanup-only case.
876 : */
877 10 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
878 10 : btvacuumscan(info, stats, NULL, NULL, 0);
879 10 : stats->estimated_count = true;
880 : }
881 :
882 : /*
883 : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
884 : *
885 : * num_delpages is the number of deleted pages now in the index that were
886 : * not safe to place in the FSM to be recycled just yet. num_delpages is
887 : * greater than 0 only when _bt_pagedel() actually deleted pages during
888 : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
889 : * have failed to place any newly deleted pages in the FSM just moments
890 : * ago. (Actually, there are edge cases where recycling of the current
891 : * VACUUM's newly deleted pages does not even become safe by the time the
892 : * next VACUUM comes around. See nbtree/README.)
893 : */
894 : Assert(stats->pages_deleted >= stats->pages_free);
895 1584 : num_delpages = stats->pages_deleted - stats->pages_free;
896 1584 : _bt_set_cleanup_info(info->index, num_delpages);
897 :
898 : /*
899 : * It's quite possible for us to be fooled by concurrent page splits into
900 : * double-counting some index tuples, so disbelieve any total that exceeds
901 : * the underlying heap's count ... if we know that accurately. Otherwise
902 : * this might just make matters worse.
903 : */
904 1584 : if (!info->estimated_count)
905 : {
906 1552 : if (stats->num_index_tuples > info->num_heap_tuples)
907 4 : stats->num_index_tuples = info->num_heap_tuples;
908 : }
909 :
910 1584 : return stats;
911 : }
912 :
913 : /*
914 : * btvacuumscan --- scan the index for VACUUMing purposes
915 : *
916 : * This combines the functions of looking for leaf tuples that are deletable
917 : * according to the vacuum callback, looking for empty pages that can be
918 : * deleted, and looking for old deleted pages that can be recycled. Both
919 : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
920 : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
921 : *
922 : * The caller is responsible for initially allocating/zeroing a stats struct
923 : * and for obtaining a vacuum cycle ID if necessary.
924 : */
925 : static void
926 2180 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
927 : IndexBulkDeleteCallback callback, void *callback_state,
928 : BTCycleId cycleid)
929 : {
930 2180 : Relation rel = info->index;
931 : BTVacState vstate;
932 : BlockNumber num_pages;
933 : BlockNumber scanblkno;
934 : bool needLock;
935 :
936 : /*
937 : * Reset fields that track information about the entire index now. This
938 : * avoids double-counting in the case where a single VACUUM command
939 : * requires multiple scans of the index.
940 : *
941 : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
942 : * since they track information about the VACUUM command, and so must last
943 : * across each call to btvacuumscan().
944 : *
945 : * (Note that pages_free is treated as state about the whole index, not
946 : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
947 : * calls are idempotent, and get repeated for the same deleted pages in
948 : * some scenarios. The point for us is to track the number of recyclable
949 : * pages in the index at the end of the VACUUM command.)
950 : */
951 2180 : stats->num_pages = 0;
952 2180 : stats->num_index_tuples = 0;
953 2180 : stats->pages_deleted = 0;
954 2180 : stats->pages_free = 0;
955 :
956 : /* Set up info to pass down to btvacuumpage */
957 2180 : vstate.info = info;
958 2180 : vstate.stats = stats;
959 2180 : vstate.callback = callback;
960 2180 : vstate.callback_state = callback_state;
961 2180 : vstate.cycleid = cycleid;
962 :
963 : /* Create a temporary memory context to run _bt_pagedel in */
964 2180 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
965 : "_bt_pagedel",
966 : ALLOCSET_DEFAULT_SIZES);
967 :
968 : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
969 2180 : vstate.bufsize = 0;
970 2180 : vstate.maxbufsize = 0;
971 2180 : vstate.pendingpages = NULL;
972 2180 : vstate.npendingpages = 0;
973 : /* Consider applying _bt_pendingfsm_finalize optimization */
974 2180 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
975 :
976 : /*
977 : * The outer loop iterates over all index pages except the metapage, in
978 : * physical order (we hope the kernel will cooperate in providing
979 : * read-ahead for speed). It is critical that we visit all leaf pages,
980 : * including ones added after we start the scan, else we might fail to
981 : * delete some deletable tuples. Hence, we must repeatedly check the
982 : * relation length. We must acquire the relation-extension lock while
983 : * doing so to avoid a race condition: if someone else is extending the
984 : * relation, there is a window where bufmgr/smgr have created a new
985 : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
986 : * we manage to scan such a page here, we'll improperly assume it can be
987 : * recycled. Taking the lock synchronizes things enough to prevent a
988 : * problem: either num_pages won't include the new page, or _bt_getbuf
989 : * already has write lock on the buffer and it will be fully initialized
990 : * before we can examine it. Also, we need not worry if a page is added
991 : * immediately after we look; the page splitting code already has
992 : * write-lock on the left page before it adds a right page, so we must
993 : * already have processed any tuples due to be moved into such a page.
994 : *
995 : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
996 : * think the use of the extension lock is still required.
997 : *
998 : * We can skip locking for new or temp relations, however, since no one
999 : * else could be accessing them.
1000 : */
1001 2180 : needLock = !RELATION_IS_LOCAL(rel);
1002 :
1003 2180 : scanblkno = BTREE_METAPAGE + 1;
1004 : for (;;)
1005 : {
1006 : /* Get the current relation length */
1007 4112 : if (needLock)
1008 4108 : LockRelationForExtension(rel, ExclusiveLock);
1009 4112 : num_pages = RelationGetNumberOfBlocks(rel);
1010 4112 : if (needLock)
1011 4108 : UnlockRelationForExtension(rel, ExclusiveLock);
1012 :
1013 4112 : if (info->report_progress)
1014 944 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1015 : num_pages);
1016 :
1017 : /* Quit if we've scanned the whole relation */
1018 4112 : if (scanblkno >= num_pages)
1019 2180 : break;
1020 : /* Iterate over pages, then loop back to recheck length */
1021 18464 : for (; scanblkno < num_pages; scanblkno++)
1022 : {
1023 16532 : btvacuumpage(&vstate, scanblkno);
1024 16532 : if (info->report_progress)
1025 438 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1026 : scanblkno);
1027 : }
1028 : }
1029 :
1030 : /* Set statistics num_pages field to final size of index */
1031 2180 : stats->num_pages = num_pages;
1032 :
1033 2180 : MemoryContextDelete(vstate.pagedelcontext);
1034 :
1035 : /*
1036 : * If there were any calls to _bt_pagedel() during scan of the index then
1037 : * see if any of the resulting pages can be placed in the FSM now. When
1038 : * it's not safe we'll have to leave it up to a future VACUUM operation.
1039 : *
1040 : * Finally, if we placed any pages in the FSM (either just now or during
1041 : * the scan), forcibly update the upper-level FSM pages to ensure that
1042 : * searchers can find them.
1043 : */
1044 2180 : _bt_pendingfsm_finalize(rel, &vstate);
1045 2180 : if (stats->pages_free > 0)
1046 14 : IndexFreeSpaceMapVacuum(rel);
1047 2180 : }
1048 :
1049 : /*
1050 : * btvacuumpage --- VACUUM one page
1051 : *
1052 : * This processes a single page for btvacuumscan(). In some cases we must
1053 : * backtrack to re-examine and VACUUM pages that were the scanblkno during
1054 : * a previous call here. This is how we handle page splits (that happened
1055 : * after our cycleid was acquired) whose right half page happened to reuse
1056 : * a block that we might have processed at some point before it was
1057 : * recycled (i.e. before the page split).
1058 : */
1059 : static void
1060 16532 : btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1061 : {
1062 16532 : IndexVacuumInfo *info = vstate->info;
1063 16532 : IndexBulkDeleteResult *stats = vstate->stats;
1064 16532 : IndexBulkDeleteCallback callback = vstate->callback;
1065 16532 : void *callback_state = vstate->callback_state;
1066 16532 : Relation rel = info->index;
1067 16532 : Relation heaprel = info->heaprel;
1068 : bool attempt_pagedel;
1069 : BlockNumber blkno,
1070 : backtrack_to;
1071 : Buffer buf;
1072 : Page page;
1073 : BTPageOpaque opaque;
1074 :
1075 16532 : blkno = scanblkno;
1076 :
1077 16532 : backtrack:
1078 :
1079 16532 : attempt_pagedel = false;
1080 16532 : backtrack_to = P_NONE;
1081 :
1082 : /* call vacuum_delay_point while not holding any buffer lock */
1083 16532 : vacuum_delay_point();
1084 :
1085 : /*
1086 : * We can't use _bt_getbuf() here because it always applies
1087 : * _bt_checkpage(), which will barf on an all-zero page. We want to
1088 : * recycle all-zero pages, not fail. Also, we want to use a nondefault
1089 : * buffer access strategy.
1090 : */
1091 16532 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1092 : info->strategy);
1093 16532 : _bt_lockbuf(rel, buf, BT_READ);
1094 16532 : page = BufferGetPage(buf);
1095 16532 : opaque = NULL;
1096 16532 : if (!PageIsNew(page))
1097 : {
1098 16532 : _bt_checkpage(rel, buf);
1099 16532 : opaque = BTPageGetOpaque(page);
1100 : }
1101 :
1102 : Assert(blkno <= scanblkno);
1103 16532 : if (blkno != scanblkno)
1104 : {
1105 : /*
1106 : * We're backtracking.
1107 : *
1108 : * We followed a right link to a sibling leaf page (a page that
1109 : * happens to be from a block located before scanblkno). The only
1110 : * case we want to do anything with is a live leaf page having the
1111 : * current vacuum cycle ID.
1112 : *
1113 : * The page had better be in a state that's consistent with what we
1114 : * expect. Check for conditions that imply corruption in passing. It
1115 : * can't be half-dead because only an interrupted VACUUM process can
1116 : * leave pages in that state, so we'd definitely have dealt with it
1117 : * back when the page was the scanblkno page (half-dead pages are
1118 : * always marked fully deleted by _bt_pagedel(), barring corruption).
1119 : */
1120 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1121 : {
1122 : Assert(false);
1123 0 : ereport(LOG,
1124 : (errcode(ERRCODE_INDEX_CORRUPTED),
1125 : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1126 : blkno, scanblkno, RelationGetRelationName(rel))));
1127 0 : _bt_relbuf(rel, buf);
1128 0 : return;
1129 : }
1130 :
1131 : /*
1132 : * We may have already processed the page in an earlier call, when the
1133 : * page was scanblkno. This happens when the leaf page split occurred
1134 : * after the scan began, but before the right sibling page became the
1135 : * scanblkno.
1136 : *
1137 : * Page may also have been deleted by current btvacuumpage() call,
1138 : * since _bt_pagedel() sometimes deletes the right sibling page of
1139 : * scanblkno in passing (it does so after we decided where to
1140 : * backtrack to). We don't need to process this page as a deleted
1141 : * page a second time now (in fact, it would be wrong to count it as a
1142 : * deleted page in the bulk delete statistics a second time).
1143 : */
1144 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1145 : {
1146 : /* Done with current scanblkno (and all lower split pages) */
1147 0 : _bt_relbuf(rel, buf);
1148 0 : return;
1149 : }
1150 : }
1151 :
1152 16532 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1153 : {
1154 : /* Okay to recycle this page (which could be leaf or internal) */
1155 72 : RecordFreeIndexPage(rel, blkno);
1156 72 : stats->pages_deleted++;
1157 72 : stats->pages_free++;
1158 : }
1159 16460 : else if (P_ISDELETED(opaque))
1160 : {
1161 : /*
1162 : * Already deleted page (which could be leaf or internal). Can't
1163 : * recycle yet.
1164 : */
1165 178 : stats->pages_deleted++;
1166 : }
1167 16282 : else if (P_ISHALFDEAD(opaque))
1168 : {
1169 : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1170 0 : attempt_pagedel = true;
1171 :
1172 : /*
1173 : * _bt_pagedel() will increment both pages_newly_deleted and
1174 : * pages_deleted stats in all cases (barring corruption)
1175 : */
1176 : }
1177 16282 : else if (P_ISLEAF(opaque))
1178 : {
1179 : OffsetNumber deletable[MaxIndexTuplesPerPage];
1180 : int ndeletable;
1181 : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1182 : int nupdatable;
1183 : OffsetNumber offnum,
1184 : minoff,
1185 : maxoff;
1186 : int nhtidsdead,
1187 : nhtidslive;
1188 :
1189 : /*
1190 : * Trade in the initial read lock for a full cleanup lock on this
1191 : * page. We must get such a lock on every leaf page over the course
1192 : * of the vacuum scan, whether or not it actually contains any
1193 : * deletable tuples --- see nbtree/README.
1194 : */
1195 15236 : _bt_upgradelockbufcleanup(rel, buf);
1196 :
1197 : /*
1198 : * Check whether we need to backtrack to earlier pages. What we are
1199 : * concerned about is a page split that happened since we started the
1200 : * vacuum scan. If the split moved tuples on the right half of the
1201 : * split (i.e. the tuples that sort high) to a block that we already
1202 : * passed over, then we might have missed the tuples. We need to
1203 : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1204 : * or deleting scanblkno page below!)
1205 : */
1206 15236 : if (vstate->cycleid != 0 &&
1207 15120 : opaque->btpo_cycleid == vstate->cycleid &&
1208 0 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1209 0 : !P_RIGHTMOST(opaque) &&
1210 0 : opaque->btpo_next < scanblkno)
1211 0 : backtrack_to = opaque->btpo_next;
1212 :
1213 15236 : ndeletable = 0;
1214 15236 : nupdatable = 0;
1215 15236 : minoff = P_FIRSTDATAKEY(opaque);
1216 15236 : maxoff = PageGetMaxOffsetNumber(page);
1217 15236 : nhtidsdead = 0;
1218 15236 : nhtidslive = 0;
1219 15236 : if (callback)
1220 : {
1221 : /* btbulkdelete callback tells us what to delete (or update) */
1222 2778726 : for (offnum = minoff;
1223 : offnum <= maxoff;
1224 2763606 : offnum = OffsetNumberNext(offnum))
1225 : {
1226 : IndexTuple itup;
1227 :
1228 2763606 : itup = (IndexTuple) PageGetItem(page,
1229 : PageGetItemId(page, offnum));
1230 :
1231 : Assert(!BTreeTupleIsPivot(itup));
1232 2763606 : if (!BTreeTupleIsPosting(itup))
1233 : {
1234 : /* Regular tuple, standard table TID representation */
1235 2673078 : if (callback(&itup->t_tid, callback_state))
1236 : {
1237 1115192 : deletable[ndeletable++] = offnum;
1238 1115192 : nhtidsdead++;
1239 : }
1240 : else
1241 1557886 : nhtidslive++;
1242 : }
1243 : else
1244 : {
1245 : BTVacuumPosting vacposting;
1246 : int nremaining;
1247 :
1248 : /* Posting list tuple */
1249 90528 : vacposting = btreevacuumposting(vstate, itup, offnum,
1250 : &nremaining);
1251 90528 : if (vacposting == NULL)
1252 : {
1253 : /*
1254 : * All table TIDs from the posting tuple remain, so no
1255 : * delete or update required
1256 : */
1257 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1258 : }
1259 51304 : else if (nremaining > 0)
1260 : {
1261 :
1262 : /*
1263 : * Store metadata about posting list tuple in
1264 : * updatable array for entire page. Existing tuple
1265 : * will be updated during the later call to
1266 : * _bt_delitems_vacuum().
1267 : */
1268 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1269 22584 : updatable[nupdatable++] = vacposting;
1270 22584 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1271 : }
1272 : else
1273 : {
1274 : /*
1275 : * All table TIDs from the posting list must be
1276 : * deleted. We'll delete the index tuple completely
1277 : * (no update required).
1278 : */
1279 : Assert(nremaining == 0);
1280 28720 : deletable[ndeletable++] = offnum;
1281 28720 : nhtidsdead += BTreeTupleGetNPosting(itup);
1282 28720 : pfree(vacposting);
1283 : }
1284 :
1285 90528 : nhtidslive += nremaining;
1286 : }
1287 : }
1288 : }
1289 :
1290 : /*
1291 : * Apply any needed deletes or updates. We issue just one
1292 : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1293 : */
1294 15236 : if (ndeletable > 0 || nupdatable > 0)
1295 : {
1296 : Assert(nhtidsdead >= ndeletable + nupdatable);
1297 10544 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1298 : nupdatable);
1299 :
1300 10544 : stats->tuples_removed += nhtidsdead;
1301 : /* must recompute maxoff */
1302 10544 : maxoff = PageGetMaxOffsetNumber(page);
1303 :
1304 : /* can't leak memory here */
1305 33128 : for (int i = 0; i < nupdatable; i++)
1306 22584 : pfree(updatable[i]);
1307 : }
1308 : else
1309 : {
1310 : /*
1311 : * If the leaf page has been split during this vacuum cycle, it
1312 : * seems worth expending a write to clear btpo_cycleid even if we
1313 : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1314 : * takes care of this.) This ensures we won't process the page
1315 : * again.
1316 : *
1317 : * We treat this like a hint-bit update because there's no need to
1318 : * WAL-log it.
1319 : */
1320 : Assert(nhtidsdead == 0);
1321 4692 : if (vstate->cycleid != 0 &&
1322 4576 : opaque->btpo_cycleid == vstate->cycleid)
1323 : {
1324 0 : opaque->btpo_cycleid = 0;
1325 0 : MarkBufferDirtyHint(buf, true);
1326 : }
1327 : }
1328 :
1329 : /*
1330 : * If the leaf page is now empty, try to delete it; else count the
1331 : * live tuples (live table TIDs in posting lists are counted as
1332 : * separate live tuples). We don't delete when backtracking, though,
1333 : * since that would require teaching _bt_pagedel() about backtracking
1334 : * (doesn't seem worth adding more complexity to deal with that).
1335 : *
1336 : * We don't count the number of live TIDs during cleanup-only calls to
1337 : * btvacuumscan (i.e. when callback is not set). We count the number
1338 : * of index tuples directly instead. This avoids the expense of
1339 : * directly examining all of the tuples on each page. VACUUM will
1340 : * treat num_index_tuples as an estimate in cleanup-only case, so it
1341 : * doesn't matter that this underestimates num_index_tuples
1342 : * significantly in some cases.
1343 : */
1344 15236 : if (minoff > maxoff)
1345 4734 : attempt_pagedel = (blkno == scanblkno);
1346 10502 : else if (callback)
1347 10394 : stats->num_index_tuples += nhtidslive;
1348 : else
1349 108 : stats->num_index_tuples += maxoff - minoff + 1;
1350 :
1351 : Assert(!attempt_pagedel || nhtidslive == 0);
1352 : }
1353 :
1354 16532 : if (attempt_pagedel)
1355 : {
1356 : MemoryContext oldcontext;
1357 :
1358 : /* Run pagedel in a temp context to avoid memory leakage */
1359 4734 : MemoryContextReset(vstate->pagedelcontext);
1360 4734 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1361 :
1362 : /*
1363 : * _bt_pagedel maintains the bulk delete stats on our behalf;
1364 : * pages_newly_deleted and pages_deleted are likely to be incremented
1365 : * during call
1366 : */
1367 : Assert(blkno == scanblkno);
1368 4734 : _bt_pagedel(rel, buf, vstate);
1369 :
1370 4734 : MemoryContextSwitchTo(oldcontext);
1371 : /* pagedel released buffer, so we shouldn't */
1372 : }
1373 : else
1374 11798 : _bt_relbuf(rel, buf);
1375 :
1376 16532 : if (backtrack_to != P_NONE)
1377 : {
1378 0 : blkno = backtrack_to;
1379 0 : goto backtrack;
1380 : }
1381 : }
1382 :
1383 : /*
1384 : * btreevacuumposting --- determine TIDs still needed in posting list
1385 : *
1386 : * Returns metadata describing how to build replacement tuple without the TIDs
1387 : * that VACUUM needs to delete. Returned value is NULL in the common case
1388 : * where no changes are needed to caller's posting list tuple (we avoid
1389 : * allocating memory here as an optimization).
1390 : *
1391 : * The number of TIDs that should remain in the posting list tuple is set for
1392 : * caller in *nremaining.
1393 : */
1394 : static BTVacuumPosting
1395 90528 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1396 : OffsetNumber updatedoffset, int *nremaining)
1397 : {
1398 90528 : int live = 0;
1399 90528 : int nitem = BTreeTupleGetNPosting(posting);
1400 90528 : ItemPointer items = BTreeTupleGetPosting(posting);
1401 90528 : BTVacuumPosting vacposting = NULL;
1402 :
1403 529880 : for (int i = 0; i < nitem; i++)
1404 : {
1405 439352 : if (!vstate->callback(items + i, vstate->callback_state))
1406 : {
1407 : /* Live table TID */
1408 253396 : live++;
1409 : }
1410 185956 : else if (vacposting == NULL)
1411 : {
1412 : /*
1413 : * First dead table TID encountered.
1414 : *
1415 : * It's now clear that we need to delete one or more dead table
1416 : * TIDs, so start maintaining metadata describing how to update
1417 : * existing posting list tuple.
1418 : */
1419 51304 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1420 : nitem * sizeof(uint16));
1421 :
1422 51304 : vacposting->itup = posting;
1423 51304 : vacposting->updatedoffset = updatedoffset;
1424 51304 : vacposting->ndeletedtids = 0;
1425 51304 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1426 : }
1427 : else
1428 : {
1429 : /* Second or subsequent dead table TID */
1430 134652 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1431 : }
1432 : }
1433 :
1434 90528 : *nremaining = live;
1435 90528 : return vacposting;
1436 : }
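
/*
 * Worked example of the contract above: suppose 'posting' holds four TIDs
 * and the callback reports the TIDs at 0-based positions 1 and 3 dead.
 * The returned metadata is then equivalent to:
 *
 *   vacposting->itup = posting;
 *   vacposting->updatedoffset = updatedoffset;
 *   vacposting->ndeletedtids = 2;
 *   vacposting->deletetids[0] = 1;
 *   vacposting->deletetids[1] = 3;
 *   *nremaining = 2;            (TIDs at positions 0 and 2 survive)
 *
 * _bt_delitems_vacuum() later overwrites the tuple at updatedoffset with
 * a version containing only the two surviving TIDs.
 */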
1437 :
1438 : /*
1439 : * btcanreturn() -- Check whether btree indexes support index-only scans.
1440 : *
1441 : * btrees always do, so this is trivial.
1442 : */
1443 : bool
1444 912254 : btcanreturn(Relation index, int attno)
1445 : {
1446 912254 : return true;
1447 : }
|