/*-------------------------------------------------------------------------
 *
 * gistvacuum.c
 *    vacuuming routines for the postgres GiST index access method.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/gist/gistvacuum.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/gist_private.h"
#include "access/transam.h"
#include "commands/vacuum.h"
#include "lib/integerset.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/read_stream.h"
#include "utils/memutils.h"

/* Working state needed by gistbulkdelete */
typedef struct
{
    IndexVacuumInfo *info;
    IndexBulkDeleteResult *stats;
    IndexBulkDeleteCallback callback;
    void       *callback_state;
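    /*
     * Insert LSN as of the start of the scan; a page whose NSN is newer
     * than this was split after the scan began (see gistvacuumpage).
     */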
    GistNSN     startNSN;

    /*
     * These sets remember the block numbers of all internal pages and all
     * empty leaf pages seen during the scan.  They are used in the second
     * stage, to delete all the empty pages.
     */
    IntegerSet *internal_page_set;
    IntegerSet *empty_leaf_set;
    MemoryContext page_set_context;
} GistVacState;

static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
                           IndexBulkDeleteCallback callback, void *callback_state);
static void gistvacuumpage(GistVacState *vstate, Buffer buffer);
static void gistvacuum_delete_empty_pages(IndexVacuumInfo *info,
                                          GistVacState *vstate);
static bool gistdeletepage(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
                           Buffer parentBuffer, OffsetNumber downlink,
                           Buffer leafBuffer);

/*
 * VACUUM bulkdelete stage: remove index entries.
 */
IndexBulkDeleteResult *
gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

    gistvacuumscan(info, stats, callback, callback_state);

    return stats;
}

/*
 * VACUUM cleanup stage: delete empty pages, and update index statistics.
 */
IndexBulkDeleteResult *
gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        return stats;

    /*
     * If gistbulkdelete was called, we need not do anything, just return the
     * stats from the latest gistbulkdelete call.  If it wasn't called, we
     * still need to do a pass over the index, to obtain index statistics.
     */
    if (stats == NULL)
    {
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        gistvacuumscan(info, stats, NULL, NULL);
    }

    /*
     * It's quite possible for us to be fooled by concurrent page splits into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
    if (!info->estimated_count)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    return stats;
}

/*
 * gistvacuumscan --- scan the index for VACUUMing purposes
 *
111 : * vacuum callback, and updates the stats. Both btbulkdelete and
112 : * btvacuumcleanup invoke this (the latter only if no btbulkdelete call
113 : * occurred).
114 : *
 *
 * This also makes note of any empty leaf pages, as well as all internal
 * pages while looping over all index pages.  After scanning all the pages, we
118 : * added directly to the free space map. (They should've been added there
119 : * when they were originally deleted, already, but it's possible that the FSM
120 : * was lost at a crash, for example.)
121 : *
 *
 * The caller is responsible for initially allocating/zeroing a stats struct.
 */
static void
gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    rel = info->index;
    GistVacState vstate;
    BlockNumber num_pages;
    bool        needLock;
    MemoryContext oldctx;
    BlockRangeReadStreamPrivate p;
    ReadStream *stream = NULL;

    /*
     * Reset fields that track information about the entire index now.  This
     * avoids double-counting in the case where a single VACUUM command
     * requires multiple scans of the index.
     *
     * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
     * since they track information about the VACUUM command, and so must last
     * across each call to gistvacuumscan().
     *
     * (Note that pages_free is treated as state about the whole index, not
     * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
     * calls are idempotent, and get repeated for the same deleted pages in
     * some scenarios.  The point for us is to track the number of recyclable
     * pages in the index at the end of the VACUUM command.)
     */
    stats->num_pages = 0;
    stats->estimated_count = false;
    stats->num_index_tuples = 0;
    stats->pages_deleted = 0;
    stats->pages_free = 0;

    /*
     * Create the integer sets to remember all the internal and the empty leaf
     * pages in page_set_context.  Internally, the integer set will remember
     * this context so that the subsequent allocations for these integer sets
     * will be done from the same context.
     *
     * XXX the allocation sizes used below pre-date generation context's block
     * growing code.  These values should likely be benchmarked and set to
     * more suitable values.
     */
    vstate.page_set_context = GenerationContextCreate(CurrentMemoryContext,
                                                      "GiST VACUUM page set context",
                                                      16 * 1024,
                                                      16 * 1024,
                                                      16 * 1024);
    oldctx = MemoryContextSwitchTo(vstate.page_set_context);
    vstate.internal_page_set = intset_create();
    vstate.empty_leaf_set = intset_create();
    MemoryContextSwitchTo(oldctx);

    /* Set up info to pass down to gistvacuumpage */
    vstate.info = info;
    vstate.stats = stats;
    vstate.callback = callback;
    vstate.callback_state = callback_state;
183 84 : vstate.startNSN = GetInsertRecPtr();
184 : else
185 0 : vstate.startNSN = gistGetFakeLSN(rel);
186 :
187 : /*
188 : * The outer loop iterates over all index pages, in physical order (we
189 : * hope the kernel will cooperate in providing read-ahead for speed). It
190 : * is critical that we visit all leaf pages, including ones added after we
191 : * start the scan, else we might fail to delete some deletable tuples.
192 : * Hence, we must repeatedly check the relation length. We must acquire
193 : * the relation-extension lock while doing so to avoid a race condition:
194 : * if someone else is extending the relation, there is a window where
195 : * bufmgr/smgr have created a new all-zero page but it hasn't yet been
196 : * write-locked by gistNewBuffer(). If we manage to scan such a page
197 : * here, we'll improperly assume it can be recycled. Taking the lock
198 : * synchronizes things enough to prevent a problem: either num_pages won't
199 : * include the new page, or gistNewBuffer already has write lock on the
200 : * buffer and it will be fully initialized before we can examine it. (See
201 : * also vacuumlazy.c, which has the same issue.) Also, we need not worry
202 : * if a page is added immediately after we look; the page splitting code
203 : * already has write-lock on the left page before it adds a right page, so
204 : * we must already have processed any tuples due to be moved into such a
205 : * page.
206 : *
207 : * We can skip locking for new or temp relations, however, since no one
208 : * else could be accessing them.
209 : */
210 84 : needLock = !RELATION_IS_LOCAL(rel);
211 :
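    /* GiST has no metapage: block 0 is the root, so this covers every block */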
    p.current_blocknum = GIST_ROOT_BLKNO;

    /*
     * It is safe to use batchmode as block_range_read_stream_cb takes no
     * locks.
     */
    stream = read_stream_begin_relation(READ_STREAM_FULL |
                                        READ_STREAM_USE_BATCHING,
                                        info->strategy,
                                        rel,
                                        MAIN_FORKNUM,
                                        block_range_read_stream_cb,
                                        &p,
                                        0);
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(rel);
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Quit if we've scanned the whole relation */
        if (p.current_blocknum >= num_pages)
            break;

        p.last_exclusive = num_pages;

        /* Iterate over pages, then loop back to recheck relation length */
        while (true)
        {
            Buffer      buf;

            /* call vacuum_delay_point while not holding any buffer lock */
            vacuum_delay_point(false);

            buf = read_stream_next_buffer(stream, NULL);

            if (!BufferIsValid(buf))
                break;

            gistvacuumpage(&vstate, buf);
        }

        Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);

        /*
         * We have to reset the read stream to use it again.  After returning
         * InvalidBuffer, the read stream API won't invoke our callback again
         * until the stream has been reset.
         */
        read_stream_reset(stream);
    }

    read_stream_end(stream);

    /*
     * If we found any recyclable pages (and recorded them in the FSM), then
     * forcibly update the upper-level FSM pages to ensure that searchers can
     * find them.  It's possible that the pages were also found during
     * previous scans and so this is a waste of time, but it's cheap enough
     * relative to scanning the index that it shouldn't matter much, and
     * making sure that free pages are available sooner not later seems
     * worthwhile.
     *
     * Note that if no recyclable pages exist, we don't bother vacuuming the
     * FSM at all.
     */
    if (stats->pages_free > 0)
        IndexFreeSpaceMapVacuum(rel);

    /* update statistics */
    stats->num_pages = num_pages;

    /*
     * If we saw any empty pages, try to unlink them from the tree so that
     * they can be reused.
     */
    gistvacuum_delete_empty_pages(info, &vstate);

    /* we don't need the internal and empty page sets anymore */
    MemoryContextDelete(vstate.page_set_context);
    vstate.page_set_context = NULL;
    vstate.internal_page_set = NULL;
    vstate.empty_leaf_set = NULL;
}

/*
 * gistvacuumpage --- VACUUM one page
 *
 * This processes a single page for gistbulkdelete().  `buffer` contains the
 * page to process.  In some cases we must go back and reexamine
 * previously-scanned pages; this routine recurses when necessary to handle
 * that case.
 */
static void
gistvacuumpage(GistVacState *vstate, Buffer buffer)
{
    IndexVacuumInfo *info = vstate->info;
    IndexBulkDeleteCallback callback = vstate->callback;
    void       *callback_state = vstate->callback_state;
    Relation    rel = info->index;
    BlockNumber orig_blkno = BufferGetBlockNumber(buffer);
    Page        page;
    BlockNumber recurse_to;

    /*
     * orig_blkno is the highest block number reached by the outer
     * gistvacuumscan() loop.  This will be the same as blkno unless we are
     * recursing to reexamine a previous page.
     */
    BlockNumber blkno = orig_blkno;

restart:
    recurse_to = InvalidBlockNumber;

    /*
     * We are not going to stay here for long; aggressively grab an
     * exclusive lock.
     */
    LockBuffer(buffer, GIST_EXCLUSIVE);
    page = (Page) BufferGetPage(buffer);

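    /*
     * A deleted page can be recycled only once no in-progress scan could
     * still visit it; gistdeletepage() explains the XID-based rule.
     */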
    if (gistPageRecyclable(page))
    {
        /* Okay to recycle this page */
        RecordFreeIndexPage(rel, blkno);
        vstate->stats->pages_deleted++;
        vstate->stats->pages_free++;
    }
    else if (GistPageIsDeleted(page))
    {
        /* Already deleted, but can't recycle yet */
        vstate->stats->pages_deleted++;
    }
    else if (GistPageIsLeaf(page))
    {
        OffsetNumber todelete[MaxOffsetNumber];
        int         ntodelete = 0;
        int         nremain;
        GISTPageOpaque opaque = GistPageGetOpaque(page);
        OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

        /*
         * Check whether we need to recurse back to earlier pages.  What we
         * are concerned about is a page split that happened since we started
         * the vacuum scan.  If the split moved some tuples to a lower page
         * then we might have missed 'em.  If so, set up for tail recursion.
         *
         * This is similar to the checks we do during searches, when following
         * a downlink, but we don't need to jump to higher-numbered pages,
         * because we will process them later, anyway.
         */
        if ((GistFollowRight(page) ||
             vstate->startNSN < GistPageGetNSN(page)) &&
            (opaque->rightlink != InvalidBlockNumber) &&
            (opaque->rightlink < orig_blkno))
        {
            recurse_to = opaque->rightlink;
        }

        /*
         * Scan over all items to see which ones need to be deleted according
         * to the callback function.
         */
        if (callback)
        {
            OffsetNumber off;

            for (off = FirstOffsetNumber;
                 off <= maxoff;
                 off = OffsetNumberNext(off))
            {
                ItemId      iid = PageGetItemId(page, off);
                IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

                if (callback(&(idxtuple->t_tid), callback_state))
                    todelete[ntodelete++] = off;
            }
        }

        /*
         * Apply any needed deletes.  We issue just one WAL record per page,
         * so as to minimize WAL traffic.
         */
        if (ntodelete > 0)
        {
            START_CRIT_SECTION();

            MarkBufferDirty(buffer);

            PageIndexMultiDelete(page, todelete, ntodelete);
            GistMarkTuplesDeleted(page);

            if (RelationNeedsWAL(rel))
            {
                XLogRecPtr  recptr;

                recptr = gistXLogUpdate(buffer,
                                        todelete, ntodelete,
                                        NULL, 0, InvalidBuffer);
                PageSetLSN(page, recptr);
            }
            else
                PageSetLSN(page, gistGetFakeLSN(rel));

            END_CRIT_SECTION();

            vstate->stats->tuples_removed += ntodelete;
            /* must recompute maxoff */
            maxoff = PageGetMaxOffsetNumber(page);
        }

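        /* count the tuples that remain on the page after any deletions */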
        nremain = maxoff - FirstOffsetNumber + 1;
        if (nremain == 0)
        {
            /*
             * The page is now completely empty.  Remember its block number,
             * so that we will try to delete the page in the second stage.
             *
             * Skip this when recursing, because IntegerSet requires that the
             * values are added in ascending order.  The next VACUUM will pick
             * it up.
             */
            if (blkno == orig_blkno)
                intset_add_member(vstate->empty_leaf_set, blkno);
        }
        else
            vstate->stats->num_index_tuples += nremain;
    }
    else
    {
        /*
         * On an internal page, check for "invalid tuples", left behind by an
         * incomplete page split on PostgreSQL 9.0 or below.  These are not
         * created by newer PostgreSQL versions, but unfortunately, there is
         * no version number anywhere in a GiST index, so we don't know
         * whether this index might still contain invalid tuples or not.
         */
        OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
        OffsetNumber off;

        for (off = FirstOffsetNumber;
             off <= maxoff;
             off = OffsetNumberNext(off))
        {
            ItemId      iid = PageGetItemId(page, off);
            IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

            if (GistTupleIsInvalid(idxtuple))
                ereport(LOG,
                        (errmsg("index \"%s\" contains an inner tuple marked as invalid",
                                RelationGetRelationName(rel)),
                         errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
                         errhint("Please REINDEX it.")));
        }

        /*
         * Remember the block number of this page, so that we can revisit it
         * later in gistvacuum_delete_empty_pages(), when we search for
         * parents of empty leaf pages.
         */
        if (blkno == orig_blkno)
            intset_add_member(vstate->internal_page_set, blkno);
    }

    UnlockReleaseBuffer(buffer);

482 : * This is really tail recursion, but if the compiler is too stupid to
483 : * optimize it as such, we'd eat an uncomfortably large amount of stack
484 : * space per recursion level (due to the deletable[] array). A failure is
485 : * improbable since the number of levels isn't likely to be large ... but
486 : * just in case, let's hand-optimize into a loop.
487 : */
488 2338 : if (recurse_to != InvalidBlockNumber)
489 : {
490 0 : blkno = recurse_to;
491 :
492 : /* check for vacuum delay while not holding any buffer lock */
493 0 : vacuum_delay_point(false);
494 :
495 0 : buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
496 : info->strategy);
497 0 : goto restart;
498 : }
499 2338 : }
500 :
501 : /*
502 : * Scan all internal pages, and try to delete their empty child pages.
503 : */
504 : static void
505 84 : gistvacuum_delete_empty_pages(IndexVacuumInfo *info, GistVacState *vstate)
506 : {
507 84 : Relation rel = info->index;
508 : BlockNumber empty_pages_remaining;
509 : uint64 blkno;
510 :
511 : /*
512 : * Rescan all inner pages to find those that have empty child pages.
513 : */
514 84 : empty_pages_remaining = intset_num_entries(vstate->empty_leaf_set);
515 84 : intset_begin_iterate(vstate->internal_page_set);
516 100 : while (empty_pages_remaining > 0 &&
517 14 : intset_iterate_next(vstate->internal_page_set, &blkno))
518 : {
519 : Buffer buffer;
520 : Page page;
521 : OffsetNumber off,
522 : maxoff;
523 : OffsetNumber todelete[MaxOffsetNumber];
524 : BlockNumber leafs_to_delete[MaxOffsetNumber];
525 : int ntodelete;
526 : int deleted;
527 :
528 2 : buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber) blkno,
529 : RBM_NORMAL, info->strategy);
530 :
531 2 : LockBuffer(buffer, GIST_SHARE);
532 2 : page = (Page) BufferGetPage(buffer);
533 :
534 2 : if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
535 : {
536 : /*
537 : * This page was an internal page earlier, but now it's something
538 : * else. Shouldn't happen...
539 : */
540 : Assert(false);
541 0 : UnlockReleaseBuffer(buffer);
542 0 : continue;
543 : }
544 :
545 : /*
546 : * Scan all the downlinks, and see if any of them point to empty leaf
547 : * pages.
548 : */
549 2 : maxoff = PageGetMaxOffsetNumber(page);
550 2 : ntodelete = 0;
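        /*
         * The loop condition caps ntodelete at maxoff - 1, so at least one
         * downlink always remains on the page (see also the check below).
         */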
        for (off = FirstOffsetNumber;
             off <= maxoff && ntodelete < maxoff - 1;
             off = OffsetNumberNext(off))
        {
            ItemId      iid = PageGetItemId(page, off);
            IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
            BlockNumber leafblk;

            leafblk = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
            if (intset_is_member(vstate->empty_leaf_set, leafblk))
            {
                leafs_to_delete[ntodelete] = leafblk;
                todelete[ntodelete++] = off;
            }
        }

        /*
         * In order to avoid deadlock, the child page must be locked before
         * the parent, so we must release the lock on the parent, lock the
         * child, and then re-acquire the lock on the parent.  (And we
         * wouldn't want to do I/O while holding a lock, anyway.)
         *
         * At the instant that we're not holding a lock on the parent, the
         * downlink might get moved by a concurrent insert, so we must
         * re-check that it still points to the same child page after we have
         * acquired both locks.  Also, another backend might have inserted a
         * tuple to the page, so that it is no longer empty.  gistdeletepage()
         * re-checks all these conditions.
         */
        LockBuffer(buffer, GIST_UNLOCK);

        deleted = 0;
        for (int i = 0; i < ntodelete; i++)
        {
            Buffer      leafbuf;

            /*
             * Don't remove the last downlink from the parent.  That would
             * confuse the insertion code.
             */
            if (PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
                break;

            leafbuf = ReadBufferExtended(rel, MAIN_FORKNUM, leafs_to_delete[i],
                                         RBM_NORMAL, info->strategy);
            LockBuffer(leafbuf, GIST_EXCLUSIVE);
            gistcheckpage(rel, leafbuf);

            LockBuffer(buffer, GIST_EXCLUSIVE);
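            /*
             * Each successful deletion shifts the remaining downlinks one
             * position left, so adjust the offset by the number deleted so
             * far.
             */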
            if (gistdeletepage(info, vstate->stats,
                               buffer, todelete[i] - deleted,
                               leafbuf))
                deleted++;
            LockBuffer(buffer, GIST_UNLOCK);

            UnlockReleaseBuffer(leafbuf);
        }

        ReleaseBuffer(buffer);

        /*
         * We can stop the scan as soon as we have seen downlinks to all the
         * empty pages, even if we were not able to remove them all.
         */
        empty_pages_remaining -= ntodelete;
    }
}

/*
 * gistdeletepage takes a leaf page, and its parent, and tries to delete the
 * leaf.  Both pages must be locked.
 *
 * Even if the page was empty when we first saw it, a concurrent inserter might
 * have added a tuple to it since.  Similarly, the downlink might have moved.
 * We re-check all the conditions, to make sure the page is still deletable,
 * before modifying anything.
 *
 * Returns true if the page was deleted, and false if a concurrent update
 * prevented it.
 */
static bool
gistdeletepage(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               Buffer parentBuffer, OffsetNumber downlink,
               Buffer leafBuffer)
{
    Page        parentPage = BufferGetPage(parentBuffer);
    Page        leafPage = BufferGetPage(leafBuffer);
    ItemId      iid;
    IndexTuple  idxtuple;
    XLogRecPtr  recptr;
    FullTransactionId txid;

    /*
     * Check that the leaf is still empty and deletable.
     */
    if (!GistPageIsLeaf(leafPage))
    {
        /* a leaf page should never become a non-leaf page */
        Assert(false);
        return false;
    }

    if (GistFollowRight(leafPage))
        return false;           /* don't mess with a concurrent page split */

    if (PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber)
        return false;           /* not empty anymore */

    /*
     * Ok, the leaf is deletable.  Is the downlink in the parent page still
     * valid?  It might have been moved by a concurrent insert.  We could try
     * to re-find it by scanning the page again, possibly moving right if the
     * page was split.  But for now, let's keep it simple and just give up.  The
     * next VACUUM will pick it up.
     */
    if (PageIsNew(parentPage) || GistPageIsDeleted(parentPage) ||
        GistPageIsLeaf(parentPage))
    {
        /* shouldn't happen, internal pages are never deleted */
        Assert(false);
        return false;
    }

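    /* The downlink must still exist, and must not be the only one left */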
    if (PageGetMaxOffsetNumber(parentPage) < downlink
        || PageGetMaxOffsetNumber(parentPage) <= FirstOffsetNumber)
        return false;

    iid = PageGetItemId(parentPage, downlink);
    idxtuple = (IndexTuple) PageGetItem(parentPage, iid);
    if (BufferGetBlockNumber(leafBuffer) !=
        ItemPointerGetBlockNumber(&(idxtuple->t_tid)))
        return false;

    /*
     * All good, proceed with the deletion.
     *
     * The page cannot be immediately recycled, because in-progress scans that
     * saw the downlink might still visit it.  Mark the page with the current
     * next-XID counter, so that we know when it can be recycled.  Once that
     * XID becomes older than GlobalXmin, we know that all scans that are
     * currently in progress must have ended.  (That's much more conservative
     * than needed, but let's keep it safe and simple.)
     */
    txid = ReadNextFullTransactionId();

    START_CRIT_SECTION();

    /* mark the page as deleted */
    MarkBufferDirty(leafBuffer);
    GistPageSetDeleted(leafPage, txid);
    stats->pages_newly_deleted++;
    stats->pages_deleted++;

    /* remove the downlink from the parent */
    MarkBufferDirty(parentBuffer);
    PageIndexTupleDelete(parentPage, downlink);

    if (RelationNeedsWAL(info->index))
        recptr = gistXLogPageDelete(leafBuffer, txid, parentBuffer, downlink);
    else
        recptr = gistGetFakeLSN(info->index);
    PageSetLSN(parentPage, recptr);
    PageSetLSN(leafPage, recptr);

    END_CRIT_SECTION();

    return true;
}