Line data Source code
1 : /*
2 : * brin_pageops.c
3 : * Page-handling routines for BRIN indexes
4 : *
5 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/access/brin/brin_pageops.c
10 : */
11 : #include "postgres.h"
12 :
13 : #include "access/brin_page.h"
14 : #include "access/brin_pageops.h"
15 : #include "access/brin_revmap.h"
16 : #include "access/brin_xlog.h"
17 : #include "access/xloginsert.h"
18 : #include "miscadmin.h"
19 : #include "storage/bufmgr.h"
20 : #include "storage/freespace.h"
21 : #include "storage/lmgr.h"
22 : #include "utils/rel.h"
23 :
24 : /*
25 : * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page. We can tolerate
26 : * a single item per page, unlike other index AMs.
 *
 * The value is the block size minus the MAXALIGNed sum of the page header
 * and one line pointer, minus the MAXALIGNed BRIN special space, with the
 * result rounded down to a MAXALIGN boundary.
27 : */
28 : #define BrinMaxItemSize \
29 : MAXALIGN_DOWN(BLCKSZ - \
30 : (MAXALIGN(SizeOfPageHeaderData + \
31 : sizeof(ItemIdData)) + \
32 : MAXALIGN(sizeof(BrinSpecialSpace))))
33 : /* Prototypes for the file-local helpers used by the routines in this file. */
34 : static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
35 : bool *extended);
36 : static Size br_page_get_freespace(Page page);
37 : static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
38 :
39 :
40 : /*
41 : * Update tuple origtup (size origsz), located in offset oldoff of buffer
42 : * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
43 : * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit.
44 : *
45 : * If samepage is true, attempt to put the new tuple in the same page, but if
46 : * there's no room, use some other one.
 * (As coded here, when samepage is true no alternative buffer is acquired:
 * if the tuple turns out not to fit, or the page is flagged for evacuation,
 * the function unlocks oldbuf and returns false so the caller can retry.)
47 : *
48 : * If the update is successful, return true; the revmap is updated to point to
49 : * the new tuple. If the update is not done for whatever reason, return false.
50 : * Caller may retry the update if this happens.
 *
 * newsz must already be MAXALIGNed (asserted).  An ERROR is raised when newsz
 * exceeds BrinMaxItemSize.  origtup/origsz are compared against the tuple
 * currently stored at oldoff to detect concurrent modification.
 * pagesPerRange is handed to brinSetHeapBlockItemptr and stored in the WAL
 * record when the tuple moves to a different page.
 *
 * Whenever a page obtained by extending the relation ends up not being used,
 * it is initialized and recorded in the FSM before returning.
51 : */
52 : bool
53 27450 : brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
54 : BrinRevmap *revmap, BlockNumber heapBlk,
55 : Buffer oldbuf, OffsetNumber oldoff,
56 : const BrinTuple *origtup, Size origsz,
57 : const BrinTuple *newtup, Size newsz,
58 : bool samepage)
59 : {
60 : Page oldpage;
61 : ItemId oldlp;
62 : BrinTuple *oldtup;
63 : Size oldsz;
64 : Buffer newbuf;
65 27450 : BlockNumber newblk = InvalidBlockNumber;
66 : bool extended;
67 :
68 : Assert(newsz == MAXALIGN(newsz));
69 :
70 : /* If the item is oversized, don't bother. */
71 27450 : if (newsz > BrinMaxItemSize)
72 : {
73 0 : ereport(ERROR,
74 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
75 : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
76 : newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
77 : return false; /* keep compiler quiet */
78 : }
79 :
80 : /* make sure the revmap is long enough to contain the entry we need */
81 27450 : brinRevmapExtend(revmap, heapBlk);
82 :
83 27450 : if (!samepage)
84 : {
85 : /* need a page on which to put the item */
86 614 : newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
87 614 : if (!BufferIsValid(newbuf))
88 : {
89 : Assert(!extended);
90 0 : return false;
91 : }
92 :
93 : /*
94 : * Note: it's possible (though unlikely) that the returned newbuf is
95 : * the same as oldbuf, if brin_getinsertbuffer determined that the old
96 : * buffer does in fact have enough space.
97 : */
98 614 : if (newbuf == oldbuf)
99 : {
100 : Assert(!extended);
101 0 : newbuf = InvalidBuffer;
102 : }
103 : else
104 614 : newblk = BufferGetBlockNumber(newbuf);
105 : }
106 : else
107 : {
 /* brin_getinsertbuffer is not called on this path, so lock oldbuf here */
108 26836 : LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
109 26836 : newbuf = InvalidBuffer;
110 26836 : extended = false;
111 : }
112 27450 : oldpage = BufferGetPage(oldbuf);
113 27450 : oldlp = PageGetItemId(oldpage, oldoff);
114 :
115 : /*
116 : * Check that the old tuple wasn't updated concurrently: it might have
117 : * moved someplace else entirely, and for that matter the whole page
118 : * might've become a revmap page. Note that in the first two cases
119 : * checked here, the "oldlp" we just calculated is garbage; but
120 : * PageGetItemId() is simple enough that it was safe to do that
121 : * calculation anyway.
122 : */
123 54900 : if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
124 27450 : oldoff > PageGetMaxOffsetNumber(oldpage) ||
125 27450 : !ItemIdIsNormal(oldlp))
126 : {
127 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
128 :
129 : /*
130 : * If this happens, and the new buffer was obtained by extending the
131 : * relation, then we need to ensure we don't leave it uninitialized or
132 : * forget about it.
133 : */
134 0 : if (BufferIsValid(newbuf))
135 : {
136 0 : if (extended)
137 0 : brin_initialize_empty_new_buffer(idxrel, newbuf);
138 0 : UnlockReleaseBuffer(newbuf);
139 0 : if (extended)
140 0 : FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
141 : }
142 0 : return false;
143 : }
144 :
145 27450 : oldsz = ItemIdGetLength(oldlp);
146 27450 : oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
147 :
148 : /*
149 : * ... or it might have been updated in place to different contents.
150 : */
151 27450 : if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
152 : {
153 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
154 0 : if (BufferIsValid(newbuf))
155 : {
156 : /* As above, initialize and record new page if we got one */
157 0 : if (extended)
158 0 : brin_initialize_empty_new_buffer(idxrel, newbuf);
159 0 : UnlockReleaseBuffer(newbuf);
160 0 : if (extended)
161 0 : FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
162 : }
163 0 : return false;
164 : }
165 :
166 : /*
167 : * Great, the old tuple is intact. We can proceed with the update.
168 : *
169 : * If there's enough room in the old page for the new tuple, replace it.
170 : *
171 : * Note that there might now be enough space on the page even though the
172 : * caller told us there isn't, if a concurrent update moved another tuple
173 : * elsewhere or replaced a tuple with a smaller one.
 *
 * A page flagged BRIN_EVACUATE_PAGE is never updated in place, regardless
 * of how much room it has.
174 : */
175 54318 : if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
176 26868 : brin_can_do_samepage_update(oldbuf, origsz, newsz))
177 : {
178 26836 : START_CRIT_SECTION();
179 26836 : if (!PageIndexTupleOverwrite(oldpage, oldoff, newtup, newsz))
180 0 : elog(ERROR, "failed to replace BRIN tuple");
181 26836 : MarkBufferDirty(oldbuf);
182 :
183 : /* XLOG stuff */
184 26836 : if (RelationNeedsWAL(idxrel))
185 : {
186 : xl_brin_samepage_update xlrec;
187 : XLogRecPtr recptr;
188 26830 : uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE;
189 :
190 26830 : xlrec.offnum = oldoff;
191 :
192 26830 : XLogBeginInsert();
193 26830 : XLogRegisterData(&xlrec, SizeOfBrinSamepageUpdate);
194 :
195 26830 : XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
196 26830 : XLogRegisterBufData(0, newtup, newsz);
197 :
198 26830 : recptr = XLogInsert(RM_BRIN_ID, info);
199 :
200 26830 : PageSetLSN(oldpage, recptr);
201 : }
202 :
203 26836 : END_CRIT_SECTION();
204 :
205 26836 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
206 :
 /* A second buffer may have been acquired even though it was not needed */
207 26836 : if (BufferIsValid(newbuf))
208 : {
209 : /* As above, initialize and record new page if we got one */
210 0 : if (extended)
211 0 : brin_initialize_empty_new_buffer(idxrel, newbuf);
212 0 : UnlockReleaseBuffer(newbuf);
213 0 : if (extended)
214 0 : FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
215 : }
216 :
217 26836 : return true;
218 : }
219 614 : else if (newbuf == InvalidBuffer)
220 : {
221 : /*
222 : * Not enough space, but caller said that there was. Tell them to
223 : * start over.
224 : */
225 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
226 0 : return false;
227 : }
228 : else
229 : {
230 : /*
231 : * Not enough free space on the oldpage. Put the new tuple on the new
232 : * page, and update the revmap.
233 : */
234 614 : Page newpage = BufferGetPage(newbuf);
235 : Buffer revmapbuf;
236 : ItemPointerData newtid;
237 : OffsetNumber newoff;
238 614 : Size freespace = 0;
239 :
240 614 : revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
241 :
242 614 : START_CRIT_SECTION();
243 :
244 : /*
245 : * We need to initialize the page if it's newly obtained. Note we
246 : * will WAL-log the initialization as part of the update, so we don't
247 : * need to do that here.
248 : */
249 614 : if (extended)
250 22 : brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);
251 :
 /* Remove the old copy, add the new one, then repoint the revmap below */
252 614 : PageIndexTupleDeleteNoCompact(oldpage, oldoff);
253 614 : newoff = PageAddItem(newpage, newtup, newsz, InvalidOffsetNumber, false, false);
254 614 : if (newoff == InvalidOffsetNumber)
255 0 : elog(ERROR, "failed to add BRIN tuple to new page");
256 614 : MarkBufferDirty(oldbuf);
257 614 : MarkBufferDirty(newbuf);
258 :
259 : /* needed to update FSM below */
260 614 : if (extended)
261 22 : freespace = br_page_get_freespace(newpage);
262 :
263 614 : ItemPointerSet(&newtid, newblk, newoff);
264 614 : brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
265 614 : MarkBufferDirty(revmapbuf);
266 :
267 : /* XLOG stuff */
268 614 : if (RelationNeedsWAL(idxrel))
269 : {
270 : xl_brin_update xlrec;
271 : XLogRecPtr recptr;
272 : uint8 info;
273 :
274 614 : info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
275 :
276 614 : xlrec.insert.offnum = newoff;
277 614 : xlrec.insert.heapBlk = heapBlk;
278 614 : xlrec.insert.pagesPerRange = pagesPerRange;
279 614 : xlrec.oldOffnum = oldoff;
280 :
281 614 : XLogBeginInsert();
282 :
283 : /* new page */
284 614 : XLogRegisterData(&xlrec, SizeOfBrinUpdate);
285 :
286 614 : XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
287 614 : XLogRegisterBufData(0, newtup, newsz);
288 :
289 : /* revmap page */
290 614 : XLogRegisterBuffer(1, revmapbuf, 0);
291 :
292 : /* old page */
293 614 : XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);
294 :
295 614 : recptr = XLogInsert(RM_BRIN_ID, info);
296 :
 /* All three modified pages get the LSN of the single record */
297 614 : PageSetLSN(oldpage, recptr);
298 614 : PageSetLSN(newpage, recptr);
299 614 : PageSetLSN(BufferGetPage(revmapbuf), recptr);
300 : }
301 :
302 614 : END_CRIT_SECTION();
303 :
 /* revmapbuf and oldbuf are only unlocked here; newbuf is also unpinned */
304 614 : LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
305 614 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
306 614 : UnlockReleaseBuffer(newbuf);
307 :
 /* FSM is updated only for a freshly created page, after locks are dropped */
308 614 : if (extended)
309 : {
310 22 : RecordPageWithFreeSpace(idxrel, newblk, freespace);
311 22 : FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
312 : }
313 :
314 614 : return true;
315 : }
316 : }
317 :
318 : /*
319 : * Return whether brin_doupdate can do a samepage update.
320 : */
321 : bool
322 53736 : brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
323 : {
324 : return
325 61656 : ((newsz <= origsz) ||
326 7920 : PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
327 : }
328 :
329 : /*
330 : * Insert an index tuple into the index relation. The revmap is updated to
331 : * mark the range containing the given page as pointing to the inserted entry.
332 : * A WAL record is written.
 * (The WAL record is emitted only when RelationNeedsWAL(idxrel) is true.)
333 : *
334 : * The buffer, if valid, is first checked for free space to insert the new
335 : * entry; if there isn't enough, a new buffer is obtained and pinned. No
336 : * buffer lock must be held on entry, no buffer lock is held on exit.
 *
 * *buffer is an in/out argument: on return it identifies the buffer the
 * tuple was placed on.  This function unlocks that buffer but does not
 * release the pin on it.
 *
 * itemsz must already be MAXALIGNed (asserted).  An ERROR is raised when
 * itemsz exceeds BrinMaxItemSize.
337 : *
338 : * Return value is the offset number where the tuple was inserted.
339 : */
340 : OffsetNumber
341 5656 : brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
342 : BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
343 : const BrinTuple *tup, Size itemsz)
344 : {
345 : Page page;
346 : BlockNumber blk;
347 : OffsetNumber off;
348 5656 : Size freespace = 0;
349 : Buffer revmapbuf;
350 : ItemPointerData tid;
351 : bool extended;
352 :
353 : Assert(itemsz == MAXALIGN(itemsz));
354 :
355 : /* If the item is oversized, don't even bother. */
356 5656 : if (itemsz > BrinMaxItemSize)
357 : {
358 0 : ereport(ERROR,
359 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
360 : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
361 : itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
362 : return InvalidOffsetNumber; /* keep compiler quiet */
363 : }
364 :
365 : /* Make sure the revmap is long enough to contain the entry we need */
366 5656 : brinRevmapExtend(revmap, heapBlk);
367 :
368 : /*
369 : * Acquire lock on buffer supplied by caller, if any. If it doesn't have
370 : * enough space, unpin it to obtain a new one below.
371 : */
372 5656 : if (BufferIsValid(*buffer))
373 : {
374 : /*
375 : * It's possible that another backend (or ourselves!) extended the
376 : * revmap over the page we held a pin on, so we cannot assume that
377 : * it's still a regular page.
 *
 * br_page_get_freespace returns 0 for non-regular or evacuating
 * pages, so such a page is rejected by the size test below.
378 : */
379 2348 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
380 2348 : if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
381 : {
382 120 : UnlockReleaseBuffer(*buffer);
383 120 : *buffer = InvalidBuffer;
384 : }
385 : }
386 :
387 : /*
388 : * If we still don't have a usable buffer, have brin_getinsertbuffer
389 : * obtain one for us.
 * Keep calling it until it hands back a valid buffer.
390 : */
391 5656 : if (!BufferIsValid(*buffer))
392 : {
393 : do
394 3428 : *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
395 3428 : while (!BufferIsValid(*buffer));
396 : }
397 : else
398 2228 : extended = false;
399 :
400 : /* Now obtain lock on revmap buffer */
401 5656 : revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
402 :
403 5656 : page = BufferGetPage(*buffer);
404 5656 : blk = BufferGetBlockNumber(*buffer);
405 :
406 : /* Execute the actual insertion */
407 5656 : START_CRIT_SECTION();
408 5656 : if (extended)
409 486 : brin_page_init(page, BRIN_PAGETYPE_REGULAR);
410 5656 : off = PageAddItem(page, tup, itemsz, InvalidOffsetNumber, false, false);
411 5656 : if (off == InvalidOffsetNumber)
412 0 : elog(ERROR, "failed to add BRIN tuple to new page");
413 5656 : MarkBufferDirty(*buffer);
414 :
415 : /* needed to update FSM below */
416 5656 : if (extended)
417 486 : freespace = br_page_get_freespace(page);
418 :
 /* Point the revmap entry for heapBlk at the tuple just added */
419 5656 : ItemPointerSet(&tid, blk, off);
420 5656 : brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
421 5656 : MarkBufferDirty(revmapbuf);
422 :
423 : /* XLOG stuff */
424 5656 : if (RelationNeedsWAL(idxrel))
425 : {
426 : xl_brin_insert xlrec;
427 : XLogRecPtr recptr;
428 : uint8 info;
429 :
430 4736 : info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
431 4736 : xlrec.heapBlk = heapBlk;
432 4736 : xlrec.pagesPerRange = pagesPerRange;
433 4736 : xlrec.offnum = off;
434 :
435 4736 : XLogBeginInsert();
436 4736 : XLogRegisterData(&xlrec, SizeOfBrinInsert);
437 :
438 4736 : XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
439 4736 : XLogRegisterBufData(0, tup, itemsz);
440 :
441 4736 : XLogRegisterBuffer(1, revmapbuf, 0);
442 :
443 4736 : recptr = XLogInsert(RM_BRIN_ID, info);
444 :
445 4736 : PageSetLSN(page, recptr);
446 4736 : PageSetLSN(BufferGetPage(revmapbuf), recptr);
447 : }
448 :
449 5656 : END_CRIT_SECTION();
450 :
451 : /* Tuple is firmly on buffer; we can release our locks */
452 5656 : LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
453 5656 : LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
454 :
455 : BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
456 : blk, off, heapBlk));
457 :
 /* FSM is updated only for a freshly created page, after locks are dropped */
458 5656 : if (extended)
459 : {
460 486 : RecordPageWithFreeSpace(idxrel, blk, freespace);
461 486 : FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
462 : }
463 :
464 5656 : return off;
465 : }
466 :
467 : /*
468 : * Initialize a page with the given type.
469 : *
470 : * Caller is responsible for marking it dirty, as appropriate.
471 : */
472 : void
473 1518 : brin_page_init(Page page, uint16 type)
474 : {
475 1518 : PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
476 :
477 1518 : BrinPageType(page) = type;
478 1518 : }
479 :
480 : /*
481 : * Initialize a new BRIN index's metapage.
482 : */
483 : void
484 448 : brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
485 : {
486 : BrinMetaPageData *metadata;
487 :
488 448 : brin_page_init(page, BRIN_PAGETYPE_META);
489 :
490 448 : metadata = (BrinMetaPageData *) PageGetContents(page);
491 :
492 448 : metadata->brinMagic = BRIN_META_MAGIC;
493 448 : metadata->brinVersion = version;
494 448 : metadata->pagesPerRange = pagesPerRange;
495 :
496 : /*
497 : * Note we cheat here a little. 0 is not a valid revmap block number
498 : * (because it's the metapage buffer), but doing this enables the first
499 : * revmap page to be created when the index is.
500 : */
501 448 : metadata->lastRevmapPage = 0;
502 :
503 : /*
504 : * Set pd_lower just past the end of the metadata. This is essential,
505 : * because without doing so, metadata will be lost if xlog.c compresses
506 : * the page.
507 : */
508 448 : ((PageHeader) page)->pd_lower =
509 448 : ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page;
510 448 : }
511 :
512 : /*
513 : * Initiate page evacuation protocol.
514 : *
515 : * The page must be locked in exclusive mode by the caller.
516 : *
517 : * If the page is not yet initialized or empty, return false without doing
518 : * anything; it can be used for revmap without any further changes. If it
519 : * contains tuples, mark it for evacuation and return true.
520 : */
521 : bool
522 370 : brin_start_evacuating_page(Relation idxRel, Buffer buf)
523 : {
524 : OffsetNumber off;
525 : OffsetNumber maxoff;
526 : Page page;
527 :
528 370 : page = BufferGetPage(buf);
529 :
530 370 : if (PageIsNew(page))
531 366 : return false;
532 :
533 4 : maxoff = PageGetMaxOffsetNumber(page);
534 584 : for (off = FirstOffsetNumber; off <= maxoff; off++)
535 : {
536 : ItemId lp;
537 :
538 582 : lp = PageGetItemId(page, off);
539 582 : if (ItemIdIsUsed(lp))
540 : {
541 : /*
542 : * Prevent other backends from adding more stuff to this page:
543 : * BRIN_EVACUATE_PAGE informs br_page_get_freespace that this page
544 : * can no longer be used to add new tuples. Note that this flag
545 : * is not WAL-logged, except accidentally.
546 : */
547 2 : BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
548 2 : MarkBufferDirtyHint(buf, true);
549 :
550 2 : return true;
551 : }
552 : }
553 2 : return false;
554 : }
555 :
556 : /*
557 : * Move all tuples out of a page.
558 : *
559 : * The caller must hold lock on the page. The lock and pin are released.
 *
 * The page is expected to carry BRIN_EVACUATE_PAGE already (asserted).  Each
 * used item is relocated by calling brin_doupdate with identical old and new
 * tuples and samepage = false; because of the evacuate flag brin_doupdate
 * will not rewrite the tuple in place.
560 : */
561 : void
562 2 : brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
563 : BrinRevmap *revmap, Buffer buf)
564 : {
565 : OffsetNumber off;
566 : OffsetNumber maxoff;
567 : Page page;
568 2 : BrinTuple *btup = NULL;
569 2 : Size btupsz = 0;
570 :
571 2 : page = BufferGetPage(buf);
572 :
573 : Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);
574 :
575 2 : maxoff = PageGetMaxOffsetNumber(page);
576 584 : for (off = FirstOffsetNumber; off <= maxoff; off++)
577 : {
578 : BrinTuple *tup;
579 : Size sz;
580 : ItemId lp;
581 :
582 582 : CHECK_FOR_INTERRUPTS();
583 :
584 582 : lp = PageGetItemId(page, off);
585 582 : if (ItemIdIsUsed(lp))
586 : {
587 582 : sz = ItemIdGetLength(lp);
588 582 : tup = (BrinTuple *) PageGetItem(page, lp);
 /*
 * Work from a private copy, since the buffer lock is dropped next.
 * NOTE(review): btup is never reassigned in this function, so the
 * destination passed here is always NULL -- presumably a fresh copy
 * is allocated on every iteration; confirm against brin_copy_tuple.
 */
589 582 : tup = brin_copy_tuple(tup, sz, btup, &btupsz);
590 :
 /* brin_doupdate requires the old buffer to be unlocked on entry */
591 582 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
592 :
593 582 : if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
594 : buf, off, tup, sz, tup, sz, false))
595 0 : off--; /* retry */
596 :
 /* Re-lock (share mode) before looking at the page again */
597 582 : LockBuffer(buf, BUFFER_LOCK_SHARE);
598 :
599 : /* It's possible that someone extended the revmap over this page */
600 582 : if (!BRIN_IS_REGULAR_PAGE(page))
601 0 : break;
602 : }
603 : }
604 :
605 2 : UnlockReleaseBuffer(buf);
606 2 : }
607 :
608 : /*
609 : * Given a BRIN index page, initialize it if necessary, and record its
610 : * current free space in the FSM.
611 : *
612 : * The main use for this is when, during vacuuming, an uninitialized page is
613 : * found, which could be the result of relation extension followed by a crash
614 : * before the page can be used.
615 : *
616 : * Here, we don't bother to update upper FSM pages, instead expecting that our
617 : * caller (brin_vacuum_scan) will fix them at the end of the scan. Elsewhere
618 : * in this file, it's generally a good idea to propagate additions of free
619 : * space into the upper FSM pages immediately.
 *
 * Metapages and revmap pages are left alone: nothing is recorded in the FSM
 * for them.  Any buffer lock taken here is released before returning.
620 : */
621 : void
622 476 : brin_page_cleanup(Relation idxrel, Buffer buf)
623 : {
624 476 : Page page = BufferGetPage(buf);
625 :
626 : /*
627 : * If a page was left uninitialized, initialize it now; also record it in
628 : * FSM.
629 : *
630 : * Somebody else might be extending the relation concurrently. To avoid
631 : * re-initializing the page before they can grab the buffer lock, we
632 : * acquire the extension lock momentarily. Since they hold the extension
633 : * lock from before getting the page and after it's been initialized, we're
634 : * sure to see their initialization.
635 : */
636 476 : if (PageIsNew(page))
637 : {
638 0 : LockRelationForExtension(idxrel, ShareLock);
639 0 : UnlockRelationForExtension(idxrel, ShareLock);
640 :
641 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 /* Re-check under the buffer lock: someone may have initialized it */
642 0 : if (PageIsNew(page))
643 : {
 /* The helper also records the page in the FSM, so we are done */
644 0 : brin_initialize_empty_new_buffer(idxrel, buf);
645 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
646 0 : return;
647 : }
648 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
649 : }
650 :
651 : /* Nothing to be done for non-regular index pages */
652 476 : if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
653 370 : BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
654 212 : return;
655 :
656 : /* Measure free space and record it */
657 264 : RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
658 : br_page_get_freespace(page));
659 : }
660 :
661 : /*
662 : * Return a pinned and exclusively locked buffer which can be used to insert an
663 : * index item of size itemsz (caller must ensure not to request sizes
664 : * impossible to fulfill). If oldbuf is a valid buffer, it is also locked (in
665 : * an order determined to avoid deadlocks).
 * (The order used below is by block number: whichever of the old and new
 * pages has the lower block number is locked first.)
666 : *
667 : * If we find that the old page is no longer a regular index page (because
668 : * of a revmap extension), the old buffer is unlocked and we return
669 : * InvalidBuffer.
670 : *
671 : * If there's no existing page with enough free space to accommodate the new
672 : * item, the relation is extended. If this happens, *extended is set to true,
673 : * and it is the caller's responsibility to initialize the page (and WAL-log
674 : * that fact) prior to use. The caller should also update the FSM with the
675 : * page's remaining free space after the insertion.
676 : *
677 : * Note that the caller is not expected to update FSM unless *extended is set
678 : * true. This policy means that we'll update FSM when a page is created, and
679 : * when it's found to have too little space for a desired tuple insertion,
680 : * but not every single time we add a tuple to the page.
681 : *
682 : * Note that in some corner cases it is possible for this routine to extend
683 : * the relation and then not return the new page. It is this routine's
684 : * responsibility to WAL-log the page initialization and to record the page in
685 : * FSM if that happens, since the caller certainly can't do it.
 *
 * The returned buffer may be oldbuf itself, when the FSM or the cached
 * target block points at the old page and it has enough room.  On success
 * the chosen block is remembered with RelationSetTargetBlock.
686 : */
687 : static Buffer
688 4042 : brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
689 : bool *extended)
690 : {
691 : BlockNumber oldblk;
692 : BlockNumber newblk;
693 : Page page;
694 : Size freespace;
695 :
696 : /* callers must have checked */
697 : Assert(itemsz <= BrinMaxItemSize);
698 :
699 4042 : if (BufferIsValid(oldbuf))
700 614 : oldblk = BufferGetBlockNumber(oldbuf);
701 : else
702 3428 : oldblk = InvalidBlockNumber;
703 :
704 : /* Choose initial target page, re-using existing target if known */
705 4042 : newblk = RelationGetTargetBlock(irel);
706 4042 : if (newblk == InvalidBlockNumber)
707 436 : newblk = GetPageWithFreeSpace(irel, itemsz);
708 :
709 : /*
710 : * Loop until we find a page with sufficient free space. By the time we
711 : * return to caller out of this loop, both buffers are valid and locked;
712 : * if we have to restart here, neither page is locked and newblk isn't
713 : * pinned (if it's even valid).
714 : */
715 : for (;;)
716 154 : {
717 : Buffer buf;
718 4196 : bool extensionLockHeld = false;
719 :
720 4196 : CHECK_FOR_INTERRUPTS();
721 :
722 4196 : *extended = false;
723 :
724 4196 : if (newblk == InvalidBlockNumber)
725 : {
726 : /*
727 : * There's not enough free space in any existing index page,
728 : * according to the FSM: extend the relation to obtain a shiny new
729 : * page.
730 : *
731 : * XXX: It's likely possible to use RBM_ZERO_AND_LOCK here,
732 : * which'd avoid the need to hold the extension lock during buffer
733 : * reclaim.
734 : */
735 508 : if (!RELATION_IS_LOCAL(irel))
736 : {
737 58 : LockRelationForExtension(irel, ExclusiveLock);
738 58 : extensionLockHeld = true;
739 : }
740 508 : buf = ReadBuffer(irel, P_NEW);
741 508 : newblk = BufferGetBlockNumber(buf);
742 508 : *extended = true;
743 :
744 : BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
745 : BufferGetBlockNumber(buf)));
746 : }
747 3688 : else if (newblk == oldblk)
748 : {
749 : /*
750 : * There's an odd corner-case here where the FSM is out-of-date,
751 : * and gave us the old page.
 * No extra pin is taken in this case; buf simply aliases oldbuf.
752 : */
753 26 : buf = oldbuf;
754 : }
755 : else
756 : {
757 3662 : buf = ReadBuffer(irel, newblk);
758 : }
759 :
760 : /*
761 : * We lock the old buffer first, if it's earlier than the new one; but
762 : * then we need to check that it hasn't been turned into a revmap page
763 : * concurrently. If we detect that that happened, give up and tell
764 : * caller to start over.
765 : */
766 4196 : if (BufferIsValid(oldbuf) && oldblk < newblk)
767 : {
768 622 : LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
769 622 : if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
770 : {
771 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
772 :
773 : /*
774 : * It is possible that the new page was obtained from
775 : * extending the relation. In that case, we must be sure to
776 : * record it in the FSM before leaving, because otherwise the
777 : * space would be lost forever. However, we cannot let an
778 : * uninitialized page get in the FSM, so we need to initialize
779 : * it first.
780 : */
781 0 : if (*extended)
782 0 : brin_initialize_empty_new_buffer(irel, buf);
783 :
784 0 : if (extensionLockHeld)
785 0 : UnlockRelationForExtension(irel, ExclusiveLock);
786 :
 /* buf was never locked on this path, so only the pin is dropped */
787 0 : ReleaseBuffer(buf);
788 :
789 0 : if (*extended)
790 : {
791 0 : FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
792 : /* shouldn't matter, but don't confuse caller */
793 0 : *extended = false;
794 : }
795 :
796 0 : return InvalidBuffer;
797 : }
798 : }
799 :
800 4196 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
801 :
 /* The extension lock is kept only until the new buffer is locked */
802 4196 : if (extensionLockHeld)
803 58 : UnlockRelationForExtension(irel, ExclusiveLock);
804 :
805 4196 : page = BufferGetPage(buf);
806 :
807 : /*
808 : * We have a new buffer to insert into. Check that the new page has
809 : * enough free space, and return it if it does; otherwise start over.
810 : * (br_page_get_freespace also checks that the FSM didn't hand us a
811 : * page that has since been repurposed for the revmap.)
 *
 * A page we just created is not initialized yet, so it is not
 * measured; BrinMaxItemSize is used as its free space instead.
812 : */
813 8392 : freespace = *extended ?
814 4196 : BrinMaxItemSize : br_page_get_freespace(page);
815 4196 : if (freespace >= itemsz)
816 : {
817 4042 : RelationSetTargetBlock(irel, newblk);
818 :
819 : /*
820 : * Lock the old buffer if not locked already. Note that in this
821 : * case we know for sure it's a regular page: it's later than the
822 : * new page we just got, which is not a revmap page, and revmap
823 : * pages are always consecutive.
824 : */
825 4042 : if (BufferIsValid(oldbuf) && oldblk > newblk)
826 : {
827 0 : LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
828 : Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
829 : }
830 :
831 4042 : return buf;
832 : }
833 :
834 : /* This page is no good. */
835 :
836 : /*
837 : * If an entirely new page does not contain enough free space for the
838 : * new item, then surely that item is oversized. Complain loudly; but
839 : * first make sure we initialize the page and record it as free, for
840 : * next time.
841 : */
842 154 : if (*extended)
843 : {
844 0 : brin_initialize_empty_new_buffer(irel, buf);
845 : /* since this should not happen, skip FreeSpaceMapVacuum */
846 :
847 0 : ereport(ERROR,
848 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
849 : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
850 : itemsz, freespace, RelationGetRelationName(irel))));
851 : return InvalidBuffer; /* keep compiler quiet */
852 : }
853 :
 /*
 * Undo this iteration's locking.  When buf aliases oldbuf
 * (newblk == oldblk) the single unlock in the second test covers it;
 * when oldblk > newblk the old buffer was never locked in this pass.
 */
854 154 : if (newblk != oldblk)
855 128 : UnlockReleaseBuffer(buf);
856 154 : if (BufferIsValid(oldbuf) && oldblk <= newblk)
857 34 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
858 :
859 : /*
860 : * Update the FSM with the new, presumably smaller, freespace value
861 : * for this page, then search for a new target page.
862 : */
863 154 : newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
864 : }
865 : }
866 :
867 : /*
868 : * Initialize a page as an empty regular BRIN page, WAL-log this, and record
869 : * the page in FSM.
870 : *
871 : * There are several corner situations in which we extend the relation to
872 : * obtain a new page and later find that we cannot use it immediately. When
873 : * that happens, we don't want to let the page go unrecorded in FSM, because
874 : * there is no mechanism to get the space back and the index would bloat.
875 : * Also, because we would not WAL-log the action that would initialize the
876 : * page, the page would go uninitialized in a standby (or after recovery).
877 : *
878 : * While we record the page in FSM here, caller is responsible for doing FSM
879 : * upper-page update if that seems appropriate.
 *
 * This function takes and releases no buffer locks itself.
 * NOTE(review): presumably the caller holds an exclusive lock on 'buffer',
 * as every call site in this file appears to -- confirm.
880 : */
881 : static void
882 0 : brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
883 : {
884 : Page page;
885 :
886 : BRIN_elog((DEBUG2,
887 : "brin_initialize_empty_new_buffer: initializing blank page %u",
888 : BufferGetBlockNumber(buffer)));
889 :
 /*
 * Initialization, dirtying and WAL-logging happen in one critical section.
 * NOTE(review): unlike the other WAL paths in this file, the call to
 * log_newpage_buffer is not guarded by RelationNeedsWAL(idxrel) -- confirm
 * that this is intended.
 */
890 0 : START_CRIT_SECTION();
891 0 : page = BufferGetPage(buffer);
892 0 : brin_page_init(page, BRIN_PAGETYPE_REGULAR);
893 0 : MarkBufferDirty(buffer);
894 0 : log_newpage_buffer(buffer, true);
895 0 : END_CRIT_SECTION();
896 :
897 : /*
898 : * We update the FSM for this page, but this is not WAL-logged. This is
899 : * acceptable because VACUUM will scan the index and update the FSM with
900 : * pages whose FSM records were forgotten in a crash.
901 : */
902 0 : RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
903 : br_page_get_freespace(page));
904 0 : }
905 :
906 :
907 : /*
908 : * Return the amount of free space on a regular BRIN index page.
909 : *
910 : * If the page is not a regular page, or has been marked with the
911 : * BRIN_EVACUATE_PAGE flag, returns 0.
912 : */
913 : static Size
914 6808 : br_page_get_freespace(Page page)
915 : {
916 6808 : if (!BRIN_IS_REGULAR_PAGE(page) ||
917 6808 : (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
918 0 : return 0;
919 : else
920 6808 : return PageGetFreeSpace(page);
921 : }
|