Line data Source code
1 : /*
2 : * brin_pageops.c
3 : * Page-handling routines for BRIN indexes
4 : *
5 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/access/brin/brin_pageops.c
10 : */
11 : #include "postgres.h"
12 :
13 : #include "access/brin_page.h"
14 : #include "access/brin_pageops.h"
15 : #include "access/brin_revmap.h"
16 : #include "access/brin_xlog.h"
17 : #include "access/xloginsert.h"
18 : #include "miscadmin.h"
19 : #include "storage/bufmgr.h"
20 : #include "storage/freespace.h"
21 : #include "storage/lmgr.h"
22 : #include "utils/rel.h"
23 :
24 : /*
25 : * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page. We can tolerate
26 : * a single item per page, unlike other index AMs.
27 : */
28 : #define BrinMaxItemSize \
29 : MAXALIGN_DOWN(BLCKSZ - \
30 : (MAXALIGN(SizeOfPageHeaderData + \
31 : sizeof(ItemIdData)) + \
32 : MAXALIGN(sizeof(BrinSpecialSpace))))
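/*
 * A worked example of the formula above, assuming the common case of
 * BLCKSZ = 8192 and 8-byte MAXALIGN (both are build-time settings, so the
 * exact number varies): SizeOfPageHeaderData is 24 and sizeof(ItemIdData)
 * is 4, so the first term rounds up to 32, and
 * MAXALIGN(sizeof(BrinSpecialSpace)) contributes another 8.  That gives
 * MAXALIGN_DOWN(8192 - 40) = 8152 bytes, i.e. nearly the whole page, which
 * is why a single item per page is an acceptable worst case.
 */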
33 :
34 : static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
35 : bool *extended);
36 : static Size br_page_get_freespace(Page page);
37 : static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
38 :
39 :
40 : /*
41 : * Update tuple origtup (size origsz), located in offset oldoff of buffer
42 : * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
43 : * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit.
44 : *
45 : * If samepage is true, attempt to put the new tuple in the same page, but if
46 : * there's no room, use some other one.
47 : *
48 : * If the update is successful, return true; the revmap is updated to point to
49 : * the new tuple. If the update is not done for whatever reason, return false.
50 : * Caller may retry the update if this happens.
51 : */
52 : bool
53 27428 : brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
54 : BrinRevmap *revmap, BlockNumber heapBlk,
55 : Buffer oldbuf, OffsetNumber oldoff,
56 : const BrinTuple *origtup, Size origsz,
57 : const BrinTuple *newtup, Size newsz,
58 : bool samepage)
59 : {
60 : Page oldpage;
61 : ItemId oldlp;
62 : BrinTuple *oldtup;
63 : Size oldsz;
64 : Buffer newbuf;
65 27428 : BlockNumber newblk = InvalidBlockNumber;
66 : bool extended;
67 :
68 : Assert(newsz == MAXALIGN(newsz));
69 :
70 : /* If the item is oversized, don't bother. */
71 27428 : if (newsz > BrinMaxItemSize)
72 : {
73 0 : ereport(ERROR,
74 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
75 : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
76 : newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
77 : return false; /* keep compiler quiet */
78 : }
79 :
80 : /* make sure the revmap is long enough to contain the entry we need */
81 27428 : brinRevmapExtend(revmap, heapBlk);
82 :
83 27428 : if (!samepage)
84 : {
85 : /* need a page on which to put the item */
86 614 : newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
87 614 : if (!BufferIsValid(newbuf))
88 : {
89 : Assert(!extended);
90 0 : return false;
91 : }
92 :
93 : /*
94 : * Note: it's possible (though unlikely) that the returned newbuf is
95 : * the same as oldbuf, if brin_getinsertbuffer determined that the old
96 : * buffer does in fact have enough space.
97 : */
98 614 : if (newbuf == oldbuf)
99 : {
100 : Assert(!extended);
101 0 : newbuf = InvalidBuffer;
102 : }
103 : else
104 614 : newblk = BufferGetBlockNumber(newbuf);
105 : }
106 : else
107 : {
108 26814 : LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
109 26814 : newbuf = InvalidBuffer;
110 26814 : extended = false;
111 : }
112 27428 : oldpage = BufferGetPage(oldbuf);
113 27428 : oldlp = PageGetItemId(oldpage, oldoff);
114 :
115 : /*
116 : * Check that the old tuple wasn't updated concurrently: it might have
117 : * moved someplace else entirely, and for that matter the whole page
118 : * might've become a revmap page. Note that in the first two cases
119 : * checked here, the "oldlp" we just calculated is garbage; but
120 : * PageGetItemId() is simple enough that it was safe to do that
121 : * calculation anyway.
122 : */
123 54856 : if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
124 27428 : oldoff > PageGetMaxOffsetNumber(oldpage) ||
125 27428 : !ItemIdIsNormal(oldlp))
126 : {
127 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
128 :
129 : /*
130 : * If this happens, and the new buffer was obtained by extending the
131 : * relation, then we need to ensure we don't leave it uninitialized or
132 : * forget about it.
133 : */
134 0 : if (BufferIsValid(newbuf))
135 : {
136 0 : if (extended)
137 0 : brin_initialize_empty_new_buffer(idxrel, newbuf);
138 0 : UnlockReleaseBuffer(newbuf);
139 0 : if (extended)
140 0 : FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
141 : }
142 0 : return false;
143 : }
144 :
145 27428 : oldsz = ItemIdGetLength(oldlp);
146 27428 : oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
147 :
148 : /*
149 : * ... or it might have been updated in place to different contents.
150 : */
151 27428 : if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
152 : {
153 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
154 0 : if (BufferIsValid(newbuf))
155 : {
156 : /* As above, initialize and record new page if we got one */
157 0 : if (extended)
158 0 : brin_initialize_empty_new_buffer(idxrel, newbuf);
159 0 : UnlockReleaseBuffer(newbuf);
160 0 : if (extended)
161 0 : FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
162 : }
163 0 : return false;
164 : }
165 :
166 : /*
167 : * Great, the old tuple is intact. We can proceed with the update.
168 : *
169 : * If there's enough room in the old page for the new tuple, replace it.
170 : *
171 : * Note that there might now be enough space on the page even though the
172 : * caller told us there isn't, if a concurrent update moved another tuple
173 : * elsewhere or replaced a tuple with a smaller one.
174 : */
175 54274 : if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
176 26846 : brin_can_do_samepage_update(oldbuf, origsz, newsz))
177 : {
178 26814 : START_CRIT_SECTION();
179 26814 : if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz))
180 0 : elog(ERROR, "failed to replace BRIN tuple");
181 26814 : MarkBufferDirty(oldbuf);
182 :
183 : /* XLOG stuff */
184 26814 : if (RelationNeedsWAL(idxrel))
185 : {
186 : xl_brin_samepage_update xlrec;
187 : XLogRecPtr recptr;
188 26808 : uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE;
189 :
190 26808 : xlrec.offnum = oldoff;
191 :
192 26808 : XLogBeginInsert();
193 26808 : XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);
194 :
195 26808 : XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
196 26808 : XLogRegisterBufData(0, (const char *) newtup, newsz);
197 :
198 26808 : recptr = XLogInsert(RM_BRIN_ID, info);
199 :
200 26808 : PageSetLSN(oldpage, recptr);
201 : }
202 :
203 26814 : END_CRIT_SECTION();
204 :
205 26814 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
206 :
207 26814 : if (BufferIsValid(newbuf))
208 : {
209 : /* As above, initialize and record new page if we got one */
210 0 : if (extended)
211 0 : brin_initialize_empty_new_buffer(idxrel, newbuf);
212 0 : UnlockReleaseBuffer(newbuf);
213 0 : if (extended)
214 0 : FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
215 : }
216 :
217 26814 : return true;
218 : }
219 614 : else if (newbuf == InvalidBuffer)
220 : {
221 : /*
222 : * Not enough space, but caller said that there was. Tell them to
223 : * start over.
224 : */
225 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
226 0 : return false;
227 : }
228 : else
229 : {
230 : /*
231 : * Not enough free space on the oldpage. Put the new tuple on the new
232 : * page, and update the revmap.
233 : */
234 614 : Page newpage = BufferGetPage(newbuf);
235 : Buffer revmapbuf;
236 : ItemPointerData newtid;
237 : OffsetNumber newoff;
238 614 : Size freespace = 0;
239 :
240 614 : revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
241 :
242 614 : START_CRIT_SECTION();
243 :
244 : /*
245 : * We need to initialize the page if it's newly obtained. Note we
246 : * will WAL-log the initialization as part of the update, so we don't
247 : * need to do that here.
248 : */
249 614 : if (extended)
250 22 : brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);
251 :
252 614 : PageIndexTupleDeleteNoCompact(oldpage, oldoff);
253 614 : newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz,
254 : InvalidOffsetNumber, false, false);
255 614 : if (newoff == InvalidOffsetNumber)
256 0 : elog(ERROR, "failed to add BRIN tuple to new page");
257 614 : MarkBufferDirty(oldbuf);
258 614 : MarkBufferDirty(newbuf);
259 :
260 : /* needed to update FSM below */
261 614 : if (extended)
262 22 : freespace = br_page_get_freespace(newpage);
263 :
264 614 : ItemPointerSet(&newtid, newblk, newoff);
265 614 : brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
266 614 : MarkBufferDirty(revmapbuf);
267 :
268 : /* XLOG stuff */
269 614 : if (RelationNeedsWAL(idxrel))
270 : {
271 : xl_brin_update xlrec;
272 : XLogRecPtr recptr;
273 : uint8 info;
274 :
275 614 : info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
276 :
277 614 : xlrec.insert.offnum = newoff;
278 614 : xlrec.insert.heapBlk = heapBlk;
279 614 : xlrec.insert.pagesPerRange = pagesPerRange;
280 614 : xlrec.oldOffnum = oldoff;
281 :
282 614 : XLogBeginInsert();
283 :
284 : /* new page */
285 614 : XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);
286 :
287 614 : XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
288 614 : XLogRegisterBufData(0, (const char *) newtup, newsz);
289 :
290 : /* revmap page */
291 614 : XLogRegisterBuffer(1, revmapbuf, 0);
292 :
293 : /* old page */
294 614 : XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);
295 :
296 614 : recptr = XLogInsert(RM_BRIN_ID, info);
297 :
298 614 : PageSetLSN(oldpage, recptr);
299 614 : PageSetLSN(newpage, recptr);
300 614 : PageSetLSN(BufferGetPage(revmapbuf), recptr);
301 : }
302 :
303 614 : END_CRIT_SECTION();
304 :
305 614 : LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
306 614 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
307 614 : UnlockReleaseBuffer(newbuf);
308 :
309 614 : if (extended)
310 : {
311 22 : RecordPageWithFreeSpace(idxrel, newblk, freespace);
312 22 : FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
313 : }
314 :
315 614 : return true;
316 : }
317 : }
318 :
319 : /*
320 : * Return whether brin_doupdate can do a samepage update.
321 : */
322 : bool
323 53692 : brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
324 : {
325 : return
326 61576 : ((newsz <= origsz) ||
327 7884 : PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
328 : }
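/*
 * A minimal sketch of how this check combines with brin_doupdate(); this is
 * illustrative only (variable names are hypothetical and the real callers
 * live outside this file), but it follows the contract documented above:
 *
 *     bool samepage = brin_can_do_samepage_update(oldbuf, origsz, newsz);
 *
 *     if (!brin_doupdate(idxrel, pagesPerRange, revmap, heapBlk,
 *                        oldbuf, oldoff, origtup, origsz,
 *                        newtup, newsz, samepage))
 *     {
 *         // a concurrent change was detected: re-read the summary tuple
 *         // for this range and retry the whole update
 *     }
 */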
329 :
330 : /*
331 : * Insert an index tuple into the index relation. The revmap is updated to
332 : * mark the range containing the given page as pointing to the inserted entry.
333 : * A WAL record is written.
334 : *
335 : * The buffer, if valid, is first checked for free space to insert the new
336 : * entry; if there isn't enough, a new buffer is obtained and pinned. No
337 : * buffer lock must be held on entry, no buffer lock is held on exit.
338 : *
339 : * Return value is the offset number where the tuple was inserted.
340 : */
341 : OffsetNumber
342 5632 : brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
343 : BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
344 : BrinTuple *tup, Size itemsz)
345 : {
346 : Page page;
347 : BlockNumber blk;
348 : OffsetNumber off;
349 5632 : Size freespace = 0;
350 : Buffer revmapbuf;
351 : ItemPointerData tid;
352 : bool extended;
353 :
354 : Assert(itemsz == MAXALIGN(itemsz));
355 :
356 : /* If the item is oversized, don't even bother. */
357 5632 : if (itemsz > BrinMaxItemSize)
358 : {
359 0 : ereport(ERROR,
360 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
361 : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
362 : itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
363 : return InvalidOffsetNumber; /* keep compiler quiet */
364 : }
365 :
366 : /* Make sure the revmap is long enough to contain the entry we need */
367 5632 : brinRevmapExtend(revmap, heapBlk);
368 :
369 : /*
370 : * Acquire lock on buffer supplied by caller, if any. If it doesn't have
371 : * enough space, unpin it to obtain a new one below.
372 : */
373 5632 : if (BufferIsValid(*buffer))
374 : {
375 : /*
376 : * It's possible that another backend (or ourselves!) extended the
377 : * revmap over the page we held a pin on, so we cannot assume that
378 : * it's still a regular page.
379 : */
380 2348 : LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
381 2348 : if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
382 : {
383 120 : UnlockReleaseBuffer(*buffer);
384 120 : *buffer = InvalidBuffer;
385 : }
386 : }
387 :
388 : /*
389 : * If we still don't have a usable buffer, have brin_getinsertbuffer
390 : * obtain one for us.
391 : */
392 5632 : if (!BufferIsValid(*buffer))
393 : {
394 : do
395 3404 : *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
396 3404 : while (!BufferIsValid(*buffer));
397 : }
398 : else
399 2228 : extended = false;
400 :
401 : /* Now obtain lock on revmap buffer */
402 5632 : revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
403 :
404 5632 : page = BufferGetPage(*buffer);
405 5632 : blk = BufferGetBlockNumber(*buffer);
406 :
407 : /* Execute the actual insertion */
408 5632 : START_CRIT_SECTION();
409 5632 : if (extended)
410 466 : brin_page_init(page, BRIN_PAGETYPE_REGULAR);
411 5632 : off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
412 : false, false);
413 5632 : if (off == InvalidOffsetNumber)
414 0 : elog(ERROR, "failed to add BRIN tuple to new page");
415 5632 : MarkBufferDirty(*buffer);
416 :
417 : /* needed to update FSM below */
418 5632 : if (extended)
419 466 : freespace = br_page_get_freespace(page);
420 :
421 5632 : ItemPointerSet(&tid, blk, off);
422 5632 : brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
423 5632 : MarkBufferDirty(revmapbuf);
424 :
425 : /* XLOG stuff */
426 5632 : if (RelationNeedsWAL(idxrel))
427 : {
428 : xl_brin_insert xlrec;
429 : XLogRecPtr recptr;
430 : uint8 info;
431 :
432 4732 : info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
433 4732 : xlrec.heapBlk = heapBlk;
434 4732 : xlrec.pagesPerRange = pagesPerRange;
435 4732 : xlrec.offnum = off;
436 :
437 4732 : XLogBeginInsert();
438 4732 : XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);
439 :
440 4732 : XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
441 4732 : XLogRegisterBufData(0, (char *) tup, itemsz);
442 :
443 4732 : XLogRegisterBuffer(1, revmapbuf, 0);
444 :
445 4732 : recptr = XLogInsert(RM_BRIN_ID, info);
446 :
447 4732 : PageSetLSN(page, recptr);
448 4732 : PageSetLSN(BufferGetPage(revmapbuf), recptr);
449 : }
450 :
451 5632 : END_CRIT_SECTION();
452 :
453 : /* Tuple is firmly on buffer; we can release our locks */
454 5632 : LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
455 5632 : LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
456 :
457 : BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
458 : blk, off, heapBlk));
459 :
460 5632 : if (extended)
461 : {
462 466 : RecordPageWithFreeSpace(idxrel, blk, freespace);
463 466 : FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
464 : }
465 :
466 5632 : return off;
467 : }
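/*
 * Illustrative usage sketch (not taken from an actual caller): because
 * *buffer is both an input and an output, a caller inserting many tuples
 * keeps the variable across calls so that brin_doinsert() can keep filling
 * the same page while it has room, and releases the pin only at the end:
 *
 *     Buffer  buf = InvalidBuffer;
 *
 *     // for each newly built summary tuple "tup" of size "size" ...
 *     brin_doinsert(idxrel, pagesPerRange, revmap, &buf, heapBlk, tup, size);
 *
 *     if (BufferIsValid(buf))
 *         ReleaseBuffer(buf);
 */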
468 :
469 : /*
470 : * Initialize a page with the given type.
471 : *
472 : * Caller is responsible for marking it dirty, as appropriate.
473 : */
474 : void
475 1458 : brin_page_init(Page page, uint16 type)
476 : {
477 1458 : PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
478 :
479 1458 : BrinPageType(page) = type;
480 1458 : }
481 :
482 : /*
483 : * Initialize a new BRIN index's metapage.
484 : */
485 : void
486 428 : brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
487 : {
488 : BrinMetaPageData *metadata;
489 :
490 428 : brin_page_init(page, BRIN_PAGETYPE_META);
491 :
492 428 : metadata = (BrinMetaPageData *) PageGetContents(page);
493 :
494 428 : metadata->brinMagic = BRIN_META_MAGIC;
495 428 : metadata->brinVersion = version;
496 428 : metadata->pagesPerRange = pagesPerRange;
497 :
498 : /*
499 : * Note we cheat here a little. 0 is not a valid revmap block number
500 : * (because it's the metapage buffer), but doing this enables the first
501 : * revmap page to be created when the index is.
502 : */
503 428 : metadata->lastRevmapPage = 0;
504 :
505 : /*
506 : * Set pd_lower just past the end of the metadata. This is essential,
507 : * because without doing so, metadata will be lost if xlog.c compresses
508 : * the page.
509 : */
510 428 : ((PageHeader) page)->pd_lower =
511 428 : ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page;
512 428 : }
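/*
 * Why this matters, in concrete terms (assuming the standard page layout:
 * a 24-byte page header and a 16-byte BrinMetaPageData, so pd_lower lands
 * at byte 40): full-page images may omit the "hole" between pd_lower and
 * pd_upper, so any data sitting in that gap would silently disappear when
 * such an image is restored.  Raising pd_lower past the metadata keeps the
 * metadata out of the hole.
 */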
513 :
514 : /*
515 : * Initiate page evacuation protocol.
516 : *
517 : * The page must be locked in exclusive mode by the caller.
518 : *
519 : * If the page is not yet initialized or empty, return false without doing
520 : * anything; it can be used for revmap without any further changes. If it
521 : * contains tuples, mark it for evacuation and return true.
522 : */
523 : bool
524 350 : brin_start_evacuating_page(Relation idxRel, Buffer buf)
525 : {
526 : OffsetNumber off;
527 : OffsetNumber maxoff;
528 : Page page;
529 :
530 350 : page = BufferGetPage(buf);
531 :
532 350 : if (PageIsNew(page))
533 346 : return false;
534 :
535 4 : maxoff = PageGetMaxOffsetNumber(page);
536 584 : for (off = FirstOffsetNumber; off <= maxoff; off++)
537 : {
538 : ItemId lp;
539 :
540 582 : lp = PageGetItemId(page, off);
541 582 : if (ItemIdIsUsed(lp))
542 : {
543 : /*
544 : * Prevent other backends from adding more stuff to this page:
545 : * BRIN_EVACUATE_PAGE informs br_page_get_freespace that this page
546 : * can no longer be used to add new tuples. Note that this flag
547 : * is not WAL-logged, except accidentally.
548 : */
549 2 : BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
550 2 : MarkBufferDirtyHint(buf, true);
551 :
552 2 : return true;
553 : }
554 : }
555 2 : return false;
556 : }
557 :
558 : /*
559 : * Move all tuples out of a page.
560 : *
561 : * The caller must hold lock on the page. The lock and pin are released.
562 : */
563 : void
564 2 : brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
565 : BrinRevmap *revmap, Buffer buf)
566 : {
567 : OffsetNumber off;
568 : OffsetNumber maxoff;
569 : Page page;
570 2 : BrinTuple *btup = NULL;
571 2 : Size btupsz = 0;
572 :
573 2 : page = BufferGetPage(buf);
574 :
575 : Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);
576 :
577 2 : maxoff = PageGetMaxOffsetNumber(page);
578 584 : for (off = FirstOffsetNumber; off <= maxoff; off++)
579 : {
580 : BrinTuple *tup;
581 : Size sz;
582 : ItemId lp;
583 :
584 582 : CHECK_FOR_INTERRUPTS();
585 :
586 582 : lp = PageGetItemId(page, off);
587 582 : if (ItemIdIsUsed(lp))
588 : {
589 582 : sz = ItemIdGetLength(lp);
590 582 : tup = (BrinTuple *) PageGetItem(page, lp);
591 582 : tup = brin_copy_tuple(tup, sz, btup, &btupsz);
592 :
593 582 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
594 :
595 582 : if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
596 : buf, off, tup, sz, tup, sz, false))
597 0 : off--; /* retry */
598 :
599 582 : LockBuffer(buf, BUFFER_LOCK_SHARE);
600 :
601 : /* It's possible that someone extended the revmap over this page */
602 582 : if (!BRIN_IS_REGULAR_PAGE(page))
603 0 : break;
604 : }
605 : }
606 :
607 2 : UnlockReleaseBuffer(buf);
608 2 : }
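/*
 * Putting the two evacuation routines together, a caller that wants to
 * repurpose a regular page (notably, to turn it into a revmap page) is
 * expected to proceed roughly as follows.  This is a sketch inferred from
 * the locking contracts stated above, not code copied from the real caller,
 * which lives in the revmap module:
 *
 *     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *     if (brin_start_evacuating_page(idxRel, buf))
 *     {
 *         // live tuples exist: downgrade to a share lock and move them out
 *         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *         LockBuffer(buf, BUFFER_LOCK_SHARE);
 *         brin_evacuate_page(idxRel, pagesPerRange, revmap, buf);
 *         // brin_evacuate_page released the lock and pin; start over
 *     }
 *     else
 *     {
 *         // uninitialized or empty: the page can be reused right away
 *     }
 */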
609 :
610 : /*
611 : * Given a BRIN index page, initialize it if necessary, and record its
612 : * current free space in the FSM.
613 : *
614 : * The main use for this is when, during vacuuming, an uninitialized page is
615 : * found, which could be the result of relation extension followed by a crash
616 : * before the page can be used.
617 : *
618 : * Here, we don't bother to update upper FSM pages, instead expecting that our
619 : * caller (brin_vacuum_scan) will fix them at the end of the scan. Elsewhere
620 : * in this file, it's generally a good idea to propagate additions of free
621 : * space into the upper FSM pages immediately.
622 : */
623 : void
624 442 : brin_page_cleanup(Relation idxrel, Buffer buf)
625 : {
626 442 : Page page = BufferGetPage(buf);
627 :
628 : /*
629 : * If a page was left uninitialized, initialize it now; also record it in
630 : * FSM.
631 : *
632 : * Somebody else might be extending the relation concurrently. To avoid
633 : * re-initializing the page before they can grab the buffer lock, we
634 : * acquire the extension lock momentarily. Since they hold the extension
635 : * lock from before getting the page and after it's been initialized, we're
636 : * sure to see their initialization.
637 : */
638 442 : if (PageIsNew(page))
639 : {
640 0 : LockRelationForExtension(idxrel, ShareLock);
641 0 : UnlockRelationForExtension(idxrel, ShareLock);
642 :
643 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
644 0 : if (PageIsNew(page))
645 : {
646 0 : brin_initialize_empty_new_buffer(idxrel, buf);
647 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
648 0 : return;
649 : }
650 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
651 : }
652 :
653 : /* Nothing to be done for non-regular index pages */
654 442 : if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
655 340 : BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
656 204 : return;
657 :
658 : /* Measure free space and record it */
659 238 : RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
660 : br_page_get_freespace(page));
661 : }
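/*
 * The expected calling pattern during vacuum, sketched here for clarity
 * (based on the description of brin_vacuum_scan above rather than copied
 * from it): every block is read and passed to brin_page_cleanup(), and the
 * upper FSM levels are brought up to date once, at the end of the scan:
 *
 *     for (blkno = 0; blkno < RelationGetNumberOfBlocks(idxrel); blkno++)
 *     {
 *         Buffer  buf = ReadBuffer(idxrel, blkno);
 *
 *         brin_page_cleanup(idxrel, buf);
 *         ReleaseBuffer(buf);
 *     }
 *     FreeSpaceMapVacuum(idxrel);     // propagate the new leaf-level values
 */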
662 :
663 : /*
664 : * Return a pinned and exclusively locked buffer which can be used to insert an
665 : * index item of size itemsz (caller must ensure not to request sizes
666 : * impossible to fulfill). If oldbuf is a valid buffer, it is also locked (in
667 : * an order determined to avoid deadlocks).
668 : *
669 : * If we find that the old page is no longer a regular index page (because
670 : * of a revmap extension), the old buffer is unlocked and we return
671 : * InvalidBuffer.
672 : *
673 : * If there's no existing page with enough free space to accommodate the new
674 : * item, the relation is extended. If this happens, *extended is set to true,
675 : * and it is the caller's responsibility to initialize the page (and WAL-log
676 : * that fact) prior to use. The caller should also update the FSM with the
677 : * page's remaining free space after the insertion.
678 : *
679 : * Note that the caller is not expected to update FSM unless *extended is set to
680 : * true. This policy means that we'll update FSM when a page is created, and
681 : * when it's found to have too little space for a desired tuple insertion,
682 : * but not every single time we add a tuple to the page.
683 : *
684 : * Note that in some corner cases it is possible for this routine to extend
685 : * the relation and then not return the new page. It is this routine's
686 : * responsibility to WAL-log the page initialization and to record the page in
687 : * FSM if that happens, since the caller certainly can't do it.
688 : */
689 : static Buffer
690 4018 : brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
691 : bool *extended)
692 : {
693 : BlockNumber oldblk;
694 : BlockNumber newblk;
695 : Page page;
696 : Size freespace;
697 :
698 : /* callers must have checked */
699 : Assert(itemsz <= BrinMaxItemSize);
700 :
701 4018 : if (BufferIsValid(oldbuf))
702 614 : oldblk = BufferGetBlockNumber(oldbuf);
703 : else
704 3404 : oldblk = InvalidBlockNumber;
705 :
706 : /* Choose initial target page, re-using existing target if known */
707 4018 : newblk = RelationGetTargetBlock(irel);
708 4018 : if (newblk == InvalidBlockNumber)
709 412 : newblk = GetPageWithFreeSpace(irel, itemsz);
710 :
711 : /*
712 : * Loop until we find a page with sufficient free space. By the time we
713 : * return to caller out of this loop, both buffers are valid and locked;
714 : * if we have to restart here, neither page is locked and newblk isn't
715 : * pinned (if it's even valid).
716 : */
717 : for (;;)
718 154 : {
719 : Buffer buf;
720 4172 : bool extensionLockHeld = false;
721 :
722 4172 : CHECK_FOR_INTERRUPTS();
723 :
724 4172 : *extended = false;
725 :
726 4172 : if (newblk == InvalidBlockNumber)
727 : {
728 : /*
729 : * There's not enough free space in any existing index page,
730 : * according to the FSM: extend the relation to obtain a shiny new
731 : * page.
732 : *
733 : * XXX: It's likely possible to use RBM_ZERO_AND_LOCK here,
734 : * which'd avoid the need to hold the extension lock during buffer
735 : * reclaim.
736 : */
737 488 : if (!RELATION_IS_LOCAL(irel))
738 : {
739 46 : LockRelationForExtension(irel, ExclusiveLock);
740 46 : extensionLockHeld = true;
741 : }
742 488 : buf = ReadBuffer(irel, P_NEW);
743 488 : newblk = BufferGetBlockNumber(buf);
744 488 : *extended = true;
745 :
746 : BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
747 : BufferGetBlockNumber(buf)));
748 : }
749 3684 : else if (newblk == oldblk)
750 : {
751 : /*
752 : * There's an odd corner-case here where the FSM is out-of-date,
753 : * and gave us the old page.
754 : */
755 26 : buf = oldbuf;
756 : }
757 : else
758 : {
759 3658 : buf = ReadBuffer(irel, newblk);
760 : }
761 :
762 : /*
763 : * We lock the old buffer first, if it's earlier than the new one; but
764 : * then we need to check that it hasn't been turned into a revmap page
765 : * concurrently. If we detect that that happened, give up and tell
766 : * caller to start over.
767 : */
768 4172 : if (BufferIsValid(oldbuf) && oldblk < newblk)
769 : {
770 622 : LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
771 622 : if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
772 : {
773 0 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
774 :
775 : /*
776 : * It is possible that the new page was obtained from
777 : * extending the relation. In that case, we must be sure to
778 : * record it in the FSM before leaving, because otherwise the
779 : * space would be lost forever. However, we cannot let an
780 : * uninitialized page get in the FSM, so we need to initialize
781 : * it first.
782 : */
783 0 : if (*extended)
784 0 : brin_initialize_empty_new_buffer(irel, buf);
785 :
786 0 : if (extensionLockHeld)
787 0 : UnlockRelationForExtension(irel, ExclusiveLock);
788 :
789 0 : ReleaseBuffer(buf);
790 :
791 0 : if (*extended)
792 : {
793 0 : FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
794 : /* shouldn't matter, but don't confuse caller */
795 0 : *extended = false;
796 : }
797 :
798 0 : return InvalidBuffer;
799 : }
800 : }
801 :
802 4172 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
803 :
804 4172 : if (extensionLockHeld)
805 46 : UnlockRelationForExtension(irel, ExclusiveLock);
806 :
807 4172 : page = BufferGetPage(buf);
808 :
809 : /*
810 : * We have a new buffer to insert into. Check that the new page has
811 : * enough free space, and return it if it does; otherwise start over.
812 : * (br_page_get_freespace also checks that the FSM didn't hand us a
813 : * page that has since been repurposed for the revmap.)
814 : */
815 8344 : freespace = *extended ?
816 4172 : BrinMaxItemSize : br_page_get_freespace(page);
817 4172 : if (freespace >= itemsz)
818 : {
819 4018 : RelationSetTargetBlock(irel, newblk);
820 :
821 : /*
822 : * Lock the old buffer if not locked already. Note that in this
823 : * case we know for sure it's a regular page: it's later than the
824 : * new page we just got, which is not a revmap page, and revmap
825 : * pages are always consecutive.
826 : */
827 4018 : if (BufferIsValid(oldbuf) && oldblk > newblk)
828 : {
829 0 : LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
830 : Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
831 : }
832 :
833 4018 : return buf;
834 : }
835 :
836 : /* This page is no good. */
837 :
838 : /*
839 : * If an entirely new page does not contain enough free space for the
840 : * new item, then surely that item is oversized. Complain loudly; but
841 : * first make sure we initialize the page and record it as free, for
842 : * next time.
843 : */
844 154 : if (*extended)
845 : {
846 0 : brin_initialize_empty_new_buffer(irel, buf);
847 : /* since this should not happen, skip FreeSpaceMapVacuum */
848 :
849 0 : ereport(ERROR,
850 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
851 : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
852 : itemsz, freespace, RelationGetRelationName(irel))));
853 : return InvalidBuffer; /* keep compiler quiet */
854 : }
855 :
856 154 : if (newblk != oldblk)
857 128 : UnlockReleaseBuffer(buf);
858 154 : if (BufferIsValid(oldbuf) && oldblk <= newblk)
859 34 : LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
860 :
861 : /*
862 : * Update the FSM with the new, presumably smaller, freespace value
863 : * for this page, then search for a new target page.
864 : */
865 154 : newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
866 : }
867 : }
868 :
869 : /*
870 : * Initialize a page as an empty regular BRIN page, WAL-log this, and record
871 : * the page in FSM.
872 : *
873 : * There are several corner situations in which we extend the relation to
874 : * obtain a new page and later find that we cannot use it immediately. When
875 : * that happens, we don't want to let the page go unrecorded in FSM, because
876 : * there is no mechanism to get the space back and the index would bloat.
877 : * Also, because we would not WAL-log the action that would initialize the
878 : * page, the page would go uninitialized in a standby (or after recovery).
879 : *
880 : * While we record the page in FSM here, caller is responsible for doing FSM
881 : * upper-page update if that seems appropriate.
882 : */
883 : static void
884 0 : brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
885 : {
886 : Page page;
887 :
888 : BRIN_elog((DEBUG2,
889 : "brin_initialize_empty_new_buffer: initializing blank page %u",
890 : BufferGetBlockNumber(buffer)));
891 :
892 0 : START_CRIT_SECTION();
893 0 : page = BufferGetPage(buffer);
894 0 : brin_page_init(page, BRIN_PAGETYPE_REGULAR);
895 0 : MarkBufferDirty(buffer);
896 0 : log_newpage_buffer(buffer, true);
897 0 : END_CRIT_SECTION();
898 :
899 : /*
900 : * We update the FSM for this page, but this is not WAL-logged. This is
901 : * acceptable because VACUUM will scan the index and update the FSM with
902 : * pages whose FSM records were forgotten in a crash.
903 : */
904 0 : RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
905 : br_page_get_freespace(page));
906 0 : }
907 :
908 :
909 : /*
910 : * Return the amount of free space on a regular BRIN index page.
911 : *
912 : * If the page is not a regular page, or has been marked with the
913 : * BRIN_EVACUATE_PAGE flag, returns 0.
914 : */
915 : static Size
916 6758 : br_page_get_freespace(Page page)
917 : {
918 6758 : if (!BRIN_IS_REGULAR_PAGE(page) ||
919 6758 : (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
920 0 : return 0;
921 : else
922 6758 : return PageGetFreeSpace(page);
923 : }
|