/*-------------------------------------------------------------------------
 *
 * nbtxlog.c
 *    WAL replay logic for btrees.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtxlog.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/bufmask.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xlogutils.h"
#include "storage/standby.h"
#include "utils/memutils.h"

static MemoryContext opCtx;     /* working memory for operations */

/*
 * _bt_restore_page -- re-enter all the index tuples on a page
 *
 * The page is freshly init'd, and *from (length len) is a copy of what
 * had been its upper part (pd_upper to pd_special).  We assume that the
 * tuples had been added to the page in item-number order, and therefore
 * the one with highest item number appears first (lowest on the page).
 */
static void
_bt_restore_page(Page page, char *from, int len)
{
    IndexTupleData itupdata;
    Size itemsz;
    char *end = from + len;
    void *items[MaxIndexTuplesPerPage];
    uint16 itemsizes[MaxIndexTuplesPerPage];
    int i;
    int nitems;

    /*
     * To get the items back in the original order, we add them to the page
     * in reverse order.  To figure out where one tuple ends and another
     * begins, we have to scan them in forward order first.
     */
    i = 0;
    while (from < end)
    {
        /*
         * As we step through the items, 'from' won't always be properly
         * aligned, so we need to use memcpy().  Further, we use void * here
         * for our items array for the same reason; wouldn't want the compiler
         * or anyone thinking that an item is aligned when it isn't.
         */
        memcpy(&itupdata, from, sizeof(IndexTupleData));
        itemsz = IndexTupleSize(&itupdata);
        itemsz = MAXALIGN(itemsz);

        items[i] = from;
        itemsizes[i] = itemsz;
        i++;

        from += itemsz;
    }
    nitems = i;

    for (i = nitems - 1; i >= 0; i--)
    {
        if (PageAddItem(page, items[i], itemsizes[i], nitems - i, false, false) == InvalidOffsetNumber)
            elog(PANIC, "_bt_restore_page: cannot add item to page");
    }
}

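/*
 * _bt_restore_meta -- restore a metapage image from WAL record data
 *
 * The full xl_btree_metadata payload is always logged, so the metapage can
 * simply be reinitialized from scratch.
 */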
static void
_bt_restore_meta(XLogReaderState *record, uint8 block_id)
{
    XLogRecPtr lsn = record->EndRecPtr;
    Buffer metabuf;
    Page metapg;
    BTMetaPageData *md;
    BTPageOpaque pageop;
    xl_btree_metadata *xlrec;
    char *ptr;
    Size len;

    metabuf = XLogInitBufferForRedo(record, block_id);
    ptr = XLogRecGetBlockData(record, block_id, &len);

    Assert(len == sizeof(xl_btree_metadata));
    Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
    xlrec = (xl_btree_metadata *) ptr;
    metapg = BufferGetPage(metabuf);

    _bt_pageinit(metapg, BufferGetPageSize(metabuf));

    md = BTPageGetMeta(metapg);
    md->btm_magic = BTREE_MAGIC;
    md->btm_version = xlrec->version;
    md->btm_root = xlrec->root;
    md->btm_level = xlrec->level;
    md->btm_fastroot = xlrec->fastroot;
    md->btm_fastlevel = xlrec->fastlevel;
    /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
    Assert(md->btm_version >= BTREE_NOVAC_VERSION);
    md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages;
    md->btm_last_cleanup_num_heap_tuples = -1.0;
    md->btm_allequalimage = xlrec->allequalimage;

    pageop = BTPageGetOpaque(metapg);
    pageop->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is essential,
     * because without doing so, metadata will be lost if xlog.c compresses
     * the page.
     */
    ((PageHeader) metapg)->pd_lower =
        ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;

    PageSetLSN(metapg, lsn);
    MarkBufferDirty(metabuf);
    UnlockReleaseBuffer(metabuf);
}

/*
 * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
 *
 * This is a common subroutine of the redo functions of all the WAL record
 * types that can insert a downlink: insert, split, and newroot.
 */
static void
_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
{
    XLogRecPtr lsn = record->EndRecPtr;
    Buffer buf;

    if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
    {
        Page page = BufferGetPage(buf);
        BTPageOpaque pageop = BTPageGetOpaque(page);

        Assert(P_INCOMPLETE_SPLIT(pageop));
        pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;

        PageSetLSN(page, lsn);
        MarkBufferDirty(buf);
    }
    if (BufferIsValid(buf))
        UnlockReleaseBuffer(buf);
}

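/*
 * btree_xlog_insert -- replay insertion of a single index tuple
 *
 * "posting" is true for the XLOG_BTREE_INSERT_POST case, where a posting
 * list split must be replayed before the new item can be added.
 */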
static void
btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
                  XLogReaderState *record)
{
    XLogRecPtr lsn = record->EndRecPtr;
    xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
    Buffer buffer;
    Page page;

    /*
     * Insertion to an internal page finishes an incomplete split at the child
     * level.  Clear the incomplete-split flag in the child.  Note: during
     * normal operation, the child and parent pages are locked at the same
     * time (the locks are coupled), so that clearing the flag and inserting
     * the downlink appear atomic to other backends.  We don't bother with
     * that during replay, because readers don't care about the
     * incomplete-split flag and there cannot be updates happening.
     */
    if (!isleaf)
        _bt_clear_incomplete_split(record, 1);
    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        Size datalen;
        char *datapos = XLogRecGetBlockData(record, 0, &datalen);

        page = BufferGetPage(buffer);

        if (!posting)
        {
            /* Simple retail insertion */
            if (PageAddItem(page, datapos, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber)
                elog(PANIC, "failed to add new item");
        }
        else
        {
            ItemId itemid;
            IndexTuple oposting,
                       newitem,
                       nposting;
            uint16 postingoff;

            /*
             * A posting list split occurred during leaf page insertion.  WAL
             * record data will start with an offset number representing the
             * point in an existing posting list that a split occurs at.
             *
             * Use _bt_swap_posting() to repeat posting list split steps from
             * the primary.  Note that newitem from the WAL record is
             * 'orignewitem', not the final version of newitem that is
             * actually inserted on the page.
             */
            postingoff = *((uint16 *) datapos);
            datapos += sizeof(uint16);
            datalen -= sizeof(uint16);

            itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum));
            oposting = (IndexTuple) PageGetItem(page, itemid);

            /* Use mutable, aligned newitem copy in _bt_swap_posting() */
            Assert(isleaf && postingoff > 0);
            newitem = CopyIndexTuple((IndexTuple) datapos);
            nposting = _bt_swap_posting(newitem, oposting, postingoff);

            /* Replace existing posting list with post-split version */
            memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));

            /* Insert "final" new item (not orignewitem from WAL stream) */
            Assert(IndexTupleSize(newitem) == datalen);
            if (PageAddItem(page, newitem, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber)
                elog(PANIC, "failed to add posting split new item");
        }

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);

    /*
     * Note: in normal operation, we'd update the metapage while still holding
     * lock on the page we inserted into.  But during replay it's not
     * necessary to hold that lock, since no other index updates can be
     * happening concurrently, and readers will cope fine with following an
     * obsolete link from the metapage.
     */
    if (ismeta)
        _bt_restore_meta(record, 2);
}

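/*
 * btree_xlog_split -- replay a page split
 *
 * Block 0 is the original (left) page, block 1 is the new right sibling,
 * block 2 (if present) is the page to the right of the new sibling, whose
 * left-link must be fixed, and block 3 (internal-page splits only) is the
 * child whose incomplete-split flag gets cleared.
 */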
static void
btree_xlog_split(bool newitemonleft, XLogReaderState *record)
{
    XLogRecPtr lsn = record->EndRecPtr;
    xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
    bool isleaf = (xlrec->level == 0);
    Buffer buf;
    Buffer rbuf;
    Page rpage;
    BTPageOpaque ropaque;
    char *datapos;
    Size datalen;
    BlockNumber origpagenumber;
    BlockNumber rightpagenumber;
    BlockNumber spagenumber;

    XLogRecGetBlockTag(record, 0, NULL, NULL, &origpagenumber);
    XLogRecGetBlockTag(record, 1, NULL, NULL, &rightpagenumber);
    if (!XLogRecGetBlockTagExtended(record, 2, NULL, NULL, &spagenumber, NULL))
        spagenumber = P_NONE;

    /*
     * Clear the incomplete split flag on the appropriate child page one level
     * down when origpage/buf is an internal page (there must have been
     * cascading page splits during original execution in the event of an
     * internal page split).  This is like the corresponding btree_xlog_insert
     * call for internal pages.  We're not clearing the incomplete split flag
     * for the current page split here (you can think of this as part of the
     * insert of newitem that the page split action needs to perform in
     * passing).
     *
     * Like in btree_xlog_insert, this can be done before locking other pages.
     * We never need to couple cross-level locks in REDO routines.
     */
    if (!isleaf)
        _bt_clear_incomplete_split(record, 3);

    /* Reconstruct right (new) sibling page from scratch */
    rbuf = XLogInitBufferForRedo(record, 1);
    datapos = XLogRecGetBlockData(record, 1, &datalen);
    rpage = BufferGetPage(rbuf);

    _bt_pageinit(rpage, BufferGetPageSize(rbuf));
    ropaque = BTPageGetOpaque(rpage);

    ropaque->btpo_prev = origpagenumber;
    ropaque->btpo_next = spagenumber;
    ropaque->btpo_level = xlrec->level;
    ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
    ropaque->btpo_cycleid = 0;

    _bt_restore_page(rpage, datapos, datalen);

    PageSetLSN(rpage, lsn);
    MarkBufferDirty(rbuf);

    /* Now reconstruct original page (left half of split) */
    if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
    {
        /*
         * To retain the same physical order of the tuples that they had, we
         * initialize a temporary empty page for the left page and add all the
         * items to that in item number order.  This mirrors how _bt_split()
         * works.  Retaining the same physical order makes WAL consistency
         * checking possible.  See also _bt_restore_page(), which does the
         * same for the right page.
         */
        Page origpage = BufferGetPage(buf);
        BTPageOpaque oopaque = BTPageGetOpaque(origpage);
        OffsetNumber off;
        IndexTuple newitem = NULL,
                   left_hikey = NULL,
                   nposting = NULL;
        Size newitemsz = 0,
             left_hikeysz = 0;
        Page leftpage;
        OffsetNumber leftoff,
                     replacepostingoff = InvalidOffsetNumber;

        datapos = XLogRecGetBlockData(record, 0, &datalen);

        if (newitemonleft || xlrec->postingoff != 0)
        {
            newitem = (IndexTuple) datapos;
            newitemsz = MAXALIGN(IndexTupleSize(newitem));
            datapos += newitemsz;
            datalen -= newitemsz;

            if (xlrec->postingoff != 0)
            {
                ItemId itemid;
                IndexTuple oposting;

                /* Posting list must be at offset number before new item's */
                replacepostingoff = OffsetNumberPrev(xlrec->newitemoff);

                /* Use mutable, aligned newitem copy in _bt_swap_posting() */
                newitem = CopyIndexTuple(newitem);
                itemid = PageGetItemId(origpage, replacepostingoff);
                oposting = (IndexTuple) PageGetItem(origpage, itemid);
                nposting = _bt_swap_posting(newitem, oposting,
                                            xlrec->postingoff);
            }
        }

        /*
         * Extract left hikey and its size.  We assume that 16-bit alignment
         * is enough to apply IndexTupleSize (since it's fetching from a
         * uint16 field).
         */
        left_hikey = (IndexTuple) datapos;
        left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
        datapos += left_hikeysz;
        datalen -= left_hikeysz;

        Assert(datalen == 0);

        leftpage = PageGetTempPageCopySpecial(origpage);

        /* Add high key tuple from WAL record to temp page */
        leftoff = P_HIKEY;
        if (PageAddItem(leftpage, left_hikey, left_hikeysz, P_HIKEY, false, false) == InvalidOffsetNumber)
            elog(ERROR, "failed to add high key to left page after split");
        leftoff = OffsetNumberNext(leftoff);

        for (off = P_FIRSTDATAKEY(oopaque); off < xlrec->firstrightoff; off++)
        {
            ItemId itemid;
            Size itemsz;
            IndexTuple item;

            /* Add replacement posting list when required */
            if (off == replacepostingoff)
            {
                Assert(newitemonleft ||
                       xlrec->firstrightoff == xlrec->newitemoff);
                if (PageAddItem(leftpage, nposting, MAXALIGN(IndexTupleSize(nposting)), leftoff, false, false) == InvalidOffsetNumber)
                    elog(ERROR, "failed to add new posting list item to left page after split");
                leftoff = OffsetNumberNext(leftoff);
                continue;       /* don't insert oposting */
            }

            /* add the new item if it was inserted on left page */
            else if (newitemonleft && off == xlrec->newitemoff)
            {
                if (PageAddItem(leftpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber)
                    elog(ERROR, "failed to add new item to left page after split");
                leftoff = OffsetNumberNext(leftoff);
            }

            itemid = PageGetItemId(origpage, off);
            itemsz = ItemIdGetLength(itemid);
            item = (IndexTuple) PageGetItem(origpage, itemid);
            if (PageAddItem(leftpage, item, itemsz, leftoff, false, false) == InvalidOffsetNumber)
                elog(ERROR, "failed to add old item to left page after split");
            leftoff = OffsetNumberNext(leftoff);
        }

        /* cope with possibility that newitem goes at the end */
        if (newitemonleft && off == xlrec->newitemoff)
        {
            if (PageAddItem(leftpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber)
                elog(ERROR, "failed to add new item to left page after split");
            leftoff = OffsetNumberNext(leftoff);
        }

        PageRestoreTempPage(leftpage, origpage);

        /* Fix opaque fields */
        oopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
        if (isleaf)
            oopaque->btpo_flags |= BTP_LEAF;
        oopaque->btpo_next = rightpagenumber;
        oopaque->btpo_cycleid = 0;

        PageSetLSN(origpage, lsn);
        MarkBufferDirty(buf);
    }

    /* Fix left-link of the page to the right of the new right sibling */
    if (spagenumber != P_NONE)
    {
        Buffer sbuf;

        if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO)
        {
            Page spage = BufferGetPage(sbuf);
            BTPageOpaque spageop = BTPageGetOpaque(spage);

            spageop->btpo_prev = rightpagenumber;

            PageSetLSN(spage, lsn);
            MarkBufferDirty(sbuf);
        }
        if (BufferIsValid(sbuf))
            UnlockReleaseBuffer(sbuf);
    }

    /*
     * Finally, release the remaining buffers.  sbuf, rbuf, and buf must be
     * released together, so that readers cannot observe two inconsistent
     * halves of the split.
     */
    UnlockReleaseBuffer(rbuf);
    if (BufferIsValid(buf))
        UnlockReleaseBuffer(buf);
}

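/*
 * btree_xlog_dedup -- replay a deduplication pass over a leaf page
 *
 * The record's block data is an array of BTDedupInterval entries describing
 * the groups of items that the primary merged into posting list tuples; we
 * repeat the same merging steps here.
 */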
static void
btree_xlog_dedup(XLogReaderState *record)
{
    XLogRecPtr lsn = record->EndRecPtr;
    xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record);
    Buffer buf;

    if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
    {
        char *ptr = XLogRecGetBlockData(record, 0, NULL);
        Page page = BufferGetPage(buf);
        BTPageOpaque opaque = BTPageGetOpaque(page);
        OffsetNumber offnum,
                     minoff,
                     maxoff;
        BTDedupState state;
        BTDedupInterval *intervals;
        Page newpage;

        state = (BTDedupState) palloc(sizeof(BTDedupStateData));
        state->deduplicate = true;  /* unused */
        state->nmaxitems = 0;   /* unused */
        /* Conservatively use larger maxpostingsize than primary */
        state->maxpostingsize = BTMaxItemSize;
        state->base = NULL;
        state->baseoff = InvalidOffsetNumber;
        state->basetupsize = 0;
        state->htids = palloc(state->maxpostingsize);
        state->nhtids = 0;
        state->nitems = 0;
        state->phystupsize = 0;
        state->nintervals = 0;

        minoff = P_FIRSTDATAKEY(opaque);
        maxoff = PageGetMaxOffsetNumber(page);
        newpage = PageGetTempPageCopySpecial(page);

        if (!P_RIGHTMOST(opaque))
        {
            ItemId itemid = PageGetItemId(page, P_HIKEY);
            Size itemsz = ItemIdGetLength(itemid);
            IndexTuple item = (IndexTuple) PageGetItem(page, itemid);

            if (PageAddItem(newpage, item, itemsz, P_HIKEY, false, false) == InvalidOffsetNumber)
                elog(ERROR, "deduplication failed to add highkey");
        }

        intervals = (BTDedupInterval *) ptr;
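
        /*
         * Walk the page's items in order, merging each logged interval back
         * into a single posting list tuple on the temp page; items that fall
         * outside every interval are copied across unchanged.
         */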
        for (offnum = minoff;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId itemid = PageGetItemId(page, offnum);
            IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);

            if (offnum == minoff)
                _bt_dedup_start_pending(state, itup, offnum);
            else if (state->nintervals < xlrec->nintervals &&
                     state->baseoff == intervals[state->nintervals].baseoff &&
                     state->nitems < intervals[state->nintervals].nitems)
            {
                if (!_bt_dedup_save_htid(state, itup))
                    elog(ERROR, "deduplication failed to add heap tid to pending posting list");
            }
            else
            {
                _bt_dedup_finish_pending(newpage, state);
                _bt_dedup_start_pending(state, itup, offnum);
            }
        }

        _bt_dedup_finish_pending(newpage, state);
        Assert(state->nintervals == xlrec->nintervals);
        Assert(memcmp(state->intervals, intervals,
                      state->nintervals * sizeof(BTDedupInterval)) == 0);

        if (P_HAS_GARBAGE(opaque))
        {
            BTPageOpaque nopaque = BTPageGetOpaque(newpage);

            nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
        }

        PageRestoreTempPage(newpage, page);
        PageSetLSN(page, lsn);
        MarkBufferDirty(buf);
    }

    if (BufferIsValid(buf))
        UnlockReleaseBuffer(buf);
}

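/*
 * btree_xlog_updates -- apply posting list updates during VACUUM/delete replay
 *
 * Each xl_btree_update entry names heap TIDs to remove from the posting list
 * tuple at the corresponding updated offset; the tuple is rewritten in place.
 */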
static void
btree_xlog_updates(Page page, OffsetNumber *updatedoffsets,
                   xl_btree_update *updates, int nupdated)
{
    BTVacuumPosting vacposting;
    IndexTuple origtuple;
    ItemId itemid;
    Size itemsz;

    for (int i = 0; i < nupdated; i++)
    {
        itemid = PageGetItemId(page, updatedoffsets[i]);
        origtuple = (IndexTuple) PageGetItem(page, itemid);

        vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                            updates->ndeletedtids * sizeof(uint16));
        vacposting->updatedoffset = updatedoffsets[i];
        vacposting->itup = origtuple;
        vacposting->ndeletedtids = updates->ndeletedtids;
        memcpy(vacposting->deletetids,
               (char *) updates + SizeOfBtreeUpdate,
               updates->ndeletedtids * sizeof(uint16));

        _bt_update_posting(vacposting);

        /* Overwrite updated version of tuple */
        itemsz = MAXALIGN(IndexTupleSize(vacposting->itup));
        if (!PageIndexTupleOverwrite(page, updatedoffsets[i], vacposting->itup, itemsz))
            elog(PANIC, "failed to update partially dead item");

        pfree(vacposting->itup);
        pfree(vacposting);

        /* advance to next xl_btree_update from array */
        updates = (xl_btree_update *)
            ((char *) updates + SizeOfBtreeUpdate +
             updates->ndeletedtids * sizeof(uint16));
    }
}

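/*
 * btree_xlog_vacuum -- replay VACUUM's deletions and posting list updates
 * on one leaf page
 */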
static void
btree_xlog_vacuum(XLogReaderState *record)
{
    XLogRecPtr lsn = record->EndRecPtr;
    xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
    Buffer buffer;
    Page page;
    BTPageOpaque opaque;

    /*
     * We need to take a cleanup lock here, just like btvacuumpage().  However,
     * it isn't necessary to exhaustively get a cleanup lock on every block in
     * the index during recovery (just getting a cleanup lock on pages with
     * items to kill suffices).  See nbtree/README for details.
     */
    if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
        == BLK_NEEDS_REDO)
    {
        char *ptr = XLogRecGetBlockData(record, 0, NULL);

        page = BufferGetPage(buffer);

        if (xlrec->nupdated > 0)
        {
            OffsetNumber *updatedoffsets;
            xl_btree_update *updates;

            updatedoffsets = (OffsetNumber *)
                (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
            updates = (xl_btree_update *) ((char *) updatedoffsets +
                                           xlrec->nupdated *
                                           sizeof(OffsetNumber));

            btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated);
        }

        if (xlrec->ndeleted > 0)
            PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);

        /*
         * Clear the vacuum cycle ID, and mark the page as not containing any
         * LP_DEAD items
         */
        opaque = BTPageGetOpaque(page);
        opaque->btpo_cycleid = 0;
        opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

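/*
 * btree_xlog_delete -- replay simple (non-VACUUM) index tuple deletion
 *
 * Unlike btree_xlog_vacuum, this may first need to resolve recovery
 * conflicts with standby snapshots, and it works without a cleanup lock.
 */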
static void
btree_xlog_delete(XLogReaderState *record)
{
    XLogRecPtr lsn = record->EndRecPtr;
    xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
    Buffer buffer;
    Page page;
    BTPageOpaque opaque;

    /*
     * If we have any conflict processing to do, it must happen before we
     * update the page
     */
    if (InHotStandby)
    {
        RelFileLocator rlocator;

        XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL);

        ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon,
                                            xlrec->isCatalogRel,
                                            rlocator);
    }

    /*
     * We don't need to take a cleanup lock to apply these changes.  See
     * nbtree/README for details.
     */
    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        char *ptr = XLogRecGetBlockData(record, 0, NULL);

        page = BufferGetPage(buffer);

        if (xlrec->nupdated > 0)
        {
            OffsetNumber *updatedoffsets;
            xl_btree_update *updates;

            updatedoffsets = (OffsetNumber *)
                (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
            updates = (xl_btree_update *) ((char *) updatedoffsets +
                                           xlrec->nupdated *
                                           sizeof(OffsetNumber));

            btree_xlog_updates(page, updatedoffsets, updates, xlrec->nupdated);
        }

        if (xlrec->ndeleted > 0)
            PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);

        /*
         * Do *not* clear the vacuum cycle ID, but do mark the page as not
         * containing any LP_DEAD items
         */
        opaque = BTPageGetOpaque(page);
        opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

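/*
 * btree_xlog_mark_page_halfdead -- replay the first stage of page deletion
 *
 * Removes the target leaf page's downlink from its parent and rewrites the
 * leaf as a half-dead page whose high key links to the top parent of the
 * to-be-deleted subtree.
 */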
static void
btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
{
    XLogRecPtr lsn = record->EndRecPtr;
    xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
    Buffer buffer;
    Page page;
    BTPageOpaque pageop;
    IndexTupleData trunctuple;

    /*
     * In normal operation, we would lock all the pages this WAL record
     * touches before changing any of them.  In WAL replay, it should be okay
     * to lock just one page at a time, since no concurrent index updates can
     * be happening, and readers should not care whether they arrive at the
     * target page or not (since it's surely empty).
     */

    /* to-be-deleted subtree's parent page */
    if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
    {
        OffsetNumber poffset;
        ItemId itemid;
        IndexTuple itup;
        OffsetNumber nextoffset;
        BlockNumber rightsib;

        page = BufferGetPage(buffer);
        pageop = BTPageGetOpaque(page);

        poffset = xlrec->poffset;

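        /*
         * Remove the downlink to the target: copy the right sibling's
         * downlink into the pivot tuple at poffset, then delete the right
         * sibling's now-redundant pivot tuple.
         */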
        nextoffset = OffsetNumberNext(poffset);
        itemid = PageGetItemId(page, nextoffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        rightsib = BTreeTupleGetDownLink(itup);

        itemid = PageGetItemId(page, poffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        BTreeTupleSetDownLink(itup, rightsib);
        nextoffset = OffsetNumberNext(poffset);
        PageIndexTupleDelete(page, nextoffset);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }

    /*
     * Don't need to couple cross-level locks in REDO routines, so release
     * lock on internal page immediately
     */
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);

    /* Rewrite the leaf page as a halfdead page */
    buffer = XLogInitBufferForRedo(record, 0);
    page = BufferGetPage(buffer);

    _bt_pageinit(page, BufferGetPageSize(buffer));
    pageop = BTPageGetOpaque(page);

    pageop->btpo_prev = xlrec->leftblk;
    pageop->btpo_next = xlrec->rightblk;
    pageop->btpo_level = 0;
    pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
    pageop->btpo_cycleid = 0;

    /*
     * Construct a dummy high key item that points to top parent page (value
     * is InvalidBlockNumber when the top parent page is the leaf page itself)
     */
    MemSet(&trunctuple, 0, sizeof(IndexTupleData));
    trunctuple.t_info = sizeof(IndexTupleData);
    BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

    if (PageAddItem(page, &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber)
        elog(ERROR, "could not add dummy high key to half-dead page");

    PageSetLSN(page, lsn);
    MarkBufferDirty(buffer);
    UnlockReleaseBuffer(buffer);
}


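/*
 * btree_xlog_unlink_page -- replay the second stage of page deletion
 *
 * Unlinks the target page from its siblings, rewrites it as an empty
 * deleted page, and, where necessary, updates the subtree's leaf page and
 * the metapage as well.
 */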
static void
btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
{
    XLogRecPtr lsn = record->EndRecPtr;
    xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
    BlockNumber leftsib;
    BlockNumber rightsib;
    uint32 level;
    bool isleaf;
    FullTransactionId safexid;
    Buffer leftbuf;
    Buffer target;
    Buffer rightbuf;
    Page page;
    BTPageOpaque pageop;

    leftsib = xlrec->leftsib;
    rightsib = xlrec->rightsib;
    level = xlrec->level;
    isleaf = (level == 0);
    safexid = xlrec->safexid;

    /* No leaftopparent for level 0 (leaf page) or level 1 target */
    Assert(!BlockNumberIsValid(xlrec->leaftopparent) || level > 1);

    /*
     * In normal operation, we would lock all the pages this WAL record
     * touches before changing any of them.  In WAL replay, we at least lock
     * the pages in the same standard left-to-right order (leftsib, target,
     * rightsib), and don't release the sibling locks until the target is
     * marked deleted.
     */

    /* Fix right-link of left sibling, if any */
    if (leftsib != P_NONE)
    {
        if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO)
        {
            page = BufferGetPage(leftbuf);
            pageop = BTPageGetOpaque(page);
            pageop->btpo_next = rightsib;

            PageSetLSN(page, lsn);
            MarkBufferDirty(leftbuf);
        }
    }
    else
        leftbuf = InvalidBuffer;

    /* Rewrite target page as empty deleted page */
    target = XLogInitBufferForRedo(record, 0);
    page = BufferGetPage(target);

    _bt_pageinit(page, BufferGetPageSize(target));
    pageop = BTPageGetOpaque(page);

    pageop->btpo_prev = leftsib;
    pageop->btpo_next = rightsib;
    pageop->btpo_level = level;
    BTPageSetDeleted(page, safexid);
    if (isleaf)
        pageop->btpo_flags |= BTP_LEAF;
    pageop->btpo_cycleid = 0;

    PageSetLSN(page, lsn);
    MarkBufferDirty(target);

    /* Fix left-link of right sibling */
    if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO)
    {
        page = BufferGetPage(rightbuf);
        pageop = BTPageGetOpaque(page);
        pageop->btpo_prev = leftsib;

        PageSetLSN(page, lsn);
        MarkBufferDirty(rightbuf);
    }

    /* Release siblings */
    if (BufferIsValid(leftbuf))
        UnlockReleaseBuffer(leftbuf);
    if (BufferIsValid(rightbuf))
        UnlockReleaseBuffer(rightbuf);

    /* Release target */
    UnlockReleaseBuffer(target);

    /*
     * If we deleted a parent of the targeted leaf page, instead of the leaf
     * itself, update the leaf to point to the next remaining child in the
     * to-be-deleted subtree
     */
    if (XLogRecHasBlockRef(record, 3))
    {
        /*
         * There is no real data on the page, so we just re-create it from
         * scratch using the information from the WAL record.
         *
         * Note that we don't end up here when the target page is also the
         * leafbuf page.  There is no need to add a dummy hikey item with a
         * top parent link when deleting leafbuf because it's the last page
         * we'll delete in the subtree undergoing deletion.
         */
        Buffer leafbuf;
        IndexTupleData trunctuple;

        Assert(!isleaf);

        leafbuf = XLogInitBufferForRedo(record, 3);
        page = BufferGetPage(leafbuf);

        _bt_pageinit(page, BufferGetPageSize(leafbuf));
        pageop = BTPageGetOpaque(page);

        pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
        pageop->btpo_prev = xlrec->leafleftsib;
        pageop->btpo_next = xlrec->leafrightsib;
        pageop->btpo_level = 0;
        pageop->btpo_cycleid = 0;

        /* Add a dummy hikey item */
        MemSet(&trunctuple, 0, sizeof(IndexTupleData));
        trunctuple.t_info = sizeof(IndexTupleData);
        BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent);

        if (PageAddItem(page, &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber)
            elog(ERROR, "could not add dummy high key to half-dead page");

        PageSetLSN(page, lsn);
        MarkBufferDirty(leafbuf);
        UnlockReleaseBuffer(leafbuf);
    }

    /* Update metapage if needed */
    if (info == XLOG_BTREE_UNLINK_PAGE_META)
        _bt_restore_meta(record, 4);
}

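/*
 * btree_xlog_newroot -- replay the creation of a new root page
 *
 * For a non-leaf root we also restore the downlink tuples carried in the
 * record and clear the incomplete-split flag in the left child.
 */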
static void
btree_xlog_newroot(XLogReaderState *record)
{
    XLogRecPtr lsn = record->EndRecPtr;
    xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
    Buffer buffer;
    Page page;
    BTPageOpaque pageop;
    char *ptr;
    Size len;

    buffer = XLogInitBufferForRedo(record, 0);
    page = BufferGetPage(buffer);

    _bt_pageinit(page, BufferGetPageSize(buffer));
    pageop = BTPageGetOpaque(page);

    pageop->btpo_flags = BTP_ROOT;
    pageop->btpo_prev = pageop->btpo_next = P_NONE;
    pageop->btpo_level = xlrec->level;
    if (xlrec->level == 0)
        pageop->btpo_flags |= BTP_LEAF;
    pageop->btpo_cycleid = 0;

    if (xlrec->level > 0)
    {
        ptr = XLogRecGetBlockData(record, 0, &len);
        _bt_restore_page(page, ptr, len);

        /* Clear the incomplete-split flag in left child */
        _bt_clear_incomplete_split(record, 1);
    }

    PageSetLSN(page, lsn);
    MarkBufferDirty(buffer);
    UnlockReleaseBuffer(buffer);

    _bt_restore_meta(record, 2);
}

/*
 * In general VACUUM must defer recycling as a way of avoiding certain race
 * conditions.  Deleted pages contain a safexid value that is used by VACUUM
 * to determine whether or not it's safe to place a page that was deleted by
 * VACUUM earlier into the FSM now.  See nbtree/README.
 *
 * As far as any backend operating during original execution is concerned, the
 * FSM is a cache of recycle-safe pages; the mere presence of the page in the
 * FSM indicates that the page must already be safe to recycle (actually,
 * _bt_allocbuf() verifies it's safe using BTPageIsRecyclable(), but that's
 * just because it would be unwise to completely trust the FSM, given its
 * current limitations).
 *
 * This isn't sufficient to prevent similar concurrent recycling race
 * conditions during Hot Standby, though.  For that we need to log a
 * xl_btree_reuse_page record at the point that a page is actually recycled
 * and reused for an entirely unrelated page inside _bt_split().  These
 * records include the same safexid value from the original deleted page,
 * stored in the record's snapshotConflictHorizon field.
 *
 * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used
 * to determine if it's safe to recycle a page.  This mirrors our own test:
 * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
 * Consequently, one XID value achieves the same exclusion effect on primary
 * and standby.
 */
static void
btree_xlog_reuse_page(XLogReaderState *record)
{
    xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);

    if (InHotStandby)
        ResolveRecoveryConflictWithSnapshotFullXid(xlrec->snapshotConflictHorizon,
                                                   xlrec->isCatalogRel,
                                                   xlrec->locator);
}

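/*
 * btree_redo -- dispatch a btree WAL record to the appropriate replay
 * function, running it in a short-lived per-record memory context
 */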
void
btree_redo(XLogReaderState *record)
{
    uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    MemoryContext oldCtx;

    oldCtx = MemoryContextSwitchTo(opCtx);
    switch (info)
    {
        case XLOG_BTREE_INSERT_LEAF:
            btree_xlog_insert(true, false, false, record);
            break;
        case XLOG_BTREE_INSERT_UPPER:
            btree_xlog_insert(false, false, false, record);
            break;
        case XLOG_BTREE_INSERT_META:
            btree_xlog_insert(false, true, false, record);
            break;
        case XLOG_BTREE_SPLIT_L:
            btree_xlog_split(true, record);
            break;
        case XLOG_BTREE_SPLIT_R:
            btree_xlog_split(false, record);
            break;
        case XLOG_BTREE_INSERT_POST:
            btree_xlog_insert(true, false, true, record);
            break;
        case XLOG_BTREE_DEDUP:
            btree_xlog_dedup(record);
            break;
        case XLOG_BTREE_VACUUM:
            btree_xlog_vacuum(record);
            break;
        case XLOG_BTREE_DELETE:
            btree_xlog_delete(record);
            break;
        case XLOG_BTREE_MARK_PAGE_HALFDEAD:
            btree_xlog_mark_page_halfdead(info, record);
            break;
        case XLOG_BTREE_UNLINK_PAGE:
        case XLOG_BTREE_UNLINK_PAGE_META:
            btree_xlog_unlink_page(info, record);
            break;
        case XLOG_BTREE_NEWROOT:
            btree_xlog_newroot(record);
            break;
        case XLOG_BTREE_REUSE_PAGE:
            btree_xlog_reuse_page(record);
            break;
        case XLOG_BTREE_META_CLEANUP:
            _bt_restore_meta(record, 0);
            break;
        default:
            elog(PANIC, "btree_redo: unknown op code %u", info);
    }
    MemoryContextSwitchTo(oldCtx);
    MemoryContextReset(opCtx);
}

void
btree_xlog_startup(void)
{
    opCtx = AllocSetContextCreate(CurrentMemoryContext,
                                  "Btree recovery temporary context",
                                  ALLOCSET_DEFAULT_SIZES);
}

void
btree_xlog_cleanup(void)
{
    MemoryContextDelete(opCtx);
    opCtx = NULL;
}

/*
 * Mask a btree page before performing consistency checks on it.
 */
void
btree_mask(char *pagedata, BlockNumber blkno)
{
    Page page = (Page) pagedata;
    BTPageOpaque maskopaq;

    mask_page_lsn_and_checksum(page);

    mask_page_hint_bits(page);
    mask_unused_space(page);

    maskopaq = BTPageGetOpaque(page);

    if (P_ISLEAF(maskopaq))
    {
        /*
         * In btree leaf pages, it is possible to modify the LP_FLAGS without
         * emitting any WAL record.  Hence, mask the line pointer flags.  See
         * _bt_killitems(), _bt_check_unique() for details.
         */
        mask_lp_flags(page);
    }

    /*
     * BTP_HAS_GARBAGE is just an un-logged hint bit.  So, mask it.  See
     * _bt_delete_or_dedup_one_page(), _bt_killitems(), and _bt_check_unique()
     * for details.
     */
    maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;

    /*
     * During replay of a btree page split, we don't set the BTP_SPLIT_END
     * flag on the right sibling, and we always initialize its btpo_cycleid
     * to 0.  Mask both fields.  See btree_xlog_split() for details.
     */
    maskopaq->btpo_flags &= ~BTP_SPLIT_END;
    maskopaq->btpo_cycleid = 0;
}