Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * generic_xlog.c
4 : * Implementation of generic xlog records.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * src/backend/access/transam/generic_xlog.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "access/bufmask.h"
17 : #include "access/generic_xlog.h"
18 : #include "access/xlogutils.h"
19 : #include "miscadmin.h"
20 :
21 : /*-------------------------------------------------------------------------
22 : * Internally, a delta between pages consists of a set of fragments. Each
23 : * fragment represents changes made in a given region of a page. A fragment
24 : * is made up as follows:
25 : *
26 : * - offset of page region (OffsetNumber)
27 : * - length of page region (OffsetNumber)
28 : * - data - the data to place into the region ('length' number of bytes)
29 : *
30 : * Unchanged regions of a page are not represented in its delta. As a result,
31 : * a delta can be more compact than the full page image. But having an
32 : * unchanged region between two fragments that is smaller than the fragment
33 : * header (offset+length) does not pay off in terms of the overall size of
34 : * the delta. For this reason, we merge adjacent fragments if the unchanged
35 : * region between them is <= MATCH_THRESHOLD bytes.
36 : *
37 : * We do not bother to merge fragments across the "lower" and "upper" parts
38 : * of a page; it's very seldom the case that pd_lower and pd_upper are within
39 : * MATCH_THRESHOLD bytes of each other, and handling that infrequent case
40 : * would complicate and slow down the delta-computation code unduly.
41 : * Therefore, the worst-case delta size includes two fragment headers plus
42 : * a full page's worth of data.
43 : *-------------------------------------------------------------------------
44 : */
/* Bytes taken by one fragment header: an offset followed by a length. */
#define FRAGMENT_HEADER_SIZE	(2 * sizeof(OffsetNumber))
/*
 * A run of matching bytes no longer than this is absorbed into the
 * surrounding fragment rather than ending it.
 */
#define MATCH_THRESHOLD			FRAGMENT_HEADER_SIZE
/* Worst-case delta size: a whole page of data plus two fragment headers. */
#define MAX_DELTA_SIZE			(BLCKSZ + 2 * FRAGMENT_HEADER_SIZE)
48 :
/*
 * Struct of generic xlog data for single page.
 *
 * One of these exists per slot in GenericXLogState; a slot whose buffer is
 * InvalidBuffer is unused.
 */
typedef struct
{
	Buffer		buffer;			/* registered buffer, or InvalidBuffer if the
								 * slot is free */
	int			flags;			/* flags for this buffer, as passed to
								 * GenericXLogRegisterBuffer */
	int			deltaLen;		/* space consumed in delta field */
	char	   *image;			/* copy of page image for modification, do not
								 * do it in-place to have aligned memory chunk */
	char		delta[MAX_DELTA_SIZE];	/* delta between page images */
} GenericXLogPageData;
59 :
/*
 * State of generic xlog record construction.  Must be allocated at an I/O
 * aligned address.
 *
 * pages[i].image is pointed at images[i].data by GenericXLogStart, so the
 * two arrays are always used in parallel.
 */
struct GenericXLogState
{
	/* Page images (properly aligned, must be first) */
	PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
	/* Info about each page, see above */
	GenericXLogPageData pages[MAX_GENERIC_XLOG_PAGES];
	/* Result of RelationNeedsWAL() for the relation given at start */
	bool		isLogged;
};
72 :
/* Prototypes for the file-local helpers defined below. */
static void writeFragment(GenericXLogPageData *pageData, OffsetNumber offset,
						  OffsetNumber length, const char *data);
static void computeRegionDelta(GenericXLogPageData *pageData,
							   const char *curpage, const char *targetpage,
							   int targetStart, int targetEnd,
							   int validStart, int validEnd);
static void computeDelta(GenericXLogPageData *pageData, Page curpage, Page targetpage);
static void applyPageRedo(Page page, const char *delta, Size deltaSize);
81 :
82 :
83 : /*
84 : * Write next fragment into pageData's delta.
85 : *
86 : * The fragment has the given offset and length, and data points to the
87 : * actual data (of length length).
88 : */
89 : static void
90 832930 : writeFragment(GenericXLogPageData *pageData, OffsetNumber offset, OffsetNumber length,
91 : const char *data)
92 : {
93 832930 : char *ptr = pageData->delta + pageData->deltaLen;
94 :
95 : /* Verify we have enough space */
96 : Assert(pageData->deltaLen + sizeof(offset) +
97 : sizeof(length) + length <= sizeof(pageData->delta));
98 :
99 : /* Write fragment data */
100 832930 : memcpy(ptr, &offset, sizeof(offset));
101 832930 : ptr += sizeof(offset);
102 832930 : memcpy(ptr, &length, sizeof(length));
103 832930 : ptr += sizeof(length);
104 832930 : memcpy(ptr, data, length);
105 832930 : ptr += length;
106 :
107 832930 : pageData->deltaLen = ptr - pageData->delta;
108 832930 : }
109 :
110 : /*
111 : * Compute the XLOG fragments needed to transform a region of curpage into the
112 : * corresponding region of targetpage, and append them to pageData's delta
113 : * field. The region to transform runs from targetStart to targetEnd-1.
114 : * Bytes in curpage outside the range validStart to validEnd-1 should be
115 : * considered invalid, and always overwritten with target data.
116 : *
117 : * This function is a hot spot, so it's worth being as tense as possible
118 : * about the data-matching loops.
119 : */
120 : static void
121 421260 : computeRegionDelta(GenericXLogPageData *pageData,
122 : const char *curpage, const char *targetpage,
123 : int targetStart, int targetEnd,
124 : int validStart, int validEnd)
125 : {
126 : int i,
127 : loopEnd,
128 421260 : fragmentBegin = -1,
129 421260 : fragmentEnd = -1;
130 :
131 : /* Deal with any invalid start region by including it in first fragment */
132 421260 : if (validStart > targetStart)
133 : {
134 0 : fragmentBegin = targetStart;
135 0 : targetStart = validStart;
136 : }
137 :
138 : /* We'll deal with any invalid end region after the main loop */
139 421260 : loopEnd = Min(targetEnd, validEnd);
140 :
141 : /* Examine all the potentially matchable bytes */
142 421260 : i = targetStart;
143 3382716 : while (i < loopEnd)
144 : {
145 2962312 : if (curpage[i] != targetpage[i])
146 : {
147 : /* On unmatched byte, start new fragment if not already in one */
148 2750144 : if (fragmentBegin < 0)
149 625156 : fragmentBegin = i;
150 : /* Mark unmatched-data endpoint as uncertain */
151 2750144 : fragmentEnd = -1;
152 : /* Extend the fragment as far as possible in a tight loop */
153 2750144 : i++;
154 4484838 : while (i < loopEnd && curpage[i] != targetpage[i])
155 1734694 : i++;
156 2750144 : if (i >= loopEnd)
157 856 : break;
158 : }
159 :
160 : /* Found a matched byte, so remember end of unmatched fragment */
161 2961456 : fragmentEnd = i;
162 :
163 : /*
164 : * Extend the match as far as possible in a tight loop. (On typical
165 : * workloads, this inner loop is the bulk of this function's runtime.)
166 : */
167 2961456 : i++;
168 1185630266 : while (i < loopEnd && curpage[i] == targetpage[i])
169 1182668810 : i++;
170 :
171 : /*
172 : * There are several possible cases at this point:
173 : *
174 : * 1. We have no unwritten fragment (fragmentBegin < 0). There's
175 : * nothing to write; and it doesn't matter what fragmentEnd is.
176 : *
177 : * 2. We found more than MATCH_THRESHOLD consecutive matching bytes.
178 : * Dump out the unwritten fragment, stopping at fragmentEnd.
179 : *
180 : * 3. The match extends to loopEnd. We'll do nothing here, exit the
181 : * loop, and then dump the unwritten fragment, after merging it with
182 : * the invalid end region if any. If we don't so merge, fragmentEnd
183 : * establishes how much the final writeFragment call needs to write.
184 : *
185 : * 4. We found an unmatched byte before loopEnd. The loop will repeat
186 : * and will enter the unmatched-byte stanza above. So in this case
187 : * also, it doesn't matter what fragmentEnd is. The matched bytes
188 : * will get merged into the continuing unmatched fragment.
189 : *
190 : * Only in case 3 do we reach the bottom of the loop with a meaningful
191 : * fragmentEnd value, which is why it's OK that we unconditionally
192 : * assign "fragmentEnd = i" above.
193 : */
194 2961456 : if (fragmentBegin >= 0 && i - fragmentEnd > MATCH_THRESHOLD)
195 : {
196 623854 : writeFragment(pageData, fragmentBegin,
197 623854 : fragmentEnd - fragmentBegin,
198 : targetpage + fragmentBegin);
199 623854 : fragmentBegin = -1;
200 623854 : fragmentEnd = -1; /* not really necessary */
201 : }
202 : }
203 :
204 : /* Deal with any invalid end region by including it in final fragment */
205 421260 : if (loopEnd < targetEnd)
206 : {
207 207774 : if (fragmentBegin < 0)
208 207774 : fragmentBegin = loopEnd;
209 207774 : fragmentEnd = targetEnd;
210 : }
211 :
212 : /* Write final fragment if any */
213 421260 : if (fragmentBegin >= 0)
214 : {
215 209076 : if (fragmentEnd < 0)
216 856 : fragmentEnd = targetEnd;
217 209076 : writeFragment(pageData, fragmentBegin,
218 209076 : fragmentEnd - fragmentBegin,
219 : targetpage + fragmentBegin);
220 : }
221 421260 : }
222 :
223 : /*
224 : * Compute the XLOG delta record needed to transform curpage into targetpage,
225 : * and store it in pageData's delta field.
226 : */
227 : static void
228 210630 : computeDelta(GenericXLogPageData *pageData, Page curpage, Page targetpage)
229 : {
230 210630 : int targetLower = ((PageHeader) targetpage)->pd_lower,
231 210630 : targetUpper = ((PageHeader) targetpage)->pd_upper,
232 210630 : curLower = ((PageHeader) curpage)->pd_lower,
233 210630 : curUpper = ((PageHeader) curpage)->pd_upper;
234 :
235 210630 : pageData->deltaLen = 0;
236 :
237 : /* Compute delta records for lower part of page ... */
238 210630 : computeRegionDelta(pageData, curpage, targetpage,
239 : 0, targetLower,
240 : 0, curLower);
241 : /* ... and for upper part, ignoring what's between */
242 210630 : computeRegionDelta(pageData, curpage, targetpage,
243 : targetUpper, BLCKSZ,
244 : curUpper, BLCKSZ);
245 :
246 : /*
247 : * If xlog debug is enabled, then check produced delta. Result of delta
248 : * application to curpage should be equivalent to targetpage.
249 : */
250 : #ifdef WAL_DEBUG
251 : if (XLOG_DEBUG)
252 : {
253 : PGAlignedBlock tmp;
254 :
255 : memcpy(tmp.data, curpage, BLCKSZ);
256 : applyPageRedo(tmp.data, pageData->delta, pageData->deltaLen);
257 : if (memcmp(tmp.data, targetpage, targetLower) != 0 ||
258 : memcmp(tmp.data + targetUpper, targetpage + targetUpper,
259 : BLCKSZ - targetUpper) != 0)
260 : elog(ERROR, "result of generic xlog apply does not match");
261 : }
262 : #endif
263 210630 : }
264 :
265 : /*
266 : * Start new generic xlog record for modifications to specified relation.
267 : */
268 : GenericXLogState *
269 210954 : GenericXLogStart(Relation relation)
270 : {
271 : GenericXLogState *state;
272 : int i;
273 :
274 210954 : state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState),
275 : PG_IO_ALIGN_SIZE,
276 : 0);
277 210954 : state->isLogged = RelationNeedsWAL(relation);
278 :
279 1054770 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
280 : {
281 843816 : state->pages[i].image = state->images[i].data;
282 843816 : state->pages[i].buffer = InvalidBuffer;
283 : }
284 :
285 210954 : return state;
286 : }
287 :
288 : /*
289 : * Register new buffer for generic xlog record.
290 : *
291 : * Returns pointer to the page's image in the GenericXLogState, which
292 : * is what the caller should modify.
293 : *
294 : * If the buffer is already registered, just return its existing entry.
295 : * (It's not very clear what to do with the flags in such a case, but
296 : * for now we stay with the original flags.)
297 : */
298 : Page
299 212470 : GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags)
300 : {
301 : int block_id;
302 :
303 : /* Search array for existing entry or first unused slot */
304 213986 : for (block_id = 0; block_id < MAX_GENERIC_XLOG_PAGES; block_id++)
305 : {
306 213986 : GenericXLogPageData *page = &state->pages[block_id];
307 :
308 213986 : if (BufferIsInvalid(page->buffer))
309 : {
310 : /* Empty slot, so use it (there cannot be a match later) */
311 212470 : page->buffer = buffer;
312 212470 : page->flags = flags;
313 212470 : memcpy(page->image, BufferGetPage(buffer), BLCKSZ);
314 212470 : return (Page) page->image;
315 : }
316 1516 : else if (page->buffer == buffer)
317 : {
318 : /*
319 : * Buffer is already registered. Just return the image, which is
320 : * already prepared.
321 : */
322 0 : return (Page) page->image;
323 : }
324 : }
325 :
326 0 : elog(ERROR, "maximum number %d of generic xlog buffers is exceeded",
327 : MAX_GENERIC_XLOG_PAGES);
328 : /* keep compiler quiet */
329 : return NULL;
330 : }
331 :
332 : /*
333 : * Apply changes represented by GenericXLogState to the actual buffers,
334 : * and emit a generic xlog record.
335 : */
336 : XLogRecPtr
337 209424 : GenericXLogFinish(GenericXLogState *state)
338 : {
339 : XLogRecPtr lsn;
340 : int i;
341 :
342 209424 : if (state->isLogged)
343 : {
344 : /* Logged relation: make xlog record in critical section. */
345 209412 : XLogBeginInsert();
346 :
347 209412 : START_CRIT_SECTION();
348 :
349 : /*
350 : * Compute deltas if necessary, write changes to buffers, mark buffers
351 : * dirty, and register changes.
352 : */
353 1047060 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
354 : {
355 837648 : GenericXLogPageData *pageData = &state->pages[i];
356 : Page page;
357 : PageHeader pageHeader;
358 :
359 837648 : if (BufferIsInvalid(pageData->buffer))
360 626720 : continue;
361 :
362 210928 : page = BufferGetPage(pageData->buffer);
363 210928 : pageHeader = (PageHeader) pageData->image;
364 :
365 : /*
366 : * Compute delta while we still have both the unmodified page and
367 : * the new image. Not needed if we are logging the full image.
368 : */
369 210928 : if (!(pageData->flags & GENERIC_XLOG_FULL_IMAGE))
370 210630 : computeDelta(pageData, page, (Page) pageData->image);
371 :
372 : /*
373 : * Apply the image, being careful to zero the "hole" between
374 : * pd_lower and pd_upper in order to avoid divergence between
375 : * actual page state and what replay would produce.
376 : */
377 210928 : memcpy(page, pageData->image, pageHeader->pd_lower);
378 210928 : memset(page + pageHeader->pd_lower, 0,
379 210928 : pageHeader->pd_upper - pageHeader->pd_lower);
380 210928 : memcpy(page + pageHeader->pd_upper,
381 210928 : pageData->image + pageHeader->pd_upper,
382 210928 : BLCKSZ - pageHeader->pd_upper);
383 :
384 210928 : MarkBufferDirty(pageData->buffer);
385 :
386 210928 : if (pageData->flags & GENERIC_XLOG_FULL_IMAGE)
387 : {
388 298 : XLogRegisterBuffer(i, pageData->buffer,
389 : REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
390 : }
391 : else
392 : {
393 210630 : XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD);
394 210630 : XLogRegisterBufData(i, pageData->delta, pageData->deltaLen);
395 : }
396 : }
397 :
398 : /* Insert xlog record */
399 209412 : lsn = XLogInsert(RM_GENERIC_ID, 0);
400 :
401 : /* Set LSN */
402 1047060 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
403 : {
404 837648 : GenericXLogPageData *pageData = &state->pages[i];
405 :
406 837648 : if (BufferIsInvalid(pageData->buffer))
407 626720 : continue;
408 210928 : PageSetLSN(BufferGetPage(pageData->buffer), lsn);
409 : }
410 209412 : END_CRIT_SECTION();
411 : }
412 : else
413 : {
414 : /* Unlogged relation: skip xlog-related stuff */
415 12 : START_CRIT_SECTION();
416 60 : for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
417 : {
418 48 : GenericXLogPageData *pageData = &state->pages[i];
419 :
420 48 : if (BufferIsInvalid(pageData->buffer))
421 36 : continue;
422 24 : memcpy(BufferGetPage(pageData->buffer),
423 12 : pageData->image,
424 : BLCKSZ);
425 : /* We don't worry about zeroing the "hole" in this case */
426 12 : MarkBufferDirty(pageData->buffer);
427 : }
428 12 : END_CRIT_SECTION();
429 : /* We don't have a LSN to return, in this case */
430 12 : lsn = InvalidXLogRecPtr;
431 : }
432 :
433 209424 : pfree(state);
434 :
435 209424 : return lsn;
436 : }
437 :
438 : /*
439 : * Abort generic xlog record construction. No changes are applied to buffers.
440 : *
441 : * Note: caller is responsible for releasing locks/pins on buffers, if needed.
442 : */
443 : void
444 1530 : GenericXLogAbort(GenericXLogState *state)
445 : {
446 1530 : pfree(state);
447 1530 : }
448 :
449 : /*
450 : * Apply delta to given page image.
451 : */
452 : static void
453 0 : applyPageRedo(Page page, const char *delta, Size deltaSize)
454 : {
455 0 : const char *ptr = delta;
456 0 : const char *end = delta + deltaSize;
457 :
458 0 : while (ptr < end)
459 : {
460 : OffsetNumber offset,
461 : length;
462 :
463 0 : memcpy(&offset, ptr, sizeof(offset));
464 0 : ptr += sizeof(offset);
465 0 : memcpy(&length, ptr, sizeof(length));
466 0 : ptr += sizeof(length);
467 :
468 0 : memcpy(page + offset, ptr, length);
469 :
470 0 : ptr += length;
471 : }
472 0 : }
473 :
474 : /*
475 : * Redo function for generic xlog record.
476 : */
477 : void
478 0 : generic_redo(XLogReaderState *record)
479 : {
480 0 : XLogRecPtr lsn = record->EndRecPtr;
481 : Buffer buffers[MAX_GENERIC_XLOG_PAGES];
482 : uint8 block_id;
483 :
484 : /* Protect limited size of buffers[] array */
485 : Assert(XLogRecMaxBlockId(record) < MAX_GENERIC_XLOG_PAGES);
486 :
487 : /* Iterate over blocks */
488 0 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
489 : {
490 : XLogRedoAction action;
491 :
492 0 : if (!XLogRecHasBlockRef(record, block_id))
493 : {
494 0 : buffers[block_id] = InvalidBuffer;
495 0 : continue;
496 : }
497 :
498 0 : action = XLogReadBufferForRedo(record, block_id, &buffers[block_id]);
499 :
500 : /* Apply redo to given block if needed */
501 0 : if (action == BLK_NEEDS_REDO)
502 : {
503 : Page page;
504 : PageHeader pageHeader;
505 : char *blockDelta;
506 : Size blockDeltaSize;
507 :
508 0 : page = BufferGetPage(buffers[block_id]);
509 0 : blockDelta = XLogRecGetBlockData(record, block_id, &blockDeltaSize);
510 0 : applyPageRedo(page, blockDelta, blockDeltaSize);
511 :
512 : /*
513 : * Since the delta contains no information about what's in the
514 : * "hole" between pd_lower and pd_upper, set that to zero to
515 : * ensure we produce the same page state that application of the
516 : * logged action by GenericXLogFinish did.
517 : */
518 0 : pageHeader = (PageHeader) page;
519 0 : memset(page + pageHeader->pd_lower, 0,
520 0 : pageHeader->pd_upper - pageHeader->pd_lower);
521 :
522 0 : PageSetLSN(page, lsn);
523 0 : MarkBufferDirty(buffers[block_id]);
524 : }
525 : }
526 :
527 : /* Changes are done: unlock and release all buffers */
528 0 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
529 : {
530 0 : if (BufferIsValid(buffers[block_id]))
531 0 : UnlockReleaseBuffer(buffers[block_id]);
532 : }
533 0 : }
534 :
535 : /*
536 : * Mask a generic page before performing consistency checks on it.
537 : */
538 : void
539 0 : generic_mask(char *page, BlockNumber blkno)
540 : {
541 0 : mask_page_lsn_and_checksum(page);
542 :
543 0 : mask_unused_space(page);
544 0 : }
|