Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * parsexlog.c
4 : * Functions for reading Write-Ahead-Log
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *-------------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres_fe.h"
13 :
14 : #include <unistd.h>
15 :
16 : #include "access/rmgr.h"
17 : #include "access/xact.h"
18 : #include "access/xlog_internal.h"
19 : #include "access/xlogreader.h"
20 : #include "catalog/pg_control.h"
21 : #include "catalog/storage_xlog.h"
22 : #include "commands/dbcommands_xlog.h"
23 : #include "fe_utils/archive.h"
24 : #include "filemap.h"
25 : #include "pg_rewind.h"
26 :
27 : /*
28 : * RmgrNames is an array of the built-in resource manager names, to make error
29 : * messages a bit nicer.
30 : */
31 : #define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \
32 : name,
33 :
34 : static const char *const RmgrNames[RM_MAX_ID + 1] = {
35 : #include "access/rmgrlist.h"
36 : };
37 :
38 : #define RmgrName(rmid) (((rmid) <= RM_MAX_BUILTIN_ID) ? \
39 : RmgrNames[rmid] : "custom")
40 :
41 : static void extractPageInfo(XLogReaderState *record);
42 :
43 : static int xlogreadfd = -1;
44 : static XLogSegNo xlogreadsegno = 0;
45 : static char xlogfpath[MAXPGPATH];
46 :
47 : typedef struct XLogPageReadPrivate
48 : {
49 : const char *restoreCommand;
50 : int tliIndex;
51 : } XLogPageReadPrivate;
52 :
53 : static int SimpleXLogPageRead(XLogReaderState *xlogreader,
54 : XLogRecPtr targetPagePtr,
55 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
56 :
57 : /*
58 : * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline
59 : * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
60 : * the data blocks touched by the WAL records, and return them in a page map.
61 : *
62 : * 'endpoint' is the end of the last record to read. The record starting at
63 : * 'endpoint' is the first one that is not read.
64 : */
65 : void
66 28 : extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
67 : XLogRecPtr endpoint, const char *restoreCommand)
68 : {
69 : XLogRecord *record;
70 : XLogReaderState *xlogreader;
71 : char *errormsg;
72 : XLogPageReadPrivate private;
73 :
74 28 : private.tliIndex = tliIndex;
75 28 : private.restoreCommand = restoreCommand;
76 28 : xlogreader = XLogReaderAllocate(WalSegSz, datadir,
77 28 : XL_ROUTINE(.page_read = &SimpleXLogPageRead),
78 : &private);
79 28 : if (xlogreader == NULL)
80 0 : pg_fatal("out of memory while allocating a WAL reading processor");
81 :
82 28 : XLogBeginRead(xlogreader, startpoint);
83 : do
84 : {
85 172886 : record = XLogReadRecord(xlogreader, &errormsg);
86 :
87 172886 : if (record == NULL)
88 : {
89 0 : XLogRecPtr errptr = xlogreader->EndRecPtr;
90 :
91 0 : if (errormsg)
92 0 : pg_fatal("could not read WAL record at %X/%X: %s",
93 : LSN_FORMAT_ARGS(errptr),
94 : errormsg);
95 : else
96 0 : pg_fatal("could not read WAL record at %X/%X",
97 : LSN_FORMAT_ARGS(errptr));
98 : }
99 :
100 172886 : extractPageInfo(xlogreader);
101 172886 : } while (xlogreader->EndRecPtr < endpoint);
102 :
103 : /*
104 : * If 'endpoint' didn't point exactly at a record boundary, the caller
105 : * messed up.
106 : */
107 28 : if (xlogreader->EndRecPtr != endpoint)
108 0 : pg_fatal("end pointer %X/%X is not a valid end point; expected %X/%X",
109 : LSN_FORMAT_ARGS(endpoint), LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
110 :
111 28 : XLogReaderFree(xlogreader);
112 28 : if (xlogreadfd != -1)
113 : {
114 28 : close(xlogreadfd);
115 28 : xlogreadfd = -1;
116 : }
117 28 : }
118 :
119 : /*
120 : * Reads one WAL record. Returns the end position of the record, without
121 : * doing anything with the record itself.
122 : */
123 : XLogRecPtr
124 28 : readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
125 : const char *restoreCommand)
126 : {
127 : XLogRecord *record;
128 : XLogReaderState *xlogreader;
129 : char *errormsg;
130 : XLogPageReadPrivate private;
131 : XLogRecPtr endptr;
132 :
133 28 : private.tliIndex = tliIndex;
134 28 : private.restoreCommand = restoreCommand;
135 28 : xlogreader = XLogReaderAllocate(WalSegSz, datadir,
136 28 : XL_ROUTINE(.page_read = &SimpleXLogPageRead),
137 : &private);
138 28 : if (xlogreader == NULL)
139 0 : pg_fatal("out of memory while allocating a WAL reading processor");
140 :
141 28 : XLogBeginRead(xlogreader, ptr);
142 28 : record = XLogReadRecord(xlogreader, &errormsg);
143 28 : if (record == NULL)
144 : {
145 0 : if (errormsg)
146 0 : pg_fatal("could not read WAL record at %X/%X: %s",
147 : LSN_FORMAT_ARGS(ptr), errormsg);
148 : else
149 0 : pg_fatal("could not read WAL record at %X/%X",
150 : LSN_FORMAT_ARGS(ptr));
151 : }
152 28 : endptr = xlogreader->EndRecPtr;
153 :
154 28 : XLogReaderFree(xlogreader);
155 28 : if (xlogreadfd != -1)
156 : {
157 28 : close(xlogreadfd);
158 28 : xlogreadfd = -1;
159 : }
160 :
161 28 : return endptr;
162 : }
163 :
164 : /*
165 : * Find the previous checkpoint preceding given WAL location.
166 : */
167 : void
168 28 : findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
169 : XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
170 : XLogRecPtr *lastchkptredo, const char *restoreCommand)
171 : {
172 : /* Walk backwards, starting from the given record */
173 : XLogRecord *record;
174 : XLogRecPtr searchptr;
175 : XLogReaderState *xlogreader;
176 : char *errormsg;
177 : XLogPageReadPrivate private;
178 28 : XLogSegNo current_segno = 0;
179 28 : TimeLineID current_tli = 0;
180 :
181 : /*
182 : * The given fork pointer points to the end of the last common record,
183 : * which is not necessarily the beginning of the next record, if the
184 : * previous record happens to end at a page boundary. Skip over the page
185 : * header in that case to find the next record.
186 : */
187 28 : if (forkptr % XLOG_BLCKSZ == 0)
188 : {
189 4 : if (XLogSegmentOffset(forkptr, WalSegSz) == 0)
190 4 : forkptr += SizeOfXLogLongPHD;
191 : else
192 0 : forkptr += SizeOfXLogShortPHD;
193 : }
194 :
195 28 : private.tliIndex = tliIndex;
196 28 : private.restoreCommand = restoreCommand;
197 28 : xlogreader = XLogReaderAllocate(WalSegSz, datadir,
198 28 : XL_ROUTINE(.page_read = &SimpleXLogPageRead),
199 : &private);
200 28 : if (xlogreader == NULL)
201 0 : pg_fatal("out of memory while allocating a WAL reading processor");
202 :
203 28 : searchptr = forkptr;
204 : for (;;)
205 5252 : {
206 : uint8 info;
207 :
208 5280 : XLogBeginRead(xlogreader, searchptr);
209 5280 : record = XLogReadRecord(xlogreader, &errormsg);
210 :
211 5280 : if (record == NULL)
212 : {
213 0 : if (errormsg)
214 0 : pg_fatal("could not find previous WAL record at %X/%X: %s",
215 : LSN_FORMAT_ARGS(searchptr),
216 : errormsg);
217 : else
218 0 : pg_fatal("could not find previous WAL record at %X/%X",
219 : LSN_FORMAT_ARGS(searchptr));
220 : }
221 :
222 : /* Detect if a new WAL file has been opened */
223 5280 : if (xlogreader->seg.ws_tli != current_tli ||
224 5252 : xlogreader->seg.ws_segno != current_segno)
225 : {
226 : char xlogfname[MAXFNAMELEN];
227 :
228 38 : snprintf(xlogfname, MAXFNAMELEN, XLOGDIR "/");
229 :
230 : /* update current values */
231 38 : current_tli = xlogreader->seg.ws_tli;
232 38 : current_segno = xlogreader->seg.ws_segno;
233 :
234 38 : XLogFileName(xlogfname + sizeof(XLOGDIR),
235 : current_tli, current_segno, WalSegSz);
236 :
237 : /* Track this filename as one to not remove */
238 38 : keepwal_add_entry(xlogfname);
239 : }
240 :
241 : /*
242 : * Check if it is a checkpoint record. This checkpoint record needs to
243 : * be the latest checkpoint before WAL forked and not the checkpoint
244 : * where the primary has been stopped to be rewound.
245 : */
246 5280 : info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
247 5280 : if (searchptr < forkptr &&
248 5252 : XLogRecGetRmid(xlogreader) == RM_XLOG_ID &&
249 3786 : (info == XLOG_CHECKPOINT_SHUTDOWN ||
250 : info == XLOG_CHECKPOINT_ONLINE))
251 : {
252 : CheckPoint checkPoint;
253 :
254 28 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
255 28 : *lastchkptrec = searchptr;
256 28 : *lastchkpttli = checkPoint.ThisTimeLineID;
257 28 : *lastchkptredo = checkPoint.redo;
258 28 : break;
259 : }
260 :
261 : /* Walk backwards to previous record. */
262 5252 : searchptr = record->xl_prev;
263 : }
264 :
265 28 : XLogReaderFree(xlogreader);
266 28 : if (xlogreadfd != -1)
267 : {
268 28 : close(xlogreadfd);
269 28 : xlogreadfd = -1;
270 : }
271 28 : }
272 :
273 : /* XLogReader callback function, to read a WAL page */
274 : static int
275 11642 : SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
276 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
277 : {
278 11642 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
279 : uint32 targetPageOff;
280 : XLogRecPtr targetSegEnd;
281 : XLogSegNo targetSegNo;
282 : int r;
283 :
284 11642 : XLByteToSeg(targetPagePtr, targetSegNo, WalSegSz);
285 11642 : XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, WalSegSz, targetSegEnd);
286 11642 : targetPageOff = XLogSegmentOffset(targetPagePtr, WalSegSz);
287 :
288 : /*
289 : * See if we need to switch to a new segment because the requested record
290 : * is not in the currently open one.
291 : */
292 11642 : if (xlogreadfd >= 0 &&
293 11558 : !XLByteInSeg(targetPagePtr, xlogreadsegno, WalSegSz))
294 : {
295 20 : close(xlogreadfd);
296 20 : xlogreadfd = -1;
297 : }
298 :
299 11642 : XLByteToSeg(targetPagePtr, xlogreadsegno, WalSegSz);
300 :
301 11642 : if (xlogreadfd < 0)
302 : {
303 : char xlogfname[MAXFNAMELEN];
304 :
305 : /*
306 : * Since incomplete segments are copied into next timelines, switch to
307 : * the timeline holding the required segment. Assuming this scan can
308 : * be done both forward and backward, consider also switching timeline
309 : * accordingly.
310 : */
311 108 : while (private->tliIndex < targetNentries - 1 &&
312 4 : targetHistory[private->tliIndex].end < targetSegEnd)
313 4 : private->tliIndex++;
314 104 : while (private->tliIndex > 0 &&
315 12 : targetHistory[private->tliIndex].begin >= targetSegEnd)
316 0 : private->tliIndex--;
317 :
318 104 : XLogFileName(xlogfname, targetHistory[private->tliIndex].tli,
319 : xlogreadsegno, WalSegSz);
320 :
321 104 : snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s",
322 104 : xlogreader->segcxt.ws_dir, xlogfname);
323 :
324 104 : xlogreadfd = open(xlogfpath, O_RDONLY | PG_BINARY, 0);
325 :
326 104 : if (xlogreadfd < 0)
327 : {
328 : /*
329 : * If we have no restore_command to execute, then exit.
330 : */
331 2 : if (private->restoreCommand == NULL)
332 : {
333 0 : pg_log_error("could not open file \"%s\": %m", xlogfpath);
334 0 : return -1;
335 : }
336 :
337 : /*
338 : * Since we have restore_command, then try to retrieve missing WAL
339 : * file from the archive.
340 : */
341 2 : xlogreadfd = RestoreArchivedFile(xlogreader->segcxt.ws_dir,
342 : xlogfname,
343 : WalSegSz,
344 : private->restoreCommand);
345 :
346 2 : if (xlogreadfd < 0)
347 0 : return -1;
348 : else
349 2 : pg_log_debug("using file \"%s\" restored from archive",
350 : xlogfpath);
351 : }
352 : }
353 :
354 : /*
355 : * At this point, we have the right segment open.
356 : */
357 : Assert(xlogreadfd != -1);
358 :
359 : /* Read the requested page */
360 11642 : if (lseek(xlogreadfd, (off_t) targetPageOff, SEEK_SET) < 0)
361 : {
362 0 : pg_log_error("could not seek in file \"%s\": %m", xlogfpath);
363 0 : return -1;
364 : }
365 :
366 :
367 11642 : r = read(xlogreadfd, readBuf, XLOG_BLCKSZ);
368 11642 : if (r != XLOG_BLCKSZ)
369 : {
370 0 : if (r < 0)
371 0 : pg_log_error("could not read file \"%s\": %m", xlogfpath);
372 : else
373 0 : pg_log_error("could not read file \"%s\": read %d of %zu",
374 : xlogfpath, r, (Size) XLOG_BLCKSZ);
375 :
376 0 : return -1;
377 : }
378 :
379 : Assert(targetSegNo == xlogreadsegno);
380 :
381 11642 : xlogreader->seg.ws_tli = targetHistory[private->tliIndex].tli;
382 11642 : return XLOG_BLCKSZ;
383 : }
384 :
385 : /*
386 : * Extract information on which blocks the current record modifies.
387 : */
388 : static void
389 172886 : extractPageInfo(XLogReaderState *record)
390 : {
391 : int block_id;
392 172886 : RmgrId rmid = XLogRecGetRmid(record);
393 172886 : uint8 info = XLogRecGetInfo(record);
394 172886 : uint8 rminfo = info & ~XLR_INFO_MASK;
395 :
396 : /* Is this a special record type that I recognize? */
397 :
398 172886 : if (rmid == RM_DBASE_ID && rminfo == XLOG_DBASE_CREATE_FILE_COPY)
399 : {
400 : /*
401 : * New databases can be safely ignored. It won't be present in the
402 : * source system, so it will be deleted. There's one corner-case,
403 : * though: if a new, different, database is also created in the source
404 : * system, we'll see that the files already exist and not copy them.
405 : * That's OK, though; WAL replay of creating the new database, from
406 : * the source systems's WAL, will re-copy the new database,
407 : * overwriting the database created in the target system.
408 : */
409 : }
410 172886 : else if (rmid == RM_DBASE_ID && rminfo == XLOG_DBASE_CREATE_WAL_LOG)
411 : {
412 : /*
413 : * New databases can be safely ignored. It won't be present in the
414 : * source system, so it will be deleted.
415 : */
416 : }
417 172878 : else if (rmid == RM_DBASE_ID && rminfo == XLOG_DBASE_DROP)
418 : {
419 : /*
420 : * An existing database was dropped. We'll see that the files don't
421 : * exist in the target data dir, and copy them in toto from the source
422 : * system. No need to do anything special here.
423 : */
424 : }
425 172878 : else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_CREATE)
426 : {
427 : /*
428 : * We can safely ignore these. The file will be removed from the
429 : * target, if it doesn't exist in source system. If a file with same
430 : * name is created in source system, too, there will be WAL records
431 : * for all the blocks in it.
432 : */
433 : }
434 170484 : else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_TRUNCATE)
435 : {
436 : /*
437 : * We can safely ignore these. When we compare the sizes later on,
438 : * we'll notice that they differ, and copy the missing tail from
439 : * source system.
440 : */
441 : }
442 170476 : else if (rmid == RM_XACT_ID &&
443 86 : ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT ||
444 0 : (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED ||
445 0 : (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT ||
446 0 : (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT_PREPARED))
447 : {
448 : /*
449 : * These records can include "dropped rels". We can safely ignore
450 : * them, we will see that they are missing and copy them from the
451 : * source.
452 : */
453 : }
454 170390 : else if (info & XLR_SPECIAL_REL_UPDATE)
455 : {
456 : /*
457 : * This record type modifies a relation file in some special way, but
458 : * we don't recognize the type. That's bad - we don't know how to
459 : * track that change.
460 : */
461 0 : pg_fatal("WAL record modifies a relation, but record type is not recognized: "
462 : "lsn: %X/%X, rmid: %d, rmgr: %s, info: %02X",
463 : LSN_FORMAT_ARGS(record->ReadRecPtr),
464 : rmid, RmgrName(rmid), info);
465 : }
466 :
467 343824 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
468 : {
469 : RelFileLocator rlocator;
470 : ForkNumber forknum;
471 : BlockNumber blkno;
472 :
473 170938 : if (!XLogRecGetBlockTagExtended(record, block_id,
474 : &rlocator, &forknum, &blkno, NULL))
475 1898 : continue;
476 :
477 : /* We only care about the main fork; others are copied in toto */
478 170938 : if (forknum != MAIN_FORKNUM)
479 1898 : continue;
480 :
481 169040 : process_target_wal_block_change(forknum, rlocator, blkno);
482 : }
483 172886 : }
|