Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * parsexlog.c
4 : * Functions for reading Write-Ahead-Log
5 : *
6 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *-------------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres_fe.h"
13 :
14 : #include <unistd.h>
15 :
16 : #include "access/rmgr.h"
17 : #include "access/xact.h"
18 : #include "access/xlog_internal.h"
19 : #include "access/xlogreader.h"
20 : #include "catalog/pg_control.h"
21 : #include "catalog/storage_xlog.h"
22 : #include "commands/dbcommands_xlog.h"
23 : #include "fe_utils/archive.h"
24 : #include "filemap.h"
25 : #include "pg_rewind.h"
26 :
/*
 * RmgrNames is an array of the built-in resource manager names, to make error
 * messages a bit nicer.
 *
 * The table is generated by redefining PG_RMGR so that each entry of
 * access/rmgrlist.h expands to just its 'name' argument followed by a comma.
 */
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \
  name,

static const char *const RmgrNames[RM_MAX_ID + 1] = {
#include "access/rmgrlist.h"
};

/*
 * Printable name for a resource manager ID: the RmgrNames entry for IDs up to
 * RM_MAX_BUILTIN_ID, and the fixed string "custom" for anything above that.
 */
#define RmgrName(rmid) (((rmid) <= RM_MAX_BUILTIN_ID) ? \
						RmgrNames[rmid] : "custom")
40 :
/* Records the blocks touched by the record currently held by the reader. */
static void extractPageInfo(XLogReaderState *record);

/*
 * State of the WAL segment file used by SimpleXLogPageRead().  It is shared
 * by every reader created in this file; each public entry point closes the
 * descriptor and resets it to -1 before returning.
 */
static int	xlogreadfd = -1;	/* open segment file, or -1 if none */
static XLogSegNo xlogreadsegno = 0; /* segment number last requested */
static char xlogfpath[MAXPGPATH];	/* path of that segment, for messages */

/*
 * Private data handed to the page-read callback through
 * XLogReaderState->private_data.
 */
typedef struct XLogPageReadPrivate
{
	/* command used to fetch missing segments from archive; NULL if none */
	const char *restoreCommand;
	/* current index into targetHistory; adjusted by the callback */
	int			tliIndex;
} XLogPageReadPrivate;

/* XLogReader page_read callback; defined further down in this file. */
static int	SimpleXLogPageRead(XLogReaderState *xlogreader,
							   XLogRecPtr targetPagePtr,
							   int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
56 :
57 : /*
58 : * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline
59 : * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
60 : * the data blocks touched by the WAL records, and return them in a page map.
61 : *
62 : * 'endpoint' is the end of the last record to read. The record starting at
63 : * 'endpoint' is the first one that is not read.
64 : */
65 : void
66 26 : extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
67 : XLogRecPtr endpoint, const char *restoreCommand)
68 : {
69 : XLogRecord *record;
70 : XLogReaderState *xlogreader;
71 : char *errormsg;
72 : XLogPageReadPrivate private;
73 :
74 26 : private.tliIndex = tliIndex;
75 26 : private.restoreCommand = restoreCommand;
76 26 : xlogreader = XLogReaderAllocate(WalSegSz, datadir,
77 26 : XL_ROUTINE(.page_read = &SimpleXLogPageRead),
78 : &private);
79 26 : if (xlogreader == NULL)
80 0 : pg_fatal("out of memory while allocating a WAL reading processor");
81 :
82 26 : XLogBeginRead(xlogreader, startpoint);
83 : do
84 : {
85 172604 : record = XLogReadRecord(xlogreader, &errormsg);
86 :
87 172604 : if (record == NULL)
88 : {
89 0 : XLogRecPtr errptr = xlogreader->EndRecPtr;
90 :
91 0 : if (errormsg)
92 0 : pg_fatal("could not read WAL record at %X/%X: %s",
93 : LSN_FORMAT_ARGS(errptr),
94 : errormsg);
95 : else
96 0 : pg_fatal("could not read WAL record at %X/%X",
97 : LSN_FORMAT_ARGS(errptr));
98 : }
99 :
100 172604 : extractPageInfo(xlogreader);
101 172604 : } while (xlogreader->EndRecPtr < endpoint);
102 :
103 : /*
104 : * If 'endpoint' didn't point exactly at a record boundary, the caller
105 : * messed up.
106 : */
107 26 : if (xlogreader->EndRecPtr != endpoint)
108 0 : pg_fatal("end pointer %X/%X is not a valid end point; expected %X/%X",
109 : LSN_FORMAT_ARGS(endpoint), LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
110 :
111 26 : XLogReaderFree(xlogreader);
112 26 : if (xlogreadfd != -1)
113 : {
114 26 : close(xlogreadfd);
115 26 : xlogreadfd = -1;
116 : }
117 26 : }
118 :
119 : /*
120 : * Reads one WAL record. Returns the end position of the record, without
121 : * doing anything with the record itself.
122 : */
123 : XLogRecPtr
124 26 : readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
125 : const char *restoreCommand)
126 : {
127 : XLogRecord *record;
128 : XLogReaderState *xlogreader;
129 : char *errormsg;
130 : XLogPageReadPrivate private;
131 : XLogRecPtr endptr;
132 :
133 26 : private.tliIndex = tliIndex;
134 26 : private.restoreCommand = restoreCommand;
135 26 : xlogreader = XLogReaderAllocate(WalSegSz, datadir,
136 26 : XL_ROUTINE(.page_read = &SimpleXLogPageRead),
137 : &private);
138 26 : if (xlogreader == NULL)
139 0 : pg_fatal("out of memory while allocating a WAL reading processor");
140 :
141 26 : XLogBeginRead(xlogreader, ptr);
142 26 : record = XLogReadRecord(xlogreader, &errormsg);
143 26 : if (record == NULL)
144 : {
145 0 : if (errormsg)
146 0 : pg_fatal("could not read WAL record at %X/%X: %s",
147 : LSN_FORMAT_ARGS(ptr), errormsg);
148 : else
149 0 : pg_fatal("could not read WAL record at %X/%X",
150 : LSN_FORMAT_ARGS(ptr));
151 : }
152 26 : endptr = xlogreader->EndRecPtr;
153 :
154 26 : XLogReaderFree(xlogreader);
155 26 : if (xlogreadfd != -1)
156 : {
157 26 : close(xlogreadfd);
158 26 : xlogreadfd = -1;
159 : }
160 :
161 26 : return endptr;
162 : }
163 :
164 : /*
165 : * Find the previous checkpoint preceding given WAL location.
166 : */
167 : void
168 26 : findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
169 : XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
170 : XLogRecPtr *lastchkptredo, const char *restoreCommand)
171 : {
172 : /* Walk backwards, starting from the given record */
173 : XLogRecord *record;
174 : XLogRecPtr searchptr;
175 : XLogReaderState *xlogreader;
176 : char *errormsg;
177 : XLogPageReadPrivate private;
178 :
179 : /*
180 : * The given fork pointer points to the end of the last common record,
181 : * which is not necessarily the beginning of the next record, if the
182 : * previous record happens to end at a page boundary. Skip over the page
183 : * header in that case to find the next record.
184 : */
185 26 : if (forkptr % XLOG_BLCKSZ == 0)
186 : {
187 4 : if (XLogSegmentOffset(forkptr, WalSegSz) == 0)
188 4 : forkptr += SizeOfXLogLongPHD;
189 : else
190 0 : forkptr += SizeOfXLogShortPHD;
191 : }
192 :
193 26 : private.tliIndex = tliIndex;
194 26 : private.restoreCommand = restoreCommand;
195 26 : xlogreader = XLogReaderAllocate(WalSegSz, datadir,
196 26 : XL_ROUTINE(.page_read = &SimpleXLogPageRead),
197 : &private);
198 26 : if (xlogreader == NULL)
199 0 : pg_fatal("out of memory while allocating a WAL reading processor");
200 :
201 26 : searchptr = forkptr;
202 : for (;;)
203 5074 : {
204 : uint8 info;
205 :
206 5100 : XLogBeginRead(xlogreader, searchptr);
207 5100 : record = XLogReadRecord(xlogreader, &errormsg);
208 :
209 5100 : if (record == NULL)
210 : {
211 0 : if (errormsg)
212 0 : pg_fatal("could not find previous WAL record at %X/%X: %s",
213 : LSN_FORMAT_ARGS(searchptr),
214 : errormsg);
215 : else
216 0 : pg_fatal("could not find previous WAL record at %X/%X",
217 : LSN_FORMAT_ARGS(searchptr));
218 : }
219 :
220 : /*
221 : * Check if it is a checkpoint record. This checkpoint record needs to
222 : * be the latest checkpoint before WAL forked and not the checkpoint
223 : * where the primary has been stopped to be rewound.
224 : */
225 5100 : info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
226 5100 : if (searchptr < forkptr &&
227 5074 : XLogRecGetRmid(xlogreader) == RM_XLOG_ID &&
228 3706 : (info == XLOG_CHECKPOINT_SHUTDOWN ||
229 : info == XLOG_CHECKPOINT_ONLINE))
230 : {
231 : CheckPoint checkPoint;
232 :
233 26 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
234 26 : *lastchkptrec = searchptr;
235 26 : *lastchkpttli = checkPoint.ThisTimeLineID;
236 26 : *lastchkptredo = checkPoint.redo;
237 26 : break;
238 : }
239 :
240 : /* Walk backwards to previous record. */
241 5074 : searchptr = record->xl_prev;
242 : }
243 :
244 26 : XLogReaderFree(xlogreader);
245 26 : if (xlogreadfd != -1)
246 : {
247 26 : close(xlogreadfd);
248 26 : xlogreadfd = -1;
249 : }
250 26 : }
251 :
252 : /* XLogReader callback function, to read a WAL page */
253 : static int
254 11330 : SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
255 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
256 : {
257 11330 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
258 : uint32 targetPageOff;
259 : XLogRecPtr targetSegEnd;
260 : XLogSegNo targetSegNo;
261 : int r;
262 :
263 11330 : XLByteToSeg(targetPagePtr, targetSegNo, WalSegSz);
264 11330 : XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, WalSegSz, targetSegEnd);
265 11330 : targetPageOff = XLogSegmentOffset(targetPagePtr, WalSegSz);
266 :
267 : /*
268 : * See if we need to switch to a new segment because the requested record
269 : * is not in the currently open one.
270 : */
271 11330 : if (xlogreadfd >= 0 &&
272 11252 : !XLByteInSeg(targetPagePtr, xlogreadsegno, WalSegSz))
273 : {
274 16 : close(xlogreadfd);
275 16 : xlogreadfd = -1;
276 : }
277 :
278 11330 : XLByteToSeg(targetPagePtr, xlogreadsegno, WalSegSz);
279 :
280 11330 : if (xlogreadfd < 0)
281 : {
282 : char xlogfname[MAXFNAMELEN];
283 :
284 : /*
285 : * Since incomplete segments are copied into next timelines, switch to
286 : * the timeline holding the required segment. Assuming this scan can
287 : * be done both forward and backward, consider also switching timeline
288 : * accordingly.
289 : */
290 98 : while (private->tliIndex < targetNentries - 1 &&
291 4 : targetHistory[private->tliIndex].end < targetSegEnd)
292 4 : private->tliIndex++;
293 94 : while (private->tliIndex > 0 &&
294 12 : targetHistory[private->tliIndex].begin >= targetSegEnd)
295 0 : private->tliIndex--;
296 :
297 94 : XLogFileName(xlogfname, targetHistory[private->tliIndex].tli,
298 : xlogreadsegno, WalSegSz);
299 :
300 94 : snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s",
301 94 : xlogreader->segcxt.ws_dir, xlogfname);
302 :
303 94 : xlogreadfd = open(xlogfpath, O_RDONLY | PG_BINARY, 0);
304 :
305 94 : if (xlogreadfd < 0)
306 : {
307 : /*
308 : * If we have no restore_command to execute, then exit.
309 : */
310 2 : if (private->restoreCommand == NULL)
311 : {
312 0 : pg_log_error("could not open file \"%s\": %m", xlogfpath);
313 0 : return -1;
314 : }
315 :
316 : /*
317 : * Since we have restore_command, then try to retrieve missing WAL
318 : * file from the archive.
319 : */
320 2 : xlogreadfd = RestoreArchivedFile(xlogreader->segcxt.ws_dir,
321 : xlogfname,
322 : WalSegSz,
323 : private->restoreCommand);
324 :
325 2 : if (xlogreadfd < 0)
326 0 : return -1;
327 : else
328 2 : pg_log_debug("using file \"%s\" restored from archive",
329 : xlogfpath);
330 : }
331 : }
332 :
333 : /*
334 : * At this point, we have the right segment open.
335 : */
336 : Assert(xlogreadfd != -1);
337 :
338 : /* Read the requested page */
339 11330 : if (lseek(xlogreadfd, (off_t) targetPageOff, SEEK_SET) < 0)
340 : {
341 0 : pg_log_error("could not seek in file \"%s\": %m", xlogfpath);
342 0 : return -1;
343 : }
344 :
345 :
346 11330 : r = read(xlogreadfd, readBuf, XLOG_BLCKSZ);
347 11330 : if (r != XLOG_BLCKSZ)
348 : {
349 0 : if (r < 0)
350 0 : pg_log_error("could not read file \"%s\": %m", xlogfpath);
351 : else
352 0 : pg_log_error("could not read file \"%s\": read %d of %zu",
353 : xlogfpath, r, (Size) XLOG_BLCKSZ);
354 :
355 0 : return -1;
356 : }
357 :
358 : Assert(targetSegNo == xlogreadsegno);
359 :
360 11330 : xlogreader->seg.ws_tli = targetHistory[private->tliIndex].tli;
361 11330 : return XLOG_BLCKSZ;
362 : }
363 :
364 : /*
365 : * Extract information on which blocks the current record modifies.
366 : */
367 : static void
368 172604 : extractPageInfo(XLogReaderState *record)
369 : {
370 : int block_id;
371 172604 : RmgrId rmid = XLogRecGetRmid(record);
372 172604 : uint8 info = XLogRecGetInfo(record);
373 172604 : uint8 rminfo = info & ~XLR_INFO_MASK;
374 :
375 : /* Is this a special record type that I recognize? */
376 :
377 172604 : if (rmid == RM_DBASE_ID && rminfo == XLOG_DBASE_CREATE_FILE_COPY)
378 : {
379 : /*
380 : * New databases can be safely ignored. It won't be present in the
381 : * source system, so it will be deleted. There's one corner-case,
382 : * though: if a new, different, database is also created in the source
383 : * system, we'll see that the files already exist and not copy them.
384 : * That's OK, though; WAL replay of creating the new database, from
385 : * the source systems's WAL, will re-copy the new database,
386 : * overwriting the database created in the target system.
387 : */
388 : }
389 172604 : else if (rmid == RM_DBASE_ID && rminfo == XLOG_DBASE_CREATE_WAL_LOG)
390 : {
391 : /*
392 : * New databases can be safely ignored. It won't be present in the
393 : * source system, so it will be deleted.
394 : */
395 : }
396 172596 : else if (rmid == RM_DBASE_ID && rminfo == XLOG_DBASE_DROP)
397 : {
398 : /*
399 : * An existing database was dropped. We'll see that the files don't
400 : * exist in the target data dir, and copy them in toto from the source
401 : * system. No need to do anything special here.
402 : */
403 : }
404 172596 : else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_CREATE)
405 : {
406 : /*
407 : * We can safely ignore these. The file will be removed from the
408 : * target, if it doesn't exist in source system. If a file with same
409 : * name is created in source system, too, there will be WAL records
410 : * for all the blocks in it.
411 : */
412 : }
413 170220 : else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_TRUNCATE)
414 : {
415 : /*
416 : * We can safely ignore these. When we compare the sizes later on,
417 : * we'll notice that they differ, and copy the missing tail from
418 : * source system.
419 : */
420 : }
421 170212 : else if (rmid == RM_XACT_ID &&
422 82 : ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT ||
423 0 : (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED ||
424 0 : (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT ||
425 0 : (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT_PREPARED))
426 : {
427 : /*
428 : * These records can include "dropped rels". We can safely ignore
429 : * them, we will see that they are missing and copy them from the
430 : * source.
431 : */
432 : }
433 170130 : else if (info & XLR_SPECIAL_REL_UPDATE)
434 : {
435 : /*
436 : * This record type modifies a relation file in some special way, but
437 : * we don't recognize the type. That's bad - we don't know how to
438 : * track that change.
439 : */
440 0 : pg_fatal("WAL record modifies a relation, but record type is not recognized: "
441 : "lsn: %X/%X, rmid: %d, rmgr: %s, info: %02X",
442 : LSN_FORMAT_ARGS(record->ReadRecPtr),
443 : rmid, RmgrName(rmid), info);
444 : }
445 :
446 343286 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
447 : {
448 : RelFileLocator rlocator;
449 : ForkNumber forknum;
450 : BlockNumber blkno;
451 :
452 170682 : if (!XLogRecGetBlockTagExtended(record, block_id,
453 : &rlocator, &forknum, &blkno, NULL))
454 1896 : continue;
455 :
456 : /* We only care about the main fork; others are copied in toto */
457 170682 : if (forknum != MAIN_FORKNUM)
458 1896 : continue;
459 :
460 168786 : process_target_wal_block_change(forknum, rlocator, blkno);
461 : }
462 172604 : }
|