LCOV - code coverage report
Current view: top level - src/bin/pg_rewind - parsexlog.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 119 148 80.4 %
Date: 2025-01-18 04:15:08 Functions: 5 5 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * parsexlog.c
       4             :  *    Functions for reading Write-Ahead-Log
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *-------------------------------------------------------------------------
      10             :  */
      11             : 
      12             : #include "postgres_fe.h"
      13             : 
      14             : #include <unistd.h>
      15             : 
      16             : #include "access/rmgr.h"
      17             : #include "access/xact.h"
      18             : #include "access/xlog_internal.h"
      19             : #include "access/xlogreader.h"
      20             : #include "catalog/pg_control.h"
      21             : #include "catalog/storage_xlog.h"
      22             : #include "commands/dbcommands_xlog.h"
      23             : #include "fe_utils/archive.h"
      24             : #include "filemap.h"
      25             : #include "pg_rewind.h"
      26             : 
      27             : /*
      28             :  * RmgrNames is an array of the built-in resource manager names, to make error
      29             :  * messages a bit nicer. PG_RMGR is defined to expand to just its 'name' argument, so including access/rmgrlist.h inside the initializer below fills in the array (presumably one PG_RMGR() entry per resource manager; rmgrlist.h is not shown here).
      30             :  */
      31             : #define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \
      32             :   name,
      33             : 
      34             : static const char *const RmgrNames[RM_MAX_ID + 1] = {
      35             : #include "access/rmgrlist.h"
      36             : };
      37             : 
      38             : #define RmgrName(rmid) (((rmid) <= RM_MAX_BUILTIN_ID) ? \
      39             :                         RmgrNames[rmid] : "custom") /* ids above RM_MAX_BUILTIN_ID are reported as "custom" instead of indexing the array */
      40             : 
      41             : static void extractPageInfo(XLogReaderState *record);
      42             : 
      43             : static int  xlogreadfd = -1;    /* fd of the WAL segment file currently open for reading, or -1 if none */
      44             : static XLogSegNo xlogreadsegno = 0; /* segment number of the page most recently requested from SimpleXLogPageRead */
      45             : static char xlogfpath[MAXPGPATH];   /* path of the segment file last opened; used in log messages */
      46             : 
      47             : typedef struct XLogPageReadPrivate  /* per-reader state handed to SimpleXLogPageRead via private_data */
      48             : {
      49             :     const char *restoreCommand; /* passed to RestoreArchivedFile() when a segment cannot be opened; NULL means no fallback */
      50             :     int         tliIndex;       /* index into targetHistory[]; moved by SimpleXLogPageRead when it opens a segment */
      51             : } XLogPageReadPrivate;
      52             : 
      53             : static int  SimpleXLogPageRead(XLogReaderState *xlogreader,
      54             :                                XLogRecPtr targetPagePtr,
      55             :                                int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
      56             : 
      57             : /*
      58             :  * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline
      59             :  * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
      60             :  * the data blocks touched by the WAL records by passing each record to extractPageInfo(). Nothing is returned; every failure is reported with pg_fatal().
      61             :  *
      62             :  * 'endpoint' is the end of the last record to read. The record starting at
      63             :  * 'endpoint' is the first one that is not read.
      64             :  */
      65             : void
      66          28 : extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
      67             :                XLogRecPtr endpoint, const char *restoreCommand)
      68             : {
      69             :     XLogRecord *record;
      70             :     XLogReaderState *xlogreader;
      71             :     char       *errormsg;
      72             :     XLogPageReadPrivate private;
      73             : 
      74          28 :     private.tliIndex = tliIndex;
      75          28 :     private.restoreCommand = restoreCommand;    /* may be NULL: SimpleXLogPageRead then fails on a segment it cannot open */
      76          28 :     xlogreader = XLogReaderAllocate(WalSegSz, datadir,
      77          28 :                                     XL_ROUTINE(.page_read = &SimpleXLogPageRead),
      78             :                                     &private);  /* read back by SimpleXLogPageRead from xlogreader->private_data */
      79          28 :     if (xlogreader == NULL)
      80           0 :         pg_fatal("out of memory while allocating a WAL reading processor");
      81             : 
      82          28 :     XLogBeginRead(xlogreader, startpoint);
      83             :     do
      84             :     {
      85      172886 :         record = XLogReadRecord(xlogreader, &errormsg);
      86             : 
      87      172886 :         if (record == NULL)
      88             :         {
      89           0 :             XLogRecPtr  errptr = xlogreader->EndRecPtr; /* position reported in the error messages below */
      90             : 
      91           0 :             if (errormsg)
      92           0 :                 pg_fatal("could not read WAL record at %X/%X: %s",
      93             :                          LSN_FORMAT_ARGS(errptr),
      94             :                          errormsg);
      95             :             else
      96           0 :                 pg_fatal("could not read WAL record at %X/%X",
      97             :                          LSN_FORMAT_ARGS(errptr));
      98             :         }
      99             : 
     100      172886 :         extractPageInfo(xlogreader);    /* note the blocks referenced by the record just read */
     101      172886 :     } while (xlogreader->EndRecPtr < endpoint); /* stop once the record just read ends at or past 'endpoint' */
     102             : 
     103             :     /*
     104             :      * If 'endpoint' didn't point exactly at a record boundary, the caller
     105             :      * messed up.
     106             :      */
     107          28 :     if (xlogreader->EndRecPtr != endpoint)
     108           0 :         pg_fatal("end pointer %X/%X is not a valid end point; expected %X/%X",
     109             :                  LSN_FORMAT_ARGS(endpoint), LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
     110             : 
     111          28 :     XLogReaderFree(xlogreader);
     112          28 :     if (xlogreadfd != -1)   /* SimpleXLogPageRead leaves the last segment file open; close it here */
     113             :     {
     114          28 :         close(xlogreadfd);
     115          28 :         xlogreadfd = -1;
     116             :     }
     117          28 : }
     118             : 
     119             : /*
     120             :  * Reads the one WAL record starting at 'ptr'. Returns the end position of the record (the reader's EndRecPtr), without
     121             :  * doing anything with the record itself. A failed read is reported with pg_fatal().
     122             :  */
     123             : XLogRecPtr
     124          28 : readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
     125             :               const char *restoreCommand)
     126             : {
     127             :     XLogRecord *record;
     128             :     XLogReaderState *xlogreader;
     129             :     char       *errormsg;
     130             :     XLogPageReadPrivate private;
     131             :     XLogRecPtr  endptr;
     132             : 
     133          28 :     private.tliIndex = tliIndex;
     134          28 :     private.restoreCommand = restoreCommand;    /* may be NULL: SimpleXLogPageRead then fails on a segment it cannot open */
     135          28 :     xlogreader = XLogReaderAllocate(WalSegSz, datadir,
     136          28 :                                     XL_ROUTINE(.page_read = &SimpleXLogPageRead),
     137             :                                     &private);  /* read back by SimpleXLogPageRead from xlogreader->private_data */
     138          28 :     if (xlogreader == NULL)
     139           0 :         pg_fatal("out of memory while allocating a WAL reading processor");
     140             : 
     141          28 :     XLogBeginRead(xlogreader, ptr);
     142          28 :     record = XLogReadRecord(xlogreader, &errormsg);
     143          28 :     if (record == NULL)
     144             :     {
     145           0 :         if (errormsg)
     146           0 :             pg_fatal("could not read WAL record at %X/%X: %s",
     147             :                      LSN_FORMAT_ARGS(ptr), errormsg);
     148             :         else
     149           0 :             pg_fatal("could not read WAL record at %X/%X",
     150             :                      LSN_FORMAT_ARGS(ptr));
     151             :     }
     152          28 :     endptr = xlogreader->EndRecPtr; /* save the result before the reader is freed */
     153             : 
     154          28 :     XLogReaderFree(xlogreader);
     155          28 :     if (xlogreadfd != -1)   /* SimpleXLogPageRead leaves the segment file open; close it here */
     156             :     {
     157          28 :         close(xlogreadfd);
     158          28 :         xlogreadfd = -1;
     159             :     }
     160             : 
     161          28 :     return endptr;
     162             : }
     163             : 
     164             : /*
     165             :  * Find the previous checkpoint preceding given WAL location 'forkptr'. On return, *lastchkptrec holds the checkpoint record's position, and *lastchkpttli and *lastchkptredo hold the ThisTimeLineID and redo fields copied from it. Every WAL segment file read during the search is registered with keepwal_add_entry(). A failed read is reported with pg_fatal().
     166             :  */
     167             : void
     168          28 : findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
     169             :                    XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
     170             :                    XLogRecPtr *lastchkptredo, const char *restoreCommand)
     171             : {
     172             :     /* Walk backwards, starting from the given record */
     173             :     XLogRecord *record;
     174             :     XLogRecPtr  searchptr;
     175             :     XLogReaderState *xlogreader;
     176             :     char       *errormsg;
     177             :     XLogPageReadPrivate private;
     178          28 :     XLogSegNo   current_segno = 0;  /* segment number most recently registered with keepwal_add_entry() */
     179          28 :     TimeLineID  current_tli = 0;    /* timeline of that segment */
     180             : 
     181             :     /*
     182             :      * The given fork pointer points to the end of the last common record,
     183             :      * which is not necessarily the beginning of the next record, if the
     184             :      * previous record happens to end at a page boundary. Skip over the page
     185             :      * header in that case to find the next record.
     186             :      */
     187          28 :     if (forkptr % XLOG_BLCKSZ == 0)
     188             :     {
     189           4 :         if (XLogSegmentOffset(forkptr, WalSegSz) == 0)  /* first page of a segment: skip the long page header */
     190           4 :             forkptr += SizeOfXLogLongPHD;
     191             :         else
     192           0 :             forkptr += SizeOfXLogShortPHD;
     193             :     }
     194             : 
     195          28 :     private.tliIndex = tliIndex;
     196          28 :     private.restoreCommand = restoreCommand;    /* may be NULL: SimpleXLogPageRead then fails on a segment it cannot open */
     197          28 :     xlogreader = XLogReaderAllocate(WalSegSz, datadir,
     198          28 :                                     XL_ROUTINE(.page_read = &SimpleXLogPageRead),
     199             :                                     &private);  /* read back by SimpleXLogPageRead from xlogreader->private_data */
     200          28 :     if (xlogreader == NULL)
     201           0 :         pg_fatal("out of memory while allocating a WAL reading processor");
     202             : 
     203          28 :     searchptr = forkptr;
     204             :     for (;;)
     205        5252 :     {
     206             :         uint8       info;
     207             : 
     208        5280 :         XLogBeginRead(xlogreader, searchptr);   /* reposition every iteration: records are visited via xl_prev, not sequentially */
     209        5280 :         record = XLogReadRecord(xlogreader, &errormsg);
     210             : 
     211        5280 :         if (record == NULL)
     212             :         {
     213           0 :             if (errormsg)
     214           0 :                 pg_fatal("could not find previous WAL record at %X/%X: %s",
     215             :                          LSN_FORMAT_ARGS(searchptr),
     216             :                          errormsg);
     217             :             else
     218           0 :                 pg_fatal("could not find previous WAL record at %X/%X",
     219             :                          LSN_FORMAT_ARGS(searchptr));
     220             :         }
     221             : 
     222             :         /* Detect if a new WAL file has been opened */
     223        5280 :         if (xlogreader->seg.ws_tli != current_tli ||
     224        5252 :             xlogreader->seg.ws_segno != current_segno)
     225             :         {
     226             :             char        xlogfname[MAXFNAMELEN];
     227             : 
     228          38 :             snprintf(xlogfname, MAXFNAMELEN, XLOGDIR "/");  /* buffer now holds the directory prefix followed by '/' */
     229             : 
     230             :             /* update current values */
     231          38 :             current_tli = xlogreader->seg.ws_tli;
     232          38 :             current_segno = xlogreader->seg.ws_segno;
     233             : 
     234          38 :             XLogFileName(xlogfname + sizeof(XLOGDIR),   /* sizeof counts the literal's NUL, so this lands just past the '/' */
     235             :                          current_tli, current_segno, WalSegSz);
     236             : 
     237             :             /* Track this filename as one to not remove */
     238          38 :             keepwal_add_entry(xlogfname);
     239             :         }
     240             : 
     241             :         /*
     242             :          * Check if it is a checkpoint record. This checkpoint record needs to
     243             :          * be the latest checkpoint before WAL forked and not the checkpoint
     244             :          * where the primary has been stopped to be rewound.
     245             :          */
     246        5280 :         info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; /* keep only the bits outside XLR_INFO_MASK */
     247        5280 :         if (searchptr < forkptr &&  /* the record at forkptr itself never qualifies */
     248        5252 :             XLogRecGetRmid(xlogreader) == RM_XLOG_ID &&
     249        3786 :             (info == XLOG_CHECKPOINT_SHUTDOWN ||
     250             :              info == XLOG_CHECKPOINT_ONLINE))
     251             :         {
     252             :             CheckPoint  checkPoint;
     253             : 
     254          28 :             memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); /* copy the record data into a local struct before reading its fields */
     255          28 :             *lastchkptrec = searchptr;
     256          28 :             *lastchkpttli = checkPoint.ThisTimeLineID;
     257          28 :             *lastchkptredo = checkPoint.redo;
     258          28 :             break;
     259             :         }
     260             : 
     261             :         /* Walk backwards to previous record. */
     262        5252 :         searchptr = record->xl_prev;
     263             :     }
     264             : 
     265          28 :     XLogReaderFree(xlogreader);
     266          28 :     if (xlogreadfd != -1)   /* SimpleXLogPageRead leaves the last segment file open; close it here */
     267             :     {
     268          28 :         close(xlogreadfd);
     269          28 :         xlogreadfd = -1;
     270             :     }
     271          28 : }
     272             : 
     273             : /* XLogReader callback function, to read a WAL page: reads the XLOG_BLCKSZ-byte page at 'targetPagePtr' into 'readBuf' and returns XLOG_BLCKSZ, or logs an error and returns -1. Opens (or restores from archive) the segment file as needed, keeping it open in xlogreadfd for the next call. 'reqLen' and 'targetRecPtr' are not used. */
     274             : static int
     275       11642 : SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
     276             :                    int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
     277             : {
     278       11642 :     XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
     279             :     uint32      targetPageOff;
     280             :     XLogRecPtr  targetSegEnd;
     281             :     XLogSegNo   targetSegNo;
     282             :     int         r;
     283             : 
     284       11642 :     XLByteToSeg(targetPagePtr, targetSegNo, WalSegSz);
     285       11642 :     XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, WalSegSz, targetSegEnd);    /* position at offset 0 of the following segment */
     286       11642 :     targetPageOff = XLogSegmentOffset(targetPagePtr, WalSegSz); /* byte offset of the page within its segment file */
     287             : 
     288             :     /*
     289             :      * See if we need to switch to a new segment because the requested record
     290             :      * is not in the currently open one.
     291             :      */
     292       11642 :     if (xlogreadfd >= 0 &&
     293       11558 :         !XLByteInSeg(targetPagePtr, xlogreadsegno, WalSegSz))
     294             :     {
     295          20 :         close(xlogreadfd);
     296          20 :         xlogreadfd = -1;
     297             :     }
     298             : 
     299       11642 :     XLByteToSeg(targetPagePtr, xlogreadsegno, WalSegSz);    /* remember which segment the requested page belongs to */
     300             : 
     301       11642 :     if (xlogreadfd < 0)
     302             :     {
     303             :         char        xlogfname[MAXFNAMELEN];
     304             : 
     305             :         /*
     306             :          * Since incomplete segments are copied into next timelines, switch to
     307             :          * the timeline holding the required segment. Assuming this scan can
     308             :          * be done both forward and backward, consider also switching timeline
     309             :          * accordingly.
     310             :          */
     311         108 :         while (private->tliIndex < targetNentries - 1 &&
     312           4 :                targetHistory[private->tliIndex].end < targetSegEnd)
     313           4 :             private->tliIndex++;    /* this timeline ends before the segment does: move to a later history entry */
     314         104 :         while (private->tliIndex > 0 &&
     315          12 :                targetHistory[private->tliIndex].begin >= targetSegEnd)
     316           0 :             private->tliIndex--;    /* this timeline begins at or after the segment's end: move to an earlier history entry */
     317             : 
     318         104 :         XLogFileName(xlogfname, targetHistory[private->tliIndex].tli,
     319             :                      xlogreadsegno, WalSegSz);
     320             : 
     321         104 :         snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s",
     322         104 :                  xlogreader->segcxt.ws_dir, xlogfname);
     323             : 
     324         104 :         xlogreadfd = open(xlogfpath, O_RDONLY | PG_BINARY, 0);
     325             : 
     326         104 :         if (xlogreadfd < 0)
     327             :         {
     328             :             /*
     329             :              * If we have no restore_command to execute, then exit.
     330             :              */
     331           2 :             if (private->restoreCommand == NULL)
     332             :             {
     333           0 :                 pg_log_error("could not open file \"%s\": %m", xlogfpath);
     334           0 :                 return -1;
     335             :             }
     336             : 
     337             :             /*
     338             :              * Since we have restore_command, then try to retrieve missing WAL
     339             :              * file from the archive.
     340             :              */
     341           2 :             xlogreadfd = RestoreArchivedFile(xlogreader->segcxt.ws_dir,
     342             :                                              xlogfname,
     343             :                                              WalSegSz,
     344             :                                              private->restoreCommand);
     345             : 
     346           2 :             if (xlogreadfd < 0)
     347           0 :                 return -1;  /* no message is logged here; NOTE(review): presumably RestoreArchivedFile reports its own failure -- confirm */
     348             :             else
     349           2 :                 pg_log_debug("using file \"%s\" restored from archive",
     350             :                              xlogfpath);
     351             :         }
     352             :     }
     353             : 
     354             :     /*
     355             :      * At this point, we have the right segment open.
     356             :      */
     357             :     Assert(xlogreadfd != -1);
     358             : 
     359             :     /* Read the requested page */
     360       11642 :     if (lseek(xlogreadfd, (off_t) targetPageOff, SEEK_SET) < 0)
     361             :     {
     362           0 :         pg_log_error("could not seek in file \"%s\": %m", xlogfpath);
     363           0 :         return -1;
     364             :     }
     365             : 
     366             : 
     367       11642 :     r = read(xlogreadfd, readBuf, XLOG_BLCKSZ);
     368       11642 :     if (r != XLOG_BLCKSZ)   /* a short read is treated as a failure, not retried */
     369             :     {
     370           0 :         if (r < 0)
     371           0 :             pg_log_error("could not read file \"%s\": %m", xlogfpath);
     372             :         else
     373           0 :             pg_log_error("could not read file \"%s\": read %d of %zu",
     374             :                          xlogfpath, r, (Size) XLOG_BLCKSZ);
     375             : 
     376           0 :         return -1;
     377             :     }
     378             : 
     379             :     Assert(targetSegNo == xlogreadsegno);
     380             : 
     381       11642 :     xlogreader->seg.ws_tli = targetHistory[private->tliIndex].tli;  /* record the timeline the page came from; findLastCheckpoint reads seg.ws_tli */
     382       11642 :     return XLOG_BLCKSZ;
     383             : }
     384             : 
     385             : /*
     386             :  * Extract information on which blocks the current record modifies. Each main-fork block reference is passed to process_target_wal_block_change(). A record flagged XLR_SPECIAL_REL_UPDATE whose type is not one of those recognized below is a fatal error.
     387             :  */
     388             : static void
     389      172886 : extractPageInfo(XLogReaderState *record)
     390             : {
     391             :     int         block_id;
     392      172886 :     RmgrId      rmid = XLogRecGetRmid(record);
     393      172886 :     uint8       info = XLogRecGetInfo(record);
     394      172886 :     uint8       rminfo = info & ~XLR_INFO_MASK; /* info with the XLR_INFO_MASK bits cleared; compared against rmgr-specific record types */
     395             : 
     396             :     /* Is this a special record type that I recognize? */
     397             : 
     398      172886 :     if (rmid == RM_DBASE_ID && rminfo == XLOG_DBASE_CREATE_FILE_COPY)
     399             :     {
     400             :         /*
     401             :          * New databases can be safely ignored. It won't be present in the
     402             :          * source system, so it will be deleted. There's one corner-case,
     403             :          * though: if a new, different, database is also created in the source
     404             :          * system, we'll see that the files already exist and not copy them.
     405             :          * That's OK, though; WAL replay of creating the new database, from
     406             :          * the source system's WAL, will re-copy the new database,
     407             :          * overwriting the database created in the target system.
     408             :          */
     409             :     }
     410      172886 :     else if (rmid == RM_DBASE_ID && rminfo == XLOG_DBASE_CREATE_WAL_LOG)
     411             :     {
     412             :         /*
     413             :          * New databases can be safely ignored. It won't be present in the
     414             :          * source system, so it will be deleted.
     415             :          */
     416             :     }
     417      172878 :     else if (rmid == RM_DBASE_ID && rminfo == XLOG_DBASE_DROP)
     418             :     {
     419             :         /*
     420             :          * An existing database was dropped. We'll see that the files don't
     421             :          * exist in the target data dir, and copy them in toto from the source
     422             :          * system. No need to do anything special here.
     423             :          */
     424             :     }
     425      172878 :     else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_CREATE)
     426             :     {
     427             :         /*
     428             :          * We can safely ignore these. The file will be removed from the
     429             :          * target, if it doesn't exist in source system. If a file with same
     430             :          * name is created in source system, too, there will be WAL records
     431             :          * for all the blocks in it.
     432             :          */
     433             :     }
     434      170484 :     else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_TRUNCATE)
     435             :     {
     436             :         /*
     437             :          * We can safely ignore these. When we compare the sizes later on,
     438             :          * we'll notice that they differ, and copy the missing tail from
     439             :          * source system.
     440             :          */
     441             :     }
     442      170476 :     else if (rmid == RM_XACT_ID &&
     443          86 :              ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT ||
     444           0 :               (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED ||
     445           0 :               (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT ||
     446           0 :               (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT_PREPARED))
     447             :     {
     448             :         /*
     449             :          * These records can include "dropped rels". We can safely ignore
     450             :          * them, we will see that they are missing and copy them from the
     451             :          * source.
     452             :          */
     453             :     }
     454      170390 :     else if (info & XLR_SPECIAL_REL_UPDATE) /* tested on the unmasked info byte, unlike the rminfo comparisons above */
     455             :     {
     456             :         /*
     457             :          * This record type modifies a relation file in some special way, but
     458             :          * we don't recognize the type. That's bad - we don't know how to
     459             :          * track that change.
     460             :          */
     461           0 :         pg_fatal("WAL record modifies a relation, but record type is not recognized: "
     462             :                  "lsn: %X/%X, rmid: %d, rmgr: %s, info: %02X",
     463             :                  LSN_FORMAT_ARGS(record->ReadRecPtr),
     464             :                  rmid, RmgrName(rmid), info);
     465             :     }
     466             : 
     467      343824 :     for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)   /* runs for every record, including the ignored types above */
     468             :     {
     469             :         RelFileLocator rlocator;
     470             :         ForkNumber  forknum;
     471             :         BlockNumber blkno;
     472             : 
     473      170938 :         if (!XLogRecGetBlockTagExtended(record, block_id,
     474             :                                         &rlocator, &forknum, &blkno, NULL))
     475        1898 :             continue;   /* NOTE(review): presumably false means this block_id is not in use in the record -- confirm */
     476             : 
     477             :         /* We only care about the main fork; others are copied in toto */
     478      170938 :         if (forknum != MAIN_FORKNUM)
     479        1898 :             continue;
     480             : 
     481      169040 :         process_target_wal_block_change(forknum, rlocator, blkno);
     482             :     }
     483      172886 : }

Generated by: LCOV version 1.14