Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * basebackup.c
4 : * code for taking a base backup and streaming it to a standby
5 : *
6 : * Portions Copyright (c) 2010-2026, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/backend/backup/basebackup.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "postgres.h"
14 :
15 : #include <sys/stat.h>
16 : #include <unistd.h>
17 : #include <time.h>
18 :
19 : #include "access/xlog_internal.h"
20 : #include "access/xlogbackup.h"
21 : #include "backup/backup_manifest.h"
22 : #include "backup/basebackup.h"
23 : #include "backup/basebackup_incremental.h"
24 : #include "backup/basebackup_sink.h"
25 : #include "backup/basebackup_target.h"
26 : #include "catalog/pg_tablespace_d.h"
27 : #include "commands/defrem.h"
28 : #include "common/compression.h"
29 : #include "common/file_perm.h"
30 : #include "common/file_utils.h"
31 : #include "lib/stringinfo.h"
32 : #include "miscadmin.h"
33 : #include "nodes/pg_list.h"
34 : #include "pgstat.h"
35 : #include "pgtar.h"
36 : #include "postmaster/syslogger.h"
37 : #include "postmaster/walsummarizer.h"
38 : #include "replication/slot.h"
39 : #include "replication/walsender.h"
40 : #include "replication/walsender_private.h"
41 : #include "storage/bufpage.h"
42 : #include "storage/checksum.h"
43 : #include "storage/dsm_impl.h"
44 : #include "storage/ipc.h"
45 : #include "storage/reinit.h"
46 : #include "utils/builtins.h"
47 : #include "utils/guc.h"
48 : #include "utils/ps_status.h"
49 : #include "utils/relcache.h"
50 : #include "utils/resowner.h"
51 :
/*
 * How much data do we want to send in one CopyData message? Note that
 * this may also result in reading the underlying files in chunks of this
 * size.
 *
 * NB: The buffer size is required to be a multiple of the system block
 * size, so use that value instead if it's bigger than our preference.
 *
 * perform_base_backup() hands this value to bbsink_begin_backup() as the
 * requested buffer length.
 */
#define SINK_BUFFER_LENGTH Max(32768, BLCKSZ)
61 :
/*
 * Options of one base backup request, as filled in by
 * parse_basebackup_options() and consumed by perform_base_backup().
 */
typedef struct
{
	const char *label;			/* "label" option; "base backup" if omitted */
	bool progress;				/* "progress": estimate total size up front */
	bool fastcheckpoint;		/* "checkpoint": true for "fast", false for
								 * "spread" */
	bool nowait;				/* inverse of the "wait" option */
	bool includewal;			/* "wal": append WAL files to the main archive */
	bool incremental;			/* "incremental" option */
	uint32 maxrate;				/* "max_rate", already range-checked; 0 if the
								 * option was not given */
	bool sendtblspcmapfile;		/* "tablespace_map": send the tablespace_map
								 * file */
	bool send_to_client;		/* set when no target, or target "client", was
								 * requested */
	bool use_copytblspc;		/* set only when no "target" option was given */
	BaseBackupTargetHandle *target_handle;	/* set for any target other than
											 * "client" */
	backup_manifest_option manifest;	/* "manifest": no, yes or force-encode */
	pg_compress_algorithm compression;	/* "compression" algorithm */
	pg_compress_specification compression_specification;	/* parsed from
															 * "compression_detail" */
	pg_checksum_type manifest_checksum_type;	/* "manifest_checksums";
												 * forced to none when no
												 * manifest is requested */
} basebackup_options;
80 :
/*
 * Number of zero-filled tar blocks written to terminate an archive.  The
 * blocks are staged in the sink's buffer before being sent, so verify at
 * compile time that they fit within BLCKSZ bytes (SINK_BUFFER_LENGTH is
 * never smaller than that).
 */
#define TAR_NUM_TERMINATION_BLOCKS 2

StaticAssertDecl(TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE <= BLCKSZ,
"BLCKSZ too small for " CppAsString2(TAR_NUM_TERMINATION_BLOCKS) " tar termination blocks");
85 :
/*
 * Prototypes for the functions private to this file, declared up front so
 * that they can be defined and called in any order below.
 */
static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly,
struct backup_manifest_info *manifest,
IncrementalBackupInfo *ib);
static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
List *tablespaces, bool sendtblspclinks,
backup_manifest_info *manifest, Oid spcoid,
IncrementalBackupInfo *ib);
static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
struct stat *statbuf, bool missing_ok,
Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
unsigned segno,
backup_manifest_info *manifest,
unsigned num_incremental_blocks,
BlockNumber *incremental_blocks,
unsigned truncation_block_length);
static off_t read_file_data_into_buffer(bbsink *sink,
const char *readfilename, int fd,
off_t offset, size_t length,
BlockNumber blkno,
bool verify_checksum,
int *checksum_failures);
static void push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx,
size_t *bytes_done, void *data, size_t length);
static bool verify_page_checksum(Page page, XLogRecPtr start_lsn,
BlockNumber blkno,
uint16 *expected_checksum);
static void sendFileWithContent(bbsink *sink, const char *filename,
const char *content, int len,
backup_manifest_info *manifest);
static int64 _tarWriteHeader(bbsink *sink, const char *filename,
const char *linktarget, struct stat *statbuf,
bool sizeonly);
static void _tarWritePadding(bbsink *sink, int len);
static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf);
static void perform_base_backup(basebackup_options *opt, bbsink *sink,
IncrementalBackupInfo *ib);
static void parse_basebackup_options(List *options, basebackup_options *opt);
static int compareWalFileNames(const ListCell *a, const ListCell *b);
static ssize_t basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
const char *filename, bool partial_read_ok);
126 :
/*
 * Was the backup currently in-progress initiated in recovery mode?  Set
 * from RecoveryInProgress() at the start of perform_base_backup().
 */
static bool backup_started_in_recovery = false;

/*
 * Total number of checksum failures during base backup.  Zeroed at the
 * start of perform_base_backup() and checked once the backup has been sent.
 */
static long long int total_checksum_failures;

/*
 * Do not verify checksums.  parse_basebackup_options() sets this to the
 * inverse of the "verify_checksums" option when that option is supplied.
 */
static bool noverify_checksums = false;
135 :
/*
 * One entry of an exclusion list, as used when deciding which paths to
 * leave out of checksum validation or of base backups.  "name" is the file
 * or path name to test.  If "match_prefix" is true, any item whose name
 * starts with "name" is excluded as well.
 */
struct exclude_list_item
{
	const char *name;			/* file or path name to exclude */
	bool match_prefix;			/* also exclude names that begin with "name" */
};
147 :
/*
 * The contents of these directories are removed or recreated during server
 * start so they are not included in backups. The directories themselves are
 * kept and included as empty to preserve access permissions.
 *
 * The array is terminated by a NULL entry.
 *
 * Note: this list should be kept in sync with the filter lists in pg_rewind's
 * filemap.c.
 */
static const char *const excludeDirContents[] =
{
	/*
	 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped
	 * because extensions like pg_stat_statements store data there.
	 */
	PG_STAT_TMP_DIR,

	/*
	 * It is generally not useful to backup the contents of this directory
	 * even if the intention is to restore to another primary. See backup.sgml
	 * for a more detailed description.
	 */
	PG_REPLSLOT_DIR,

	/* Contents removed on startup, see dsm_cleanup_for_mmap(). */
	PG_DYNSHMEM_DIR,

	/* Contents removed on startup, see AsyncShmemInit(). */
	"pg_notify",

	/*
	 * Old contents are loaded for possible debugging but are not required for
	 * normal operation, see SerialInit().
	 */
	"pg_serial",

	/* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
	"pg_snapshots",

	/* Contents zeroed on startup, see StartupSUBTRANS(). */
	"pg_subtrans",

	/* end of list */
	NULL
};
192 :
/*
 * List of files excluded from backups.
 *
 * Each entry is a {name, match_prefix} pair (see struct exclude_list_item);
 * the array is terminated by an entry whose name is NULL.
 */
static const struct exclude_list_item excludeFiles[] =
{
	/* Skip auto conf temporary file. */
	{PG_AUTOCONF_FILENAME ".tmp", false},

	/* Skip current log file temporary file */
	{LOG_METAINFO_DATAFILE_TMP, false},

	/*
	 * Skip relation cache because it is rebuilt on startup. This includes
	 * temporary files.
	 */
	{RELCACHE_INIT_FILENAME, true},

	/*
	 * backup_label and tablespace_map should not exist in a running cluster
	 * capable of doing an online backup, but exclude them just in case.
	 */
	{BACKUP_LABEL_FILE, false},
	{TABLESPACE_MAP, false},

	/*
	 * If there's a backup_manifest, it belongs to a backup that was used to
	 * start this server. It is *not* correct for this backup. Our
	 * backup_manifest is injected into the backup separately if users want
	 * it.
	 */
	{"backup_manifest", false},

	/*
	 * NOTE(review): no rationale is recorded for these two entries;
	 * presumably they describe only the running postmaster -- confirm.
	 */
	{"postmaster.pid", false},
	{"postmaster.opts", false},

	/* end of list */
	{NULL, false}
};
231 :
232 : /*
233 : * Actually do a base backup for the specified tablespaces.
234 : *
235 : * This is split out mainly to avoid complaints about "variable might be
236 : * clobbered by longjmp" from stupider versions of gcc.
237 : */
238 : static void
239 169 : perform_base_backup(basebackup_options *opt, bbsink *sink,
240 : IncrementalBackupInfo *ib)
241 : {
242 : bbsink_state state;
243 : XLogRecPtr endptr;
244 : TimeLineID endtli;
245 : backup_manifest_info manifest;
246 : BackupState *backup_state;
247 : StringInfoData tablespace_map;
248 :
249 : /* Initial backup state, insofar as we know it now. */
250 169 : state.tablespaces = NIL;
251 169 : state.tablespace_num = 0;
252 169 : state.bytes_done = 0;
253 169 : state.bytes_total = 0;
254 169 : state.bytes_total_is_valid = false;
255 :
256 : /* we're going to use a BufFile, so we need a ResourceOwner */
257 : Assert(AuxProcessResourceOwner != NULL);
258 : Assert(CurrentResourceOwner == AuxProcessResourceOwner ||
259 : CurrentResourceOwner == NULL);
260 169 : CurrentResourceOwner = AuxProcessResourceOwner;
261 :
262 169 : backup_started_in_recovery = RecoveryInProgress();
263 :
264 169 : InitializeBackupManifest(&manifest, opt->manifest,
265 : opt->manifest_checksum_type);
266 :
267 169 : total_checksum_failures = 0;
268 :
269 : /* Allocate backup related variables. */
270 169 : backup_state = palloc0_object(BackupState);
271 169 : initStringInfo(&tablespace_map);
272 :
273 169 : basebackup_progress_wait_checkpoint();
274 169 : do_pg_backup_start(opt->label, opt->fastcheckpoint, &state.tablespaces,
275 : backup_state, &tablespace_map);
276 :
277 169 : state.startptr = backup_state->startpoint;
278 169 : state.starttli = backup_state->starttli;
279 :
280 : /*
281 : * Once do_pg_backup_start has been called, ensure that any failure causes
282 : * us to abort the backup so we don't "leak" a backup counter. For this
283 : * reason, *all* functionality between do_pg_backup_start() and the end of
284 : * do_pg_backup_stop() should be inside the error cleanup block!
285 : */
286 :
287 169 : PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
288 : {
289 : ListCell *lc;
290 : tablespaceinfo *newti;
291 :
292 : /* If this is an incremental backup, execute preparatory steps. */
293 169 : if (ib != NULL)
294 11 : PrepareForIncrementalBackup(ib, backup_state);
295 :
296 : /* Add a node for the base directory at the end */
297 169 : newti = palloc0_object(tablespaceinfo);
298 169 : newti->size = -1;
299 169 : state.tablespaces = lappend(state.tablespaces, newti);
300 :
301 : /*
302 : * Calculate the total backup size by summing up the size of each
303 : * tablespace
304 : */
305 169 : if (opt->progress)
306 : {
307 169 : basebackup_progress_estimate_backup_size();
308 :
309 375 : foreach(lc, state.tablespaces)
310 : {
311 206 : tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);
312 :
313 206 : if (tmp->path == NULL)
314 169 : tmp->size = sendDir(sink, ".", 1, true, state.tablespaces,
315 : true, NULL, InvalidOid, NULL);
316 : else
317 37 : tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true,
318 : NULL, NULL);
319 206 : state.bytes_total += tmp->size;
320 : }
321 169 : state.bytes_total_is_valid = true;
322 : }
323 :
324 : /* notify basebackup sink about start of backup */
325 169 : bbsink_begin_backup(sink, &state, SINK_BUFFER_LENGTH);
326 :
327 : /* Send off our tablespaces one by one */
328 370 : foreach(lc, state.tablespaces)
329 : {
330 206 : tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
331 :
332 206 : if (ti->path == NULL)
333 : {
334 : struct stat statbuf;
335 169 : bool sendtblspclinks = true;
336 : char *backup_label;
337 :
338 169 : bbsink_begin_archive(sink, "base.tar");
339 :
340 : /* In the main tar, include the backup_label first... */
341 169 : backup_label = build_backup_content(backup_state, false);
342 169 : sendFileWithContent(sink, BACKUP_LABEL_FILE,
343 : backup_label, -1, &manifest);
344 169 : pfree(backup_label);
345 :
346 : /* Then the tablespace_map file, if required... */
347 169 : if (opt->sendtblspcmapfile)
348 : {
349 27 : sendFileWithContent(sink, TABLESPACE_MAP,
350 27 : tablespace_map.data, -1, &manifest);
351 27 : sendtblspclinks = false;
352 : }
353 :
354 : /* Then the bulk of the files... */
355 169 : sendDir(sink, ".", 1, false, state.tablespaces,
356 : sendtblspclinks, &manifest, InvalidOid, ib);
357 :
358 : /* ... and pg_control after everything else. */
359 164 : if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
360 0 : ereport(ERROR,
361 : (errcode_for_file_access(),
362 : errmsg("could not stat file \"%s\": %m",
363 : XLOG_CONTROL_FILE)));
364 164 : sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
365 : false, InvalidOid, InvalidOid,
366 : InvalidRelFileNumber, 0, &manifest, 0, NULL, 0);
367 : }
368 : else
369 : {
370 37 : char *archive_name = psprintf("%u.tar", ti->oid);
371 :
372 37 : bbsink_begin_archive(sink, archive_name);
373 :
374 37 : sendTablespace(sink, ti->path, ti->oid, false, &manifest, ib);
375 : }
376 :
377 : /*
378 : * If we're including WAL, and this is the main data directory we
379 : * don't treat this as the end of the tablespace. Instead, we will
380 : * include the xlog files below and stop afterwards. This is safe
381 : * since the main data directory is always sent *last*.
382 : */
383 201 : if (opt->includewal && ti->path == NULL)
384 : {
385 : Assert(lnext(state.tablespaces, lc) == NULL);
386 : }
387 : else
388 : {
389 : /* Properly terminate the tarfile. */
390 186 : memset(sink->bbs_buffer, 0, TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE);
391 186 : bbsink_archive_contents(sink, TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE);
392 :
393 : /* OK, that's the end of the archive. */
394 186 : bbsink_end_archive(sink);
395 : }
396 : }
397 :
398 164 : basebackup_progress_wait_wal_archive(&state);
399 164 : do_pg_backup_stop(backup_state, !opt->nowait);
400 :
401 164 : endptr = backup_state->stoppoint;
402 164 : endtli = backup_state->stoptli;
403 :
404 : /* Deallocate backup-related variables. */
405 164 : pfree(tablespace_map.data);
406 164 : pfree(backup_state);
407 : }
408 165 : PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
409 :
410 :
411 164 : if (opt->includewal)
412 : {
413 : /*
414 : * We've left the last tar file "open", so we can now append the
415 : * required WAL files to it.
416 : */
417 : char pathbuf[MAXPGPATH];
418 : XLogSegNo segno;
419 : XLogSegNo startsegno;
420 : XLogSegNo endsegno;
421 : struct stat statbuf;
422 15 : List *historyFileList = NIL;
423 15 : List *walFileList = NIL;
424 : char firstoff[MAXFNAMELEN];
425 : char lastoff[MAXFNAMELEN];
426 : DIR *dir;
427 : struct dirent *de;
428 : ListCell *lc;
429 : TimeLineID tli;
430 :
431 15 : basebackup_progress_transfer_wal();
432 :
433 : /*
434 : * I'd rather not worry about timelines here, so scan pg_wal and
435 : * include all WAL files in the range between 'startptr' and 'endptr',
436 : * regardless of the timeline the file is stamped with. If there are
437 : * some spurious WAL files belonging to timelines that don't belong in
438 : * this server's history, they will be included too. Normally there
439 : * shouldn't be such files, but if there are, there's little harm in
440 : * including them.
441 : */
442 15 : XLByteToSeg(state.startptr, startsegno, wal_segment_size);
443 15 : XLogFileName(firstoff, state.starttli, startsegno, wal_segment_size);
444 15 : XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
445 15 : XLogFileName(lastoff, endtli, endsegno, wal_segment_size);
446 :
447 15 : dir = AllocateDir("pg_wal");
448 108 : while ((de = ReadDir(dir, "pg_wal")) != NULL)
449 : {
450 : /* Does it look like a WAL segment, and is it in the range? */
451 93 : if (IsXLogFileName(de->d_name) &&
452 33 : strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
453 33 : strcmp(de->d_name + 8, lastoff + 8) <= 0)
454 : {
455 15 : walFileList = lappend(walFileList, pstrdup(de->d_name));
456 : }
457 : /* Does it look like a timeline history file? */
458 78 : else if (IsTLHistoryFileName(de->d_name))
459 : {
460 0 : historyFileList = lappend(historyFileList, pstrdup(de->d_name));
461 : }
462 : }
463 15 : FreeDir(dir);
464 :
465 : /*
466 : * Before we go any further, check that none of the WAL segments we
467 : * need were removed.
468 : */
469 15 : CheckXLogRemoved(startsegno, state.starttli);
470 :
471 : /*
472 : * Sort the WAL filenames. We want to send the files in order from
473 : * oldest to newest, to reduce the chance that a file is recycled
474 : * before we get a chance to send it over.
475 : */
476 15 : list_sort(walFileList, compareWalFileNames);
477 :
478 : /*
479 : * There must be at least one xlog file in the pg_wal directory, since
480 : * we are doing backup-including-xlog.
481 : */
482 15 : if (walFileList == NIL)
483 0 : ereport(ERROR,
484 : (errmsg("could not find any WAL files")));
485 :
486 : /*
487 : * Sanity check: the first and last segment should cover startptr and
488 : * endptr, with no gaps in between.
489 : */
490 15 : XLogFromFileName((char *) linitial(walFileList),
491 : &tli, &segno, wal_segment_size);
492 15 : if (segno != startsegno)
493 : {
494 : char startfname[MAXFNAMELEN];
495 :
496 0 : XLogFileName(startfname, state.starttli, startsegno,
497 : wal_segment_size);
498 0 : ereport(ERROR,
499 : (errmsg("could not find WAL file \"%s\"", startfname)));
500 : }
501 30 : foreach(lc, walFileList)
502 : {
503 15 : char *walFileName = (char *) lfirst(lc);
504 15 : XLogSegNo currsegno = segno;
505 15 : XLogSegNo nextsegno = segno + 1;
506 :
507 15 : XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
508 15 : if (!(nextsegno == segno || currsegno == segno))
509 : {
510 : char nextfname[MAXFNAMELEN];
511 :
512 0 : XLogFileName(nextfname, tli, nextsegno, wal_segment_size);
513 0 : ereport(ERROR,
514 : (errmsg("could not find WAL file \"%s\"", nextfname)));
515 : }
516 : }
517 15 : if (segno != endsegno)
518 : {
519 : char endfname[MAXFNAMELEN];
520 :
521 0 : XLogFileName(endfname, endtli, endsegno, wal_segment_size);
522 0 : ereport(ERROR,
523 : (errmsg("could not find WAL file \"%s\"", endfname)));
524 : }
525 :
526 : /* Ok, we have everything we need. Send the WAL files. */
527 30 : foreach(lc, walFileList)
528 : {
529 15 : char *walFileName = (char *) lfirst(lc);
530 : int fd;
531 : ssize_t cnt;
532 15 : pgoff_t len = 0;
533 :
534 15 : snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName);
535 15 : XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
536 :
537 15 : fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY);
538 15 : if (fd < 0)
539 : {
540 0 : int save_errno = errno;
541 :
542 : /*
543 : * Most likely reason for this is that the file was already
544 : * removed by a checkpoint, so check for that to get a better
545 : * error message.
546 : */
547 0 : CheckXLogRemoved(segno, tli);
548 :
549 0 : errno = save_errno;
550 0 : ereport(ERROR,
551 : (errcode_for_file_access(),
552 : errmsg("could not open file \"%s\": %m", pathbuf)));
553 : }
554 :
555 15 : if (fstat(fd, &statbuf) != 0)
556 0 : ereport(ERROR,
557 : (errcode_for_file_access(),
558 : errmsg("could not stat file \"%s\": %m",
559 : pathbuf)));
560 15 : if (statbuf.st_size != wal_segment_size)
561 : {
562 0 : CheckXLogRemoved(segno, tli);
563 0 : ereport(ERROR,
564 : (errcode_for_file_access(),
565 : errmsg("unexpected WAL file size \"%s\"", walFileName)));
566 : }
567 :
568 : /* send the WAL file itself */
569 15 : _tarWriteHeader(sink, pathbuf, NULL, &statbuf, false);
570 :
571 15 : while ((cnt = basebackup_read_file(fd, sink->bbs_buffer,
572 7680 : Min(sink->bbs_buffer_length,
573 : wal_segment_size - len),
574 7680 : len, pathbuf, true)) > 0)
575 : {
576 7680 : CheckXLogRemoved(segno, tli);
577 7680 : bbsink_archive_contents(sink, cnt);
578 :
579 7680 : len += cnt;
580 :
581 7680 : if (len == wal_segment_size)
582 15 : break;
583 : }
584 :
585 15 : if (len != wal_segment_size)
586 : {
587 0 : CheckXLogRemoved(segno, tli);
588 0 : ereport(ERROR,
589 : (errcode_for_file_access(),
590 : errmsg("unexpected WAL file size \"%s\"", walFileName)));
591 : }
592 :
593 : /*
594 : * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need
595 : * for padding.
596 : */
597 : Assert(wal_segment_size % TAR_BLOCK_SIZE == 0);
598 :
599 15 : CloseTransientFile(fd);
600 :
601 : /*
602 : * Mark file as archived, otherwise files can get archived again
603 : * after promotion of a new node. This is in line with
604 : * walreceiver.c always doing an XLogArchiveForceDone() after a
605 : * complete segment.
606 : */
607 15 : StatusFilePath(pathbuf, walFileName, ".done");
608 15 : sendFileWithContent(sink, pathbuf, "", -1, &manifest);
609 : }
610 :
611 : /*
612 : * Send timeline history files too. Only the latest timeline history
613 : * file is required for recovery, and even that only if there happens
614 : * to be a timeline switch in the first WAL segment that contains the
615 : * checkpoint record, or if we're taking a base backup from a standby
616 : * server and the target timeline changes while the backup is taken.
617 : * But they are small and highly useful for debugging purposes, so
618 : * better include them all, always.
619 : */
620 15 : foreach(lc, historyFileList)
621 : {
622 0 : char *fname = lfirst(lc);
623 :
624 0 : snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);
625 :
626 0 : if (lstat(pathbuf, &statbuf) != 0)
627 0 : ereport(ERROR,
628 : (errcode_for_file_access(),
629 : errmsg("could not stat file \"%s\": %m", pathbuf)));
630 :
631 0 : sendFile(sink, pathbuf, pathbuf, &statbuf, false,
632 : InvalidOid, InvalidOid, InvalidRelFileNumber, 0,
633 : &manifest, 0, NULL, 0);
634 :
635 : /* unconditionally mark file as archived */
636 0 : StatusFilePath(pathbuf, fname, ".done");
637 0 : sendFileWithContent(sink, pathbuf, "", -1, &manifest);
638 : }
639 :
640 : /* Properly terminate the tar file. */
641 15 : memset(sink->bbs_buffer, 0, TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE);
642 15 : bbsink_archive_contents(sink, TAR_NUM_TERMINATION_BLOCKS * TAR_BLOCK_SIZE);
643 :
644 : /* OK, that's the end of the archive. */
645 15 : bbsink_end_archive(sink);
646 : }
647 :
648 164 : AddWALInfoToBackupManifest(&manifest, state.startptr, state.starttli,
649 : endptr, endtli);
650 :
651 164 : SendBackupManifest(&manifest, sink);
652 :
653 164 : bbsink_end_backup(sink, endptr, endtli);
654 :
655 164 : if (total_checksum_failures)
656 : {
657 3 : if (total_checksum_failures > 1)
658 2 : ereport(WARNING,
659 : (errmsg_plural("%lld total checksum verification failure",
660 : "%lld total checksum verification failures",
661 : total_checksum_failures,
662 : total_checksum_failures)));
663 :
664 3 : ereport(ERROR,
665 : (errcode(ERRCODE_DATA_CORRUPTED),
666 : errmsg("checksum verification failure during base backup")));
667 : }
668 :
669 : /*
670 : * Make sure to free the manifest before the resource owners as manifests
671 : * use cryptohash contexts that may depend on resource owners (like
672 : * OpenSSL).
673 : */
674 161 : FreeBackupManifest(&manifest);
675 :
676 : /* clean up the resource owner we created */
677 161 : ReleaseAuxProcessResources(true);
678 :
679 161 : basebackup_progress_done();
680 161 : }
681 :
682 : /*
683 : * list_sort comparison function, to compare log/seg portion of WAL segment
684 : * filenames, ignoring the timeline portion.
685 : */
686 : static int
687 0 : compareWalFileNames(const ListCell *a, const ListCell *b)
688 : {
689 0 : char *fna = (char *) lfirst(a);
690 0 : char *fnb = (char *) lfirst(b);
691 :
692 0 : return strcmp(fna + 8, fnb + 8);
693 : }
694 :
695 : /*
696 : * Parse the base backup options passed down by the parser
697 : */
698 : static void
699 186 : parse_basebackup_options(List *options, basebackup_options *opt)
700 : {
701 : ListCell *lopt;
702 186 : bool o_label = false;
703 186 : bool o_progress = false;
704 186 : bool o_checkpoint = false;
705 186 : bool o_nowait = false;
706 186 : bool o_wal = false;
707 186 : bool o_incremental = false;
708 186 : bool o_maxrate = false;
709 186 : bool o_tablespace_map = false;
710 186 : bool o_noverify_checksums = false;
711 186 : bool o_manifest = false;
712 186 : bool o_manifest_checksums = false;
713 186 : bool o_target = false;
714 186 : bool o_target_detail = false;
715 186 : char *target_str = NULL;
716 186 : char *target_detail_str = NULL;
717 186 : bool o_compression = false;
718 186 : bool o_compression_detail = false;
719 186 : char *compression_detail_str = NULL;
720 :
721 2046 : MemSet(opt, 0, sizeof(*opt));
722 186 : opt->manifest = MANIFEST_OPTION_NO;
723 186 : opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C;
724 186 : opt->compression = PG_COMPRESSION_NONE;
725 186 : opt->compression_specification.algorithm = PG_COMPRESSION_NONE;
726 :
727 1402 : foreach(lopt, options)
728 : {
729 1219 : DefElem *defel = (DefElem *) lfirst(lopt);
730 :
731 1219 : if (strcmp(defel->defname, "label") == 0)
732 : {
733 186 : if (o_label)
734 0 : ereport(ERROR,
735 : (errcode(ERRCODE_SYNTAX_ERROR),
736 : errmsg("duplicate option \"%s\"", defel->defname)));
737 186 : opt->label = defGetString(defel);
738 186 : o_label = true;
739 : }
740 1033 : else if (strcmp(defel->defname, "progress") == 0)
741 : {
742 186 : if (o_progress)
743 0 : ereport(ERROR,
744 : (errcode(ERRCODE_SYNTAX_ERROR),
745 : errmsg("duplicate option \"%s\"", defel->defname)));
746 186 : opt->progress = defGetBoolean(defel);
747 186 : o_progress = true;
748 : }
749 847 : else if (strcmp(defel->defname, "checkpoint") == 0)
750 : {
751 176 : char *optval = defGetString(defel);
752 :
753 176 : if (o_checkpoint)
754 0 : ereport(ERROR,
755 : (errcode(ERRCODE_SYNTAX_ERROR),
756 : errmsg("duplicate option \"%s\"", defel->defname)));
757 176 : if (pg_strcasecmp(optval, "fast") == 0)
758 176 : opt->fastcheckpoint = true;
759 0 : else if (pg_strcasecmp(optval, "spread") == 0)
760 0 : opt->fastcheckpoint = false;
761 : else
762 0 : ereport(ERROR,
763 : (errcode(ERRCODE_SYNTAX_ERROR),
764 : errmsg("unrecognized checkpoint type: \"%s\"",
765 : optval)));
766 176 : o_checkpoint = true;
767 : }
768 671 : else if (strcmp(defel->defname, "wait") == 0)
769 : {
770 177 : if (o_nowait)
771 0 : ereport(ERROR,
772 : (errcode(ERRCODE_SYNTAX_ERROR),
773 : errmsg("duplicate option \"%s\"", defel->defname)));
774 177 : opt->nowait = !defGetBoolean(defel);
775 177 : o_nowait = true;
776 : }
777 494 : else if (strcmp(defel->defname, "wal") == 0)
778 : {
779 19 : if (o_wal)
780 0 : ereport(ERROR,
781 : (errcode(ERRCODE_SYNTAX_ERROR),
782 : errmsg("duplicate option \"%s\"", defel->defname)));
783 19 : opt->includewal = defGetBoolean(defel);
784 19 : o_wal = true;
785 : }
786 475 : else if (strcmp(defel->defname, "incremental") == 0)
787 : {
788 11 : if (o_incremental)
789 0 : ereport(ERROR,
790 : (errcode(ERRCODE_SYNTAX_ERROR),
791 : errmsg("duplicate option \"%s\"", defel->defname)));
792 11 : opt->incremental = defGetBoolean(defel);
793 11 : if (opt->incremental && !summarize_wal)
794 0 : ereport(ERROR,
795 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
796 : errmsg("incremental backups cannot be taken unless WAL summarization is enabled")));
797 11 : o_incremental = true;
798 : }
799 464 : else if (strcmp(defel->defname, "max_rate") == 0)
800 : {
801 : int64 maxrate;
802 :
803 1 : if (o_maxrate)
804 0 : ereport(ERROR,
805 : (errcode(ERRCODE_SYNTAX_ERROR),
806 : errmsg("duplicate option \"%s\"", defel->defname)));
807 :
808 1 : maxrate = defGetInt64(defel);
809 1 : if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
810 0 : ereport(ERROR,
811 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
812 : errmsg("%" PRId64 " is outside the valid range for parameter \"%s\" (%d .. %d)",
813 : maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));
814 :
815 1 : opt->maxrate = (uint32) maxrate;
816 1 : o_maxrate = true;
817 : }
818 463 : else if (strcmp(defel->defname, "tablespace_map") == 0)
819 : {
820 33 : if (o_tablespace_map)
821 0 : ereport(ERROR,
822 : (errcode(ERRCODE_SYNTAX_ERROR),
823 : errmsg("duplicate option \"%s\"", defel->defname)));
824 33 : opt->sendtblspcmapfile = defGetBoolean(defel);
825 33 : o_tablespace_map = true;
826 : }
827 430 : else if (strcmp(defel->defname, "verify_checksums") == 0)
828 : {
829 1 : if (o_noverify_checksums)
830 0 : ereport(ERROR,
831 : (errcode(ERRCODE_SYNTAX_ERROR),
832 : errmsg("duplicate option \"%s\"", defel->defname)));
833 1 : noverify_checksums = !defGetBoolean(defel);
834 1 : o_noverify_checksums = true;
835 : }
836 429 : else if (strcmp(defel->defname, "manifest") == 0)
837 : {
838 185 : char *optval = defGetString(defel);
839 : bool manifest_bool;
840 :
841 185 : if (o_manifest)
842 0 : ereport(ERROR,
843 : (errcode(ERRCODE_SYNTAX_ERROR),
844 : errmsg("duplicate option \"%s\"", defel->defname)));
845 185 : if (parse_bool(optval, &manifest_bool))
846 : {
847 184 : if (manifest_bool)
848 184 : opt->manifest = MANIFEST_OPTION_YES;
849 : else
850 0 : opt->manifest = MANIFEST_OPTION_NO;
851 : }
852 1 : else if (pg_strcasecmp(optval, "force-encode") == 0)
853 1 : opt->manifest = MANIFEST_OPTION_FORCE_ENCODE;
854 : else
855 0 : ereport(ERROR,
856 : (errcode(ERRCODE_SYNTAX_ERROR),
857 : errmsg("unrecognized manifest option: \"%s\"",
858 : optval)));
859 185 : o_manifest = true;
860 : }
861 244 : else if (strcmp(defel->defname, "manifest_checksums") == 0)
862 : {
863 14 : char *optval = defGetString(defel);
864 :
865 14 : if (o_manifest_checksums)
866 0 : ereport(ERROR,
867 : (errcode(ERRCODE_SYNTAX_ERROR),
868 : errmsg("duplicate option \"%s\"", defel->defname)));
869 14 : if (!pg_checksum_parse_type(optval,
870 : &opt->manifest_checksum_type))
871 2 : ereport(ERROR,
872 : (errcode(ERRCODE_SYNTAX_ERROR),
873 : errmsg("unrecognized checksum algorithm: \"%s\"",
874 : optval)));
875 12 : o_manifest_checksums = true;
876 : }
877 230 : else if (strcmp(defel->defname, "target") == 0)
878 : {
879 184 : if (o_target)
880 0 : ereport(ERROR,
881 : (errcode(ERRCODE_SYNTAX_ERROR),
882 : errmsg("duplicate option \"%s\"", defel->defname)));
883 184 : target_str = defGetString(defel);
884 184 : o_target = true;
885 : }
886 46 : else if (strcmp(defel->defname, "target_detail") == 0)
887 : {
888 8 : char *optval = defGetString(defel);
889 :
890 8 : if (o_target_detail)
891 0 : ereport(ERROR,
892 : (errcode(ERRCODE_SYNTAX_ERROR),
893 : errmsg("duplicate option \"%s\"", defel->defname)));
894 8 : target_detail_str = optval;
895 8 : o_target_detail = true;
896 : }
897 38 : else if (strcmp(defel->defname, "compression") == 0)
898 : {
899 26 : char *optval = defGetString(defel);
900 :
901 26 : if (o_compression)
902 0 : ereport(ERROR,
903 : (errcode(ERRCODE_SYNTAX_ERROR),
904 : errmsg("duplicate option \"%s\"", defel->defname)));
905 26 : if (!parse_compress_algorithm(optval, &opt->compression))
906 1 : ereport(ERROR,
907 : (errcode(ERRCODE_SYNTAX_ERROR),
908 : errmsg("unrecognized compression algorithm: \"%s\"",
909 : optval)));
910 25 : o_compression = true;
911 : }
912 12 : else if (strcmp(defel->defname, "compression_detail") == 0)
913 : {
914 12 : if (o_compression_detail)
915 0 : ereport(ERROR,
916 : (errcode(ERRCODE_SYNTAX_ERROR),
917 : errmsg("duplicate option \"%s\"", defel->defname)));
918 12 : compression_detail_str = defGetString(defel);
919 12 : o_compression_detail = true;
920 : }
921 : else
922 0 : ereport(ERROR,
923 : (errcode(ERRCODE_SYNTAX_ERROR),
924 : errmsg("unrecognized base backup option: \"%s\"",
925 : defel->defname)));
926 : }
927 :
928 183 : if (opt->label == NULL)
929 0 : opt->label = "base backup";
930 183 : if (opt->manifest == MANIFEST_OPTION_NO)
931 : {
932 1 : if (o_manifest_checksums)
933 0 : ereport(ERROR,
934 : (errcode(ERRCODE_SYNTAX_ERROR),
935 : errmsg("manifest checksums require a backup manifest")));
936 1 : opt->manifest_checksum_type = CHECKSUM_TYPE_NONE;
937 : }
938 :
939 183 : if (target_str == NULL)
940 : {
941 0 : if (target_detail_str != NULL)
942 0 : ereport(ERROR,
943 : (errcode(ERRCODE_SYNTAX_ERROR),
944 : errmsg("target detail cannot be used without target")));
945 0 : opt->use_copytblspc = true;
946 0 : opt->send_to_client = true;
947 : }
948 183 : else if (strcmp(target_str, "client") == 0)
949 : {
950 169 : if (target_detail_str != NULL)
951 0 : ereport(ERROR,
952 : (errcode(ERRCODE_SYNTAX_ERROR),
953 : errmsg("target \"%s\" does not accept a target detail",
954 : target_str)));
955 169 : opt->send_to_client = true;
956 : }
957 : else
958 12 : opt->target_handle =
959 14 : BaseBackupGetTargetHandle(target_str, target_detail_str);
960 :
961 181 : if (o_compression_detail && !o_compression)
962 0 : ereport(ERROR,
963 : (errcode(ERRCODE_SYNTAX_ERROR),
964 : errmsg("compression detail cannot be specified unless compression is enabled")));
965 :
966 181 : if (o_compression)
967 : {
968 : char *error_detail;
969 :
970 23 : parse_compress_specification(opt->compression, compression_detail_str,
971 : &opt->compression_specification);
972 : error_detail =
973 23 : validate_compress_specification(&opt->compression_specification);
974 23 : if (error_detail != NULL)
975 9 : ereport(ERROR,
976 : errcode(ERRCODE_SYNTAX_ERROR),
977 : errmsg("invalid compression specification: %s",
978 : error_detail));
979 : }
980 172 : }
981 :
982 :
983 : /*
984 : * SendBaseBackup() - send a complete base backup.
985 : *
986 : * The function will put the system into backup mode like pg_backup_start()
987 : * does, so that the backup is consistent even though we read directly from
988 : * the filesystem, bypassing the buffer cache.
989 : */
990 : void
991 187 : SendBaseBackup(BaseBackupCmd *cmd, IncrementalBackupInfo *ib)
992 : {
993 : basebackup_options opt;
994 : bbsink *sink;
995 187 : SessionBackupState status = get_backup_status();
996 :
997 187 : if (status == SESSION_BACKUP_RUNNING)
998 1 : ereport(ERROR,
999 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1000 : errmsg("a backup is already in progress in this session")));
1001 :
1002 186 : parse_basebackup_options(cmd->options, &opt);
1003 :
1004 172 : WalSndSetState(WALSNDSTATE_BACKUP);
1005 :
1006 172 : if (update_process_title)
1007 : {
1008 : char activitymsg[50];
1009 :
1010 172 : snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
1011 : opt.label);
1012 172 : set_ps_display(activitymsg);
1013 : }
1014 :
1015 : /*
1016 : * If we're asked to perform an incremental backup and the user has not
1017 : * supplied a manifest, that's an ERROR.
1018 : *
1019 : * If we're asked to perform a full backup and the user did supply a
1020 : * manifest, just ignore it.
1021 : */
1022 172 : if (!opt.incremental)
1023 161 : ib = NULL;
1024 11 : else if (ib == NULL)
1025 0 : ereport(ERROR,
1026 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1027 : errmsg("must UPLOAD_MANIFEST before performing an incremental BASE_BACKUP")));
1028 :
1029 : /*
1030 : * If the target is specifically 'client' then set up to stream the backup
1031 : * to the client; otherwise, it's being sent someplace else and should not
1032 : * be sent to the client. BaseBackupGetSink has the job of setting up a
1033 : * sink to send the backup data wherever it needs to go.
1034 : */
1035 172 : sink = bbsink_copystream_new(opt.send_to_client);
1036 172 : if (opt.target_handle != NULL)
1037 12 : sink = BaseBackupGetSink(opt.target_handle, sink);
1038 :
1039 : /* Set up network throttling, if client requested it */
1040 169 : if (opt.maxrate > 0)
1041 1 : sink = bbsink_throttle_new(sink, opt.maxrate);
1042 :
1043 : /* Set up server-side compression, if client requested it */
1044 169 : if (opt.compression == PG_COMPRESSION_GZIP)
1045 2 : sink = bbsink_gzip_new(sink, &opt.compression_specification);
1046 167 : else if (opt.compression == PG_COMPRESSION_LZ4)
1047 3 : sink = bbsink_lz4_new(sink, &opt.compression_specification);
1048 164 : else if (opt.compression == PG_COMPRESSION_ZSTD)
1049 0 : sink = bbsink_zstd_new(sink, &opt.compression_specification);
1050 :
1051 : /* Set up progress reporting. */
1052 169 : sink = bbsink_progress_new(sink, opt.progress, opt.incremental);
1053 :
1054 : /*
1055 : * Perform the base backup, but make sure we clean up the bbsink even if
1056 : * an error occurs.
1057 : */
1058 169 : PG_TRY();
1059 : {
1060 169 : perform_base_backup(&opt, sink, ib);
1061 : }
1062 4 : PG_FINALLY();
1063 : {
1064 165 : bbsink_cleanup(sink);
1065 : }
1066 165 : PG_END_TRY();
1067 161 : }
1068 :
1069 : /*
1070 : * Inject a file with given name and content in the output tar stream.
1071 : *
1072 : * "len" can optionally be set to an arbitrary length of data sent. If set
1073 : * to -1, the content sent is treated as a string with strlen() as length.
1074 : */
1075 : static void
1076 211 : sendFileWithContent(bbsink *sink, const char *filename, const char *content,
1077 : int len, backup_manifest_info *manifest)
1078 : {
1079 : struct stat statbuf;
1080 211 : int bytes_done = 0;
1081 : pg_checksum_context checksum_ctx;
1082 :
1083 211 : if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1084 0 : elog(ERROR, "could not initialize checksum of file \"%s\"",
1085 : filename);
1086 :
1087 211 : if (len < 0)
1088 211 : len = strlen(content);
1089 :
1090 : /*
1091 : * Construct a stat struct for the file we're injecting in the tar.
1092 : */
1093 :
1094 : /* Windows doesn't have the concept of uid and gid */
1095 : #ifdef WIN32
1096 : statbuf.st_uid = 0;
1097 : statbuf.st_gid = 0;
1098 : #else
1099 211 : statbuf.st_uid = geteuid();
1100 211 : statbuf.st_gid = getegid();
1101 : #endif
1102 211 : statbuf.st_mtime = time(NULL);
1103 211 : statbuf.st_mode = pg_file_create_mode;
1104 211 : statbuf.st_size = len;
1105 :
1106 211 : _tarWriteHeader(sink, filename, NULL, &statbuf, false);
1107 :
1108 211 : if (pg_checksum_update(&checksum_ctx, (const uint8 *) content, len) < 0)
1109 0 : elog(ERROR, "could not update checksum of file \"%s\"",
1110 : filename);
1111 :
1112 386 : while (bytes_done < len)
1113 : {
1114 175 : size_t remaining = len - bytes_done;
1115 175 : size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1116 :
1117 175 : memcpy(sink->bbs_buffer, content, nbytes);
1118 175 : bbsink_archive_contents(sink, nbytes);
1119 175 : bytes_done += nbytes;
1120 175 : content += nbytes;
1121 : }
1122 :
1123 211 : _tarWritePadding(sink, len);
1124 :
1125 211 : AddFileToBackupManifest(manifest, InvalidOid, filename, len,
1126 211 : (pg_time_t) statbuf.st_mtime, &checksum_ctx);
1127 211 : }
1128 :
1129 : /*
1130 : * Include the tablespace directory pointed to by 'path' in the output tar
1131 : * stream. If 'sizeonly' is true, we just calculate a total length and return
1132 : * it, without actually sending anything.
1133 : *
1134 : * Only used to send auxiliary tablespaces, not PGDATA.
1135 : */
1136 : static int64
1137 74 : sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly,
1138 : backup_manifest_info *manifest, IncrementalBackupInfo *ib)
1139 : {
1140 : int64 size;
1141 : char pathbuf[MAXPGPATH];
1142 : struct stat statbuf;
1143 :
1144 : /*
1145 : * 'path' points to the tablespace location, but we only want to include
1146 : * the version directory in it that belongs to us.
1147 : */
1148 74 : snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
1149 : TABLESPACE_VERSION_DIRECTORY);
1150 :
1151 : /*
1152 : * Store a directory entry in the tar file so we get the permissions
1153 : * right.
1154 : */
1155 74 : if (lstat(pathbuf, &statbuf) != 0)
1156 : {
1157 0 : if (errno != ENOENT)
1158 0 : ereport(ERROR,
1159 : (errcode_for_file_access(),
1160 : errmsg("could not stat file or directory \"%s\": %m",
1161 : pathbuf)));
1162 :
1163 : /* If the tablespace went away while scanning, it's no error. */
1164 0 : return 0;
1165 : }
1166 :
1167 74 : size = _tarWriteHeader(sink, TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
1168 : sizeonly);
1169 :
1170 : /* Send all the files in the tablespace version directory */
1171 74 : size += sendDir(sink, pathbuf, strlen(path), sizeonly, NIL, true, manifest,
1172 : spcoid, ib);
1173 :
1174 74 : return size;
1175 : }
1176 :
/*
 * Include all files from the given directory in the output tar stream. If
 * 'sizeonly' is true, we just calculate a total length and return it, without
 * actually sending anything.
 *
 * Omit any directory in the tablespaces list, to avoid backing up
 * tablespaces twice when they were created inside PGDATA.
 *
 * If sendtblspclinks is true, we need to include symlink
 * information in the tar file. If not, we can skip that
 * as it will be sent separately in the tablespace_map file.
 *
 * 'path' is the directory to scan.  Tar member names are formed by skipping
 * the first 'basepathlen' + 1 characters of each entry's full path.
 *
 * 'spcoid' is handed through to sendFile() and to recursive calls; when it
 * is a valid OID it is also used to build the lookup path for incremental
 * backup decisions.
 *
 * 'ib' is NULL unless this is an incremental backup.  When it is non-NULL,
 * relation files are passed to GetFileBackupMethod() to decide whether they
 * are sent in full or as an INCREMENTAL.* file.
 *
 * The return value is the sum of what the _tarWriteHeader() calls and the
 * recursive sendDir() calls returned, plus, for every regular file that was
 * sent (or would be sent, when 'sizeonly'), its size, tar padding and one
 * TAR_BLOCK_SIZE header.
 */
static int64
sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
		List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest,
		Oid spcoid, IncrementalBackupInfo *ib)
{
	DIR		   *dir;
	struct dirent *de;
	char		pathbuf[MAXPGPATH * 2];
	struct stat statbuf;
	int64		size = 0;
	const char *lastDir;		/* Split last dir from parent path. */
	bool		isRelationDir = false;	/* Does directory contain relations? */
	bool		isGlobalDir = false;	/* Is this exactly "./global"? */
	Oid			dboid = InvalidOid; /* set only for per-database dirs */
	BlockNumber *relative_block_numbers = NULL;

	/*
	 * Since this array is relatively large, avoid putting it on the stack.
	 * But we don't need it at all if this is not an incremental backup.
	 */
	if (ib != NULL)
		relative_block_numbers = palloc_array(BlockNumber, RELSEG_SIZE);

	/*
	 * Determine if the current path is a database directory that can contain
	 * relations.
	 *
	 * Start by finding the location of the delimiter between the parent path
	 * and the current path.
	 */
	lastDir = last_dir_separator(path);

	/* Does this path look like a database path (i.e. all digits)? */
	if (lastDir != NULL &&
		strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
	{
		/* Part of path that contains the parent directory. */
		int			parentPathLen = lastDir - path;

		/*
		 * Mark path as a database directory if the parent path is either
		 * $PGDATA/base or a tablespace version path.
		 */
		if (strncmp(path, "./base", parentPathLen) == 0 ||
			(parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
			 strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
					 TABLESPACE_VERSION_DIRECTORY,
					 sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
		{
			isRelationDir = true;
			dboid = atooid(lastDir + 1);
		}
	}
	else if (strcmp(path, "./global") == 0)
	{
		/* Relations live here too, but dboid stays InvalidOid. */
		isRelationDir = true;
		isGlobalDir = true;
	}

	dir = AllocateDir(path);
	while ((de = ReadDir(dir, path)) != NULL)
	{
		int			excludeIdx;
		bool		excludeFound;
		RelFileNumber relfilenumber = InvalidRelFileNumber;
		ForkNumber	relForkNum = InvalidForkNumber;
		unsigned	segno = 0;
		bool		isRelationFile = false;

		/* Skip special stuff */
		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
			continue;

		/* Skip temporary files */
		if (strncmp(de->d_name,
					PG_TEMP_FILE_PREFIX,
					strlen(PG_TEMP_FILE_PREFIX)) == 0)
			continue;

		/* Skip macOS system files */
		if (strcmp(de->d_name, ".DS_Store") == 0)
			continue;

		/*
		 * Check if the postmaster has signaled us to exit, and abort with an
		 * error in that case. The error handler further up will call
		 * do_pg_abort_backup() for us. Also check that if the backup was
		 * started while still in recovery, the server wasn't promoted.
		 * do_pg_backup_stop() will check that too, but it's better to stop
		 * the backup early than continue to the end and fail there.
		 */
		CHECK_FOR_INTERRUPTS();
		if (RecoveryInProgress() != backup_started_in_recovery)
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("the standby was promoted during online backup"),
					 errhint("This means that the backup being taken is corrupt "
							 "and should not be used. "
							 "Try taking another online backup.")));

		/* Scan for files that should be excluded */
		excludeFound = false;
		for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++)
		{
			int			cmplen = strlen(excludeFiles[excludeIdx].name);

			/*
			 * For a non-prefix entry, also compare the terminating NUL so
			 * that only an exact name match counts.
			 */
			if (!excludeFiles[excludeIdx].match_prefix)
				cmplen++;
			if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0)
			{
				elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
				excludeFound = true;
				break;
			}
		}

		if (excludeFound)
			continue;

		/*
		 * If there could be non-temporary relation files in this directory,
		 * try to parse the filename.
		 */
		if (isRelationDir)
			isRelationFile =
				parse_filename_for_nontemp_relation(de->d_name,
													&relfilenumber,
													&relForkNum, &segno);

		/* Exclude all forks for unlogged tables except the init fork */
		if (isRelationFile && relForkNum != INIT_FORKNUM)
		{
			char		initForkFile[MAXPGPATH];

			/*
			 * If any other type of fork, check if there is an init fork with
			 * the same RelFileNumber. If so, the file can be excluded.
			 */
			snprintf(initForkFile, sizeof(initForkFile), "%s/%u_init",
					 path, relfilenumber);

			if (lstat(initForkFile, &statbuf) == 0)
			{
				elog(DEBUG2,
					 "unlogged relation file \"%s\" excluded from backup",
					 de->d_name);

				continue;
			}
		}

		/* Exclude temporary relations */
		if (OidIsValid(dboid) && looks_like_temp_rel_name(de->d_name))
		{
			elog(DEBUG2,
				 "temporary relation file \"%s\" excluded from backup",
				 de->d_name);

			continue;
		}

		/* From here on, pathbuf holds the entry's full path. */
		snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);

		/* Skip pg_control here to back up it last */
		if (strcmp(pathbuf, "./" XLOG_CONTROL_FILE) == 0)
			continue;

		if (lstat(pathbuf, &statbuf) != 0)
		{
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file or directory \"%s\": %m",
								pathbuf)));

			/* If the file went away while scanning, it's not an error. */
			continue;
		}

		/*
		 * Scan for directories whose contents should be excluded.  Only a
		 * tar header for the entry itself is written; its contents are not
		 * visited.
		 */
		excludeFound = false;
		for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
		{
			if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
			{
				elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
				convert_link_to_directory(pathbuf, &statbuf);
				size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
										&statbuf, sizeonly);
				excludeFound = true;
				break;
			}
		}

		if (excludeFound)
			continue;

		/*
		 * We can skip pg_wal, the WAL segments need to be fetched from the
		 * WAL archive anyway. But include it as an empty directory anyway, so
		 * we get permissions right.
		 */
		if (strcmp(pathbuf, "./pg_wal") == 0)
		{
			/* If pg_wal is a symlink, write it as a directory anyway */
			convert_link_to_directory(pathbuf, &statbuf);
			size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
									&statbuf, sizeonly);

			/*
			 * Also send archive_status and summaries directories (by
			 * hackishly reusing statbuf from above ...).
			 */
			size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL,
									&statbuf, sizeonly);
			size += _tarWriteHeader(sink, "./pg_wal/summaries", NULL,
									&statbuf, sizeonly);

			continue;			/* don't recurse into pg_wal */
		}

		/* Allow symbolic links in pg_tblspc only */
		if (strcmp(path, "./pg_tblspc") == 0 && S_ISLNK(statbuf.st_mode))
		{
			char		linkpath[MAXPGPATH];
			int			rllen;

			rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
			if (rllen < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read symbolic link \"%s\": %m",
								pathbuf)));
			if (rllen >= sizeof(linkpath))
				ereport(ERROR,
						(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
						 errmsg("symbolic link \"%s\" target is too long",
								pathbuf)));
			/* readlink() does not NUL-terminate its result. */
			linkpath[rllen] = '\0';

			size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, linkpath,
									&statbuf, sizeonly);
		}
		else if (S_ISDIR(statbuf.st_mode))
		{
			bool		skip_this_dir = false;
			ListCell   *lc;

			/*
			 * Store a directory entry in the tar file so we can get the
			 * permissions right.
			 */
			size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL, &statbuf,
									sizeonly);

			/*
			 * Call ourselves recursively for a directory, unless it happens
			 * to be a separate tablespace located within PGDATA.
			 */
			foreach(lc, tablespaces)
			{
				tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);

				/*
				 * ti->rpath is the tablespace relative path within PGDATA, or
				 * NULL if the tablespace has been properly located somewhere
				 * else.
				 *
				 * Skip past the leading "./" in pathbuf when comparing.
				 */
				if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
				{
					skip_this_dir = true;
					break;
				}
			}

			/*
			 * skip sending directories inside pg_tblspc, if not required.
			 */
			if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
				skip_this_dir = true;

			if (!skip_this_dir)
				size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces,
								sendtblspclinks, manifest, spcoid, ib);
		}
		else if (S_ISREG(statbuf.st_mode))
		{
			bool		sent = false;
			unsigned	num_blocks_required = 0;
			unsigned	truncation_block_length = 0;
			char		tarfilenamebuf[MAXPGPATH * 2];
			char	   *tarfilename = pathbuf + basepathlen + 1;
			FileBackupMethod method = BACK_UP_FILE_FULLY;

			/*
			 * For an incremental backup, ask whether this relation file must
			 * be sent in full or can be sent incrementally.  Files that are
			 * not relation files keep the BACK_UP_FILE_FULLY default.
			 */
			if (ib != NULL && isRelationFile)
			{
				Oid			relspcoid;
				char	   *lookup_path;

				if (OidIsValid(spcoid))
				{
					relspcoid = spcoid;
					lookup_path = psprintf("%s/%u/%s", PG_TBLSPC_DIR, spcoid,
										   tarfilename);
				}
				else
				{
					if (isGlobalDir)
						relspcoid = GLOBALTABLESPACE_OID;
					else
						relspcoid = DEFAULTTABLESPACE_OID;
					lookup_path = pstrdup(tarfilename);
				}

				method = GetFileBackupMethod(ib, lookup_path, dboid, relspcoid,
											 relfilenumber, relForkNum,
											 segno, statbuf.st_size,
											 &num_blocks_required,
											 relative_block_numbers,
											 &truncation_block_length);
				if (method == BACK_UP_FILE_INCREMENTALLY)
				{
					/*
					 * The tar member is the incremental file, not the
					 * relation file itself: overwrite the size recorded in
					 * statbuf and use the INCREMENTAL.-prefixed member name.
					 */
					statbuf.st_size =
						GetIncrementalFileSize(num_blocks_required);
					snprintf(tarfilenamebuf, sizeof(tarfilenamebuf),
							 "%s/INCREMENTAL.%s",
							 path + basepathlen + 1,
							 de->d_name);
					tarfilename = tarfilenamebuf;
				}

				pfree(lookup_path);
			}

			if (!sizeonly)
				sent = sendFile(sink, pathbuf, tarfilename, &statbuf,
								true, dboid, spcoid,
								relfilenumber, segno, manifest,
								num_blocks_required,
								method == BACK_UP_FILE_INCREMENTALLY ? relative_block_numbers : NULL,
								truncation_block_length);

			/*
			 * sendFile() is called with missing_ok = true, so 'sent' stays
			 * false for a file that vanished; such a file adds nothing to
			 * the size.
			 */
			if (sent || sizeonly)
			{
				/* Add size. */
				size += statbuf.st_size;

				/* Pad to a multiple of the tar block size. */
				size += tarPaddingBytesRequired(statbuf.st_size);

				/* Size of the header for the file. */
				size += TAR_BLOCK_SIZE;
			}
		}
		else
			ereport(WARNING,
					(errmsg("skipping special file \"%s\"", pathbuf)));
	}

	if (relative_block_numbers != NULL)
		pfree(relative_block_numbers);

	FreeDir(dir);
	return size;
}
1556 :
1557 : /*
1558 : * Given the member, write the TAR header & send the file.
1559 : *
1560 : * If 'missing_ok' is true, will not throw an error if the file is not found.
1561 : *
1562 : * If dboid is anything other than InvalidOid then any checksum failures
1563 : * detected will get reported to the cumulative stats system.
1564 : *
1565 : * If the file is to be sent incrementally, then num_incremental_blocks
1566 : * should be the number of blocks to be sent, and incremental_blocks
1567 : * an array of block numbers relative to the start of the current segment.
1568 : * If the whole file is to be sent, then incremental_blocks should be NULL,
1569 : * and num_incremental_blocks can have any value, as it will be ignored.
1570 : *
1571 : * Returns true if the file was successfully sent, false if 'missing_ok',
1572 : * and the file did not exist.
1573 : */
1574 : static bool
1575 164397 : sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1576 : struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid,
1577 : RelFileNumber relfilenumber, unsigned segno,
1578 : backup_manifest_info *manifest, unsigned num_incremental_blocks,
1579 : BlockNumber *incremental_blocks, unsigned truncation_block_length)
1580 : {
1581 : int fd;
1582 164397 : BlockNumber blkno = 0;
1583 164397 : int checksum_failures = 0;
1584 : off_t cnt;
1585 164397 : pgoff_t bytes_done = 0;
1586 164397 : bool verify_checksum = false;
1587 : pg_checksum_context checksum_ctx;
1588 164397 : int ibindex = 0;
1589 :
1590 164397 : if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1591 0 : elog(ERROR, "could not initialize checksum of file \"%s\"",
1592 : readfilename);
1593 :
1594 164397 : fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
1595 164397 : if (fd < 0)
1596 : {
1597 0 : if (errno == ENOENT && missing_ok)
1598 0 : return false;
1599 0 : ereport(ERROR,
1600 : (errcode_for_file_access(),
1601 : errmsg("could not open file \"%s\": %m", readfilename)));
1602 : }
1603 :
1604 164397 : _tarWriteHeader(sink, tarfilename, NULL, statbuf, false);
1605 :
1606 : /*
1607 : * Checksums are verified in multiples of BLCKSZ, so the buffer length
1608 : * should be a multiple of the block size as well.
1609 : */
1610 : Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
1611 :
1612 : /*
1613 : * If we weren't told not to verify checksums, and if checksums are
1614 : * enabled for this cluster, and if this is a relation file, then verify
1615 : * the checksum.
1616 : */
1617 164396 : if (!noverify_checksums && DataChecksumsEnabled() &&
1618 : RelFileNumberIsValid(relfilenumber))
1619 160522 : verify_checksum = true;
1620 :
1621 : /*
1622 : * If we're sending an incremental file, write the file header.
1623 : */
1624 164396 : if (incremental_blocks != NULL)
1625 : {
1626 7327 : unsigned magic = INCREMENTAL_MAGIC;
1627 7327 : size_t header_bytes_done = 0;
1628 : char padding[BLCKSZ];
1629 : size_t paddinglen;
1630 :
1631 : /* Emit header data. */
1632 7327 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1633 : &magic, sizeof(magic));
1634 7327 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1635 : &num_incremental_blocks, sizeof(num_incremental_blocks));
1636 7327 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1637 : &truncation_block_length, sizeof(truncation_block_length));
1638 7327 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1639 : incremental_blocks,
1640 : sizeof(BlockNumber) * num_incremental_blocks);
1641 :
1642 : /*
1643 : * Add padding to align header to a multiple of BLCKSZ, but only if
1644 : * the incremental file has some blocks, and the alignment is actually
1645 : * needed (i.e. header is not already a multiple of BLCKSZ). If there
1646 : * are no blocks we don't want to make the file unnecessarily large,
1647 : * as that might make some filesystem optimizations impossible.
1648 : */
1649 7327 : if ((num_incremental_blocks > 0) && (header_bytes_done % BLCKSZ != 0))
1650 : {
1651 26 : paddinglen = (BLCKSZ - (header_bytes_done % BLCKSZ));
1652 :
1653 26 : memset(padding, 0, paddinglen);
1654 26 : bytes_done += paddinglen;
1655 :
1656 26 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1657 : padding, paddinglen);
1658 : }
1659 :
1660 : /* Flush out any data still in the buffer so it's again empty. */
1661 7327 : if (header_bytes_done > 0)
1662 : {
1663 7327 : bbsink_archive_contents(sink, header_bytes_done);
1664 7327 : if (pg_checksum_update(&checksum_ctx,
1665 7327 : (uint8 *) sink->bbs_buffer,
1666 : header_bytes_done) < 0)
1667 0 : elog(ERROR, "could not update checksum of base backup");
1668 : }
1669 :
1670 : /* Update our notion of file position. */
1671 7327 : bytes_done += sizeof(magic);
1672 7327 : bytes_done += sizeof(num_incremental_blocks);
1673 7327 : bytes_done += sizeof(truncation_block_length);
1674 7327 : bytes_done += sizeof(BlockNumber) * num_incremental_blocks;
1675 : }
1676 :
1677 : /*
1678 : * Loop until we read the amount of data the caller told us to expect. The
1679 : * file could be longer, if it was extended while we were sending it, but
1680 : * for a base backup we can ignore such extended data. It will be restored
1681 : * from WAL.
1682 : */
1683 : while (1)
1684 : {
1685 : /*
1686 : * Determine whether we've read all the data that we need, and if not,
1687 : * read some more.
1688 : */
1689 353756 : if (incremental_blocks == NULL)
1690 : {
1691 346389 : size_t remaining = statbuf->st_size - bytes_done;
1692 :
1693 : /*
1694 : * If we've read the required number of bytes, then it's time to
1695 : * stop.
1696 : */
1697 346389 : if (bytes_done >= statbuf->st_size)
1698 157069 : break;
1699 :
1700 : /*
1701 : * Read as many bytes as will fit in the buffer, or however many
1702 : * are left to read, whichever is less.
1703 : */
1704 189320 : cnt = read_file_data_into_buffer(sink, readfilename, fd,
1705 : bytes_done, remaining,
1706 189320 : blkno + segno * RELSEG_SIZE,
1707 : verify_checksum,
1708 : &checksum_failures);
1709 : }
1710 : else
1711 : {
1712 : BlockNumber relative_blkno;
1713 :
1714 : /*
1715 : * If we've read all the blocks, then it's time to stop.
1716 : */
1717 7367 : if (ibindex >= num_incremental_blocks)
1718 7327 : break;
1719 :
1720 : /*
1721 : * Read just one block, whichever one is the next that we're
1722 : * supposed to include.
1723 : */
1724 40 : relative_blkno = incremental_blocks[ibindex++];
1725 40 : cnt = read_file_data_into_buffer(sink, readfilename, fd,
1726 40 : relative_blkno * BLCKSZ,
1727 : BLCKSZ,
1728 40 : relative_blkno + segno * RELSEG_SIZE,
1729 : verify_checksum,
1730 : &checksum_failures);
1731 :
1732 : /*
1733 : * If we get a partial read, that must mean that the relation is
1734 : * being truncated. Ultimately, it should be truncated to a
1735 : * multiple of BLCKSZ, since this path should only be reached for
1736 : * relation files, but we might transiently observe an
1737 : * intermediate value.
1738 : *
1739 : * It should be fine to treat this just as if the entire block had
1740 : * been truncated away - i.e. fill this and all later blocks with
1741 : * zeroes. WAL replay will fix things up.
1742 : */
1743 40 : if (cnt < BLCKSZ)
1744 0 : break;
1745 : }
1746 :
1747 : /*
1748 : * If the amount of data we were able to read was not a multiple of
1749 : * BLCKSZ, we cannot verify checksums, which are block-level.
1750 : */
1751 189360 : if (verify_checksum && (cnt % BLCKSZ != 0))
1752 : {
1753 0 : ereport(WARNING,
1754 : (errmsg("could not verify checksum in file \"%s\", block "
1755 : "%u: read buffer size %d and page size %d "
1756 : "differ",
1757 : readfilename, blkno, (int) cnt, BLCKSZ)));
1758 0 : verify_checksum = false;
1759 : }
1760 :
1761 : /*
1762 : * If we hit end-of-file, a concurrent truncation must have occurred.
1763 : * That's not an error condition, because WAL replay will fix things
1764 : * up.
1765 : */
1766 189360 : if (cnt == 0)
1767 0 : break;
1768 :
1769 : /* Update block number and # of bytes done for next loop iteration. */
1770 189360 : blkno += cnt / BLCKSZ;
1771 189360 : bytes_done += cnt;
1772 :
1773 : /*
1774 : * Make sure incremental files with block data are properly aligned
1775 : * (header is a multiple of BLCKSZ, blocks are BLCKSZ too).
1776 : */
1777 : Assert(!((incremental_blocks != NULL && num_incremental_blocks > 0) &&
1778 : (bytes_done % BLCKSZ != 0)));
1779 :
1780 : /* Archive the data we just read. */
1781 189360 : bbsink_archive_contents(sink, cnt);
1782 :
1783 : /* Also feed it to the checksum machinery. */
1784 189360 : if (pg_checksum_update(&checksum_ctx,
1785 189360 : (uint8 *) sink->bbs_buffer, cnt) < 0)
1786 0 : elog(ERROR, "could not update checksum of base backup");
1787 : }
1788 :
1789 : /* If the file was truncated while we were sending it, pad it with zeros */
1790 164396 : while (bytes_done < statbuf->st_size)
1791 : {
1792 0 : size_t remaining = statbuf->st_size - bytes_done;
1793 0 : size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1794 :
1795 0 : MemSet(sink->bbs_buffer, 0, nbytes);
1796 0 : if (pg_checksum_update(&checksum_ctx,
1797 0 : (uint8 *) sink->bbs_buffer,
1798 : nbytes) < 0)
1799 0 : elog(ERROR, "could not update checksum of base backup");
1800 0 : bbsink_archive_contents(sink, nbytes);
1801 0 : bytes_done += nbytes;
1802 : }
1803 :
1804 : /*
1805 : * Pad to a block boundary, per tar format requirements. (This small piece
1806 : * of data is probably not worth throttling, and is not checksummed
1807 : * because it's not actually part of the file.)
1808 : */
1809 164396 : _tarWritePadding(sink, bytes_done);
1810 :
1811 164396 : CloseTransientFile(fd);
1812 :
1813 164396 : if (checksum_failures > 1)
1814 : {
1815 2 : ereport(WARNING,
1816 : (errmsg_plural("file \"%s\" has a total of %d checksum verification failure",
1817 : "file \"%s\" has a total of %d checksum verification failures",
1818 : checksum_failures,
1819 : readfilename, checksum_failures)));
1820 :
1821 2 : pgstat_prepare_report_checksum_failure(dboid);
1822 2 : pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
1823 : }
1824 :
1825 164396 : total_checksum_failures += checksum_failures;
1826 :
1827 164396 : AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size,
1828 164396 : (pg_time_t) statbuf->st_mtime, &checksum_ctx);
1829 :
1830 164396 : return true;
1831 : }
1832 :
1833 : /*
1834 : * Read some more data from the file into the bbsink's buffer, verifying
1835 : * checksums as required.
1836 : *
1837 : * 'offset' is the file offset from which we should begin to read, and
1838 : * 'length' is the amount of data that should be read. The actual amount
1839 : * of data read will be less than the requested amount if the bbsink's
1840 : * buffer isn't big enough to hold it all, or if the underlying file has
1841 : * been truncated. The return value is the number of bytes actually read.
1842 : *
1843 : * 'blkno' is the block number of the first page in the bbsink's buffer
1844 : * relative to the start of the relation.
1845 : *
1846 : * 'verify_checksum' indicates whether we should try to verify checksums
1847 : * for the blocks we read. If we do this, we'll update *checksum_failures
1848 : * and issue warnings as appropriate.
1849 : */
1850 : static off_t
1851 189360 : read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
1852 : off_t offset, size_t length, BlockNumber blkno,
1853 : bool verify_checksum, int *checksum_failures)
1854 : {
1855 : off_t cnt;
1856 : int i;
1857 : char *page;
1858 :
1859 : /* Try to read some more data. */
1860 189360 : cnt = basebackup_read_file(fd, sink->bbs_buffer,
1861 189360 : Min(sink->bbs_buffer_length, length),
1862 : offset, readfilename, true);
1863 :
1864 : /* Can't verify checksums if read length is not a multiple of BLCKSZ. */
1865 189360 : if (!verify_checksum || (cnt % BLCKSZ) != 0)
1866 4415 : return cnt;
1867 :
1868 : /* Verify checksum for each block. */
1869 646645 : for (i = 0; i < cnt / BLCKSZ; i++)
1870 : {
1871 : int reread_cnt;
1872 : uint16 expected_checksum;
1873 :
1874 461700 : page = sink->bbs_buffer + BLCKSZ * i;
1875 :
1876 : /* If the page is OK, go on to the next one. */
1877 461700 : if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1878 : &expected_checksum))
1879 461686 : continue;
1880 :
1881 : /*
1882 : * Retry the block on the first failure. It's possible that we read
1883 : * the first 4K page of the block just before postgres updated the
1884 : * entire block so it ends up looking torn to us. If, before we retry
1885 : * the read, the concurrent write of the block finishes, the page LSN
1886 : * will be updated and we'll realize that we should ignore this block.
1887 : *
1888 : * There's no guarantee that this will actually happen, though: the
1889 : * torn write could take an arbitrarily long time to complete.
1890 : * Retrying multiple times wouldn't fix this problem, either, though
1891 : * it would reduce the chances of it happening in practice. The only
1892 : * real fix here seems to be to have some kind of interlock that
1893 : * allows us to wait until we can be certain that no write to the
1894 : * block is in progress. Since we don't have any such thing right now,
1895 : * we just do this and hope for the best.
1896 : */
1897 14 : reread_cnt =
1898 14 : basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i,
1899 14 : BLCKSZ, offset + BLCKSZ * i,
1900 : readfilename, false);
1901 14 : if (reread_cnt == 0)
1902 : {
1903 : /*
1904 : * If we hit end-of-file, a concurrent truncation must have
1905 : * occurred, so reduce cnt to reflect only the blocks already
1906 : * processed and break out of this loop.
1907 : */
1908 0 : cnt = BLCKSZ * i;
1909 0 : break;
1910 : }
1911 :
1912 : /* If the page now looks OK, go on to the next one. */
1913 14 : if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1914 : &expected_checksum))
1915 0 : continue;
1916 :
1917 : /* Handle checksum failure. */
1918 14 : (*checksum_failures)++;
1919 14 : if (*checksum_failures <= 5)
1920 12 : ereport(WARNING,
1921 : (errmsg("checksum verification failed in "
1922 : "file \"%s\", block %u: calculated "
1923 : "%X but expected %X",
1924 : readfilename, blkno + i, expected_checksum,
1925 : ((PageHeader) page)->pd_checksum)));
1926 14 : if (*checksum_failures == 5)
1927 2 : ereport(WARNING,
1928 : (errmsg("further checksum verification "
1929 : "failures in file \"%s\" will not "
1930 : "be reported", readfilename)));
1931 : }
1932 :
1933 184945 : return cnt;
1934 : }
1935 :
1936 : /*
1937 : * Push data into a bbsink.
1938 : *
1939 : * It's better, when possible, to read data directly into the bbsink's buffer,
1940 : * rather than using this function to copy it into the buffer; this function is
1941 : * for cases where that approach is not practical.
1942 : *
1943 : * bytes_done should point to a count of the number of bytes that are
1944 : * currently used in the bbsink's buffer. Upon return, the bytes identified by
1945 : * data and length will have been copied into the bbsink's buffer, flushing
1946 : * as required, and *bytes_done will have been updated accordingly. If the
1947 : * buffer was flushed, the previous contents will also have been fed to
1948 : * checksum_ctx.
1949 : *
1950 : * Note that after one or more calls to this function it is the caller's
1951 : * responsibility to perform any required final flush.
1952 : */
1953 : static void
1954 29334 : push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx,
1955 : size_t *bytes_done, void *data, size_t length)
1956 : {
1957 29334 : while (length > 0)
1958 : {
1959 : size_t bytes_to_copy;
1960 :
1961 : /*
1962 : * We use < here rather than <= so that if the data exactly fills the
1963 : * remaining buffer space, we trigger a flush now.
1964 : */
1965 22033 : if (length < sink->bbs_buffer_length - *bytes_done)
1966 : {
1967 : /* Append remaining data to buffer. */
1968 22033 : memcpy(sink->bbs_buffer + *bytes_done, data, length);
1969 22033 : *bytes_done += length;
1970 22033 : return;
1971 : }
1972 :
1973 : /* Copy until buffer is full and flush it. */
1974 0 : bytes_to_copy = sink->bbs_buffer_length - *bytes_done;
1975 0 : memcpy(sink->bbs_buffer + *bytes_done, data, bytes_to_copy);
1976 0 : data = ((char *) data) + bytes_to_copy;
1977 0 : length -= bytes_to_copy;
1978 0 : bbsink_archive_contents(sink, sink->bbs_buffer_length);
1979 0 : if (pg_checksum_update(checksum_ctx, (uint8 *) sink->bbs_buffer,
1980 : sink->bbs_buffer_length) < 0)
1981 0 : elog(ERROR, "could not update checksum");
1982 0 : *bytes_done = 0;
1983 : }
1984 : }
1985 :
1986 : /*
1987 : * Try to verify the checksum for the provided page, if it seems appropriate
1988 : * to do so.
1989 : *
1990 : * Returns true if verification succeeds or if we decide not to check it,
1991 : * and false if verification fails. When return false, it also sets
1992 : * *expected_checksum to the computed value.
1993 : */
1994 : static bool
1995 461714 : verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno,
1996 : uint16 *expected_checksum)
1997 : {
1998 : PageHeader phdr;
1999 : uint16 checksum;
2000 :
2001 : /*
2002 : * Only check pages which have not been modified since the start of the
2003 : * base backup. Otherwise, they might have been written only halfway and
2004 : * the checksum would not be valid. However, replaying WAL would
2005 : * reinstate the correct page in this case. We also skip completely new
2006 : * pages, since they don't have a checksum yet.
2007 : */
2008 461714 : if (PageIsNew(page) || PageGetLSN(page) >= start_lsn)
2009 1029 : return true;
2010 :
2011 : /* Perform the actual checksum calculation. */
2012 460685 : checksum = pg_checksum_page(page, blkno);
2013 :
2014 : /* See whether it matches the value from the page. */
2015 460685 : phdr = (PageHeader) page;
2016 460685 : if (phdr->pd_checksum == checksum)
2017 460657 : return true;
2018 28 : *expected_checksum = checksum;
2019 28 : return false;
2020 : }
2021 :
2022 : static int64
2023 173560 : _tarWriteHeader(bbsink *sink, const char *filename, const char *linktarget,
2024 : struct stat *statbuf, bool sizeonly)
2025 : {
2026 : enum tarError rc;
2027 :
2028 173560 : if (!sizeonly)
2029 : {
2030 : /*
2031 : * As of this writing, the smallest supported block size is 1kB, which
2032 : * is twice TAR_BLOCK_SIZE. Since the buffer size is required to be a
2033 : * multiple of BLCKSZ, it should be safe to assume that the buffer is
2034 : * large enough to fit an entire tar block. We double-check by means
2035 : * of these assertions.
2036 : */
2037 : StaticAssertDecl(TAR_BLOCK_SIZE <= BLCKSZ,
2038 : "BLCKSZ too small for tar block");
2039 : Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
2040 :
2041 169045 : rc = tarCreateHeader(sink->bbs_buffer, filename, linktarget,
2042 : statbuf->st_size, statbuf->st_mode,
2043 : statbuf->st_uid, statbuf->st_gid,
2044 : statbuf->st_mtime);
2045 :
2046 169045 : switch (rc)
2047 : {
2048 169044 : case TAR_OK:
2049 169044 : break;
2050 1 : case TAR_NAME_TOO_LONG:
2051 1 : ereport(ERROR,
2052 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2053 : errmsg("file name too long for tar format: \"%s\"",
2054 : filename)));
2055 : break;
2056 0 : case TAR_SYMLINK_TOO_LONG:
2057 0 : ereport(ERROR,
2058 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2059 : errmsg("symbolic link target too long for tar format: "
2060 : "file name \"%s\", target \"%s\"",
2061 : filename, linktarget)));
2062 : break;
2063 0 : default:
2064 0 : elog(ERROR, "unrecognized tar error: %d", rc);
2065 : }
2066 :
2067 169044 : bbsink_archive_contents(sink, TAR_BLOCK_SIZE);
2068 : }
2069 :
2070 173559 : return TAR_BLOCK_SIZE;
2071 : }
2072 :
2073 : /*
2074 : * Pad with zero bytes out to a multiple of TAR_BLOCK_SIZE.
2075 : */
2076 : static void
2077 164607 : _tarWritePadding(bbsink *sink, int len)
2078 : {
2079 164607 : int pad = tarPaddingBytesRequired(len);
2080 :
2081 : /*
2082 : * As in _tarWriteHeader, it should be safe to assume that the buffer is
2083 : * large enough that we don't need to do this in multiple chunks.
2084 : */
2085 : Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
2086 : Assert(pad <= TAR_BLOCK_SIZE);
2087 :
2088 164607 : if (pad > 0)
2089 : {
2090 29878 : MemSet(sink->bbs_buffer, 0, pad);
2091 9717 : bbsink_archive_contents(sink, pad);
2092 : }
2093 164607 : }
2094 :
2095 : /*
2096 : * If the entry in statbuf is a link, then adjust statbuf to make it look like a
2097 : * directory, so that it will be written that way.
2098 : */
2099 : static void
2100 2672 : convert_link_to_directory(const char *pathbuf, struct stat *statbuf)
2101 : {
2102 : /* If symlink, write it as a directory anyway */
2103 2672 : if (S_ISLNK(statbuf->st_mode))
2104 66 : statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
2105 2672 : }
2106 :
2107 : /*
2108 : * Read some data from a file, setting a wait event and reporting any error
2109 : * encountered.
2110 : *
2111 : * If partial_read_ok is false, also report an error if the number of bytes
2112 : * read is not equal to the number of bytes requested.
2113 : *
2114 : * Returns the number of bytes read.
2115 : */
2116 : static ssize_t
2117 197054 : basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
2118 : const char *filename, bool partial_read_ok)
2119 : {
2120 : ssize_t rc;
2121 :
2122 197054 : pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ);
2123 197054 : rc = pg_pread(fd, buf, nbytes, offset);
2124 197054 : pgstat_report_wait_end();
2125 :
2126 197054 : if (rc < 0)
2127 0 : ereport(ERROR,
2128 : (errcode_for_file_access(),
2129 : errmsg("could not read file \"%s\": %m", filename)));
2130 197054 : if (!partial_read_ok && rc > 0 && rc != nbytes)
2131 0 : ereport(ERROR,
2132 : (errcode_for_file_access(),
2133 : errmsg("could not read file \"%s\": read %zd of %zu",
2134 : filename, rc, nbytes)));
2135 :
2136 197054 : return rc;
2137 : }
|