Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * basebackup.c
4 : * code for taking a base backup and streaming it to a standby
5 : *
6 : * Portions Copyright (c) 2010-2025, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/backend/backup/basebackup.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "postgres.h"
14 :
15 : #include <sys/stat.h>
16 : #include <unistd.h>
17 : #include <time.h>
18 :
19 : #include "access/xlog_internal.h"
20 : #include "access/xlogbackup.h"
21 : #include "backup/backup_manifest.h"
22 : #include "backup/basebackup.h"
23 : #include "backup/basebackup_incremental.h"
24 : #include "backup/basebackup_sink.h"
25 : #include "backup/basebackup_target.h"
26 : #include "catalog/pg_tablespace_d.h"
27 : #include "commands/defrem.h"
28 : #include "common/compression.h"
29 : #include "common/file_perm.h"
30 : #include "common/file_utils.h"
31 : #include "lib/stringinfo.h"
32 : #include "miscadmin.h"
33 : #include "nodes/pg_list.h"
34 : #include "pgstat.h"
35 : #include "pgtar.h"
36 : #include "postmaster/syslogger.h"
37 : #include "postmaster/walsummarizer.h"
38 : #include "replication/slot.h"
39 : #include "replication/walsender.h"
40 : #include "replication/walsender_private.h"
41 : #include "storage/bufpage.h"
42 : #include "storage/checksum.h"
43 : #include "storage/dsm_impl.h"
44 : #include "storage/ipc.h"
45 : #include "storage/reinit.h"
46 : #include "utils/builtins.h"
47 : #include "utils/guc.h"
48 : #include "utils/ps_status.h"
49 : #include "utils/relcache.h"
50 : #include "utils/resowner.h"
51 :
52 : /*
53 : * How much data do we want to send in one CopyData message? Note that
54 : * this may also result in reading the underlying files in chunks of this
55 : * size.
56 : *
57 : * NB: The buffer size is required to be a multiple of the system block
58 : * size, so use that value instead if it's bigger than our preference.
59 : */
60 : #define SINK_BUFFER_LENGTH Max(32768, BLCKSZ)
61 :
/*
 * Options for one base backup request, as filled in by
 * parse_basebackup_options().  That function zeroes the whole struct before
 * applying the options it was given, so a field whose option is absent is
 * 0/false/NULL unless noted below.
 */
62 : typedef struct
63 : {
64 : const char *label; /* "label" option; "base backup" when absent */
65 : bool progress; /* "progress" option: estimate the total size before sending */
66 : bool fastcheckpoint; /* "checkpoint" option: true for "fast", false for "spread" */
67 : bool nowait; /* negation of the "wait" option */
68 : bool includewal; /* "wal" option: append the needed WAL files to the main archive */
69 : bool incremental; /* "incremental" option; rejected unless summarize_wal is on */
70 : uint32 maxrate; /* "max_rate" option, checked against MAX_RATE_LOWER..MAX_RATE_UPPER */
71 : bool sendtblspcmapfile; /* "tablespace_map" option: send a tablespace_map file */
72 : bool send_to_client; /* set when no target is given or the target is "client" */
73 : bool use_copytblspc; /* set only when no "target" option is given at all */
74 : BaseBackupTargetHandle *target_handle; /* obtained for any target other than "client" */
75 : backup_manifest_option manifest; /* "manifest" option; MANIFEST_OPTION_NO by default */
76 : pg_compress_algorithm compression; /* "compression" option; PG_COMPRESSION_NONE by default */
77 : pg_compress_specification compression_specification; /* built from the algorithm and "compression_detail" */
78 : pg_checksum_type manifest_checksum_type; /* "manifest_checksums" option; CRC32C by default, NONE when no manifest */
79 : } basebackup_options;
80 :
/* Forward declarations of this file's static (file-local) functions. */
81 : static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly,
82 : struct backup_manifest_info *manifest,
83 : IncrementalBackupInfo *ib);
84 : static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
85 : List *tablespaces, bool sendtblspclinks,
86 : backup_manifest_info *manifest, Oid spcoid,
87 : IncrementalBackupInfo *ib);
88 : static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
89 : struct stat *statbuf, bool missing_ok,
90 : Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
91 : unsigned segno,
92 : backup_manifest_info *manifest,
93 : unsigned num_incremental_blocks,
94 : BlockNumber *incremental_blocks,
95 : unsigned truncation_block_length);
96 : static off_t read_file_data_into_buffer(bbsink *sink,
97 : const char *readfilename, int fd,
98 : off_t offset, size_t length,
99 : BlockNumber blkno,
100 : bool verify_checksum,
101 : int *checksum_failures);
102 : static void push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx,
103 : size_t *bytes_done, void *data, size_t length);
104 : static bool verify_page_checksum(Page page, XLogRecPtr start_lsn,
105 : BlockNumber blkno,
106 : uint16 *expected_checksum);
107 : static void sendFileWithContent(bbsink *sink, const char *filename,
108 : const char *content, int len,
109 : backup_manifest_info *manifest);
110 : static int64 _tarWriteHeader(bbsink *sink, const char *filename,
111 : const char *linktarget, struct stat *statbuf,
112 : bool sizeonly);
113 : static void _tarWritePadding(bbsink *sink, int len);
114 : static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf);
115 : static void perform_base_backup(basebackup_options *opt, bbsink *sink,
116 : IncrementalBackupInfo *ib);
117 : static void parse_basebackup_options(List *options, basebackup_options *opt);
118 : static int compareWalFileNames(const ListCell *a, const ListCell *b);
119 : static ssize_t basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
120 : const char *filename, bool partial_read_ok);
121 :
122 : /* Was the backup currently in-progress initiated in recovery mode? Assigned from RecoveryInProgress() at the start of perform_base_backup(). */
123 : static bool backup_started_in_recovery = false;
124 :
125 : /* Total number of checksum failures during base backup. Zeroed at the start of perform_base_backup(); a nonzero value at the end of the backup makes it fail with an ERROR. */
126 : static long long int total_checksum_failures;
127 :
128 : /* Do not verify checksums. Set to the negation of the "verify_checksums" option by parse_basebackup_options(). */
129 : static bool noverify_checksums = false;
130 :
131 : /*
132 : * One entry of an exclusion list. Such lists name the paths to be skipped
133 : * during checksum validation or when taking a base backup. "name" is the
134 : * file or path name to compare against. If "match_prefix" is true, every
135 : * item whose name starts with "name" is excluded as well.
136 : */
137 : struct exclude_list_item
138 : {
139 : const char *name; /* file or path name to exclude; NULL marks the end of a list */
140 : bool match_prefix; /* true: "name" is matched as a prefix */
141 : };
142 :
143 : /*
144 : * The contents of these directories are removed or recreated during server
145 : * start so they are not included in backups. The directories themselves are
146 : * kept and included as empty to preserve access permissions.
147 : *
148 : * Note: this list should be kept in sync with the filter lists in pg_rewind's
149 : * filemap.c.
150 : */
150 : static const char *const excludeDirContents[] = /* NULL-terminated list of directory names */
151 : {
152 : /*
153 : * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped
154 : * because extensions like pg_stat_statements store data there.
155 : */
156 : PG_STAT_TMP_DIR,
157 :
158 : /*
159 : * It is generally not useful to backup the contents of this directory
160 : * even if the intention is to restore to another primary. See backup.sgml
161 : * for a more detailed description.
162 : */
163 : PG_REPLSLOT_DIR,
164 :
165 : /* Contents removed on startup, see dsm_cleanup_for_mmap(). */
166 : PG_DYNSHMEM_DIR,
167 :
168 : /* Contents removed on startup, see AsyncShmemInit(). */
169 : "pg_notify",
170 :
171 : /*
172 : * Old contents are loaded for possible debugging but are not required for
173 : * normal operation, see SerialInit().
174 : */
175 : "pg_serial",
176 :
177 : /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
178 : "pg_snapshots",
179 :
180 : /* Contents zeroed on startup, see StartupSUBTRANS(). */
181 : "pg_subtrans",
182 :
183 : /* end of list */
184 : NULL
185 : };
187 :
188 : /*
189 : * List of files excluded from backups.
190 : */
190 : static const struct exclude_list_item excludeFiles[] = /* each entry is {name, match_prefix}; a NULL name ends the list */
191 : {
192 : /* Skip auto conf temporary file. */
193 : {PG_AUTOCONF_FILENAME ".tmp", false},
194 :
195 : /* Skip current log file temporary file */
196 : {LOG_METAINFO_DATAFILE_TMP, false},
197 :
198 : /*
199 : * Skip relation cache because it is rebuilt on startup. This includes
200 : * temporary files.
201 : */
202 : {RELCACHE_INIT_FILENAME, true},
203 :
204 : /*
205 : * backup_label and tablespace_map should not exist in a running cluster
206 : * capable of doing an online backup, but exclude them just in case.
207 : */
208 : {BACKUP_LABEL_FILE, false},
209 : {TABLESPACE_MAP, false},
210 :
211 : /*
212 : * If there's a backup_manifest, it belongs to a backup that was used to
213 : * start this server. It is *not* correct for this backup. Our
214 : * backup_manifest is injected into the backup separately if users want
215 : * it.
216 : */
217 : {"backup_manifest", false},
218 :
/* Also skip these two postmaster files (exact name match only). */
219 : {"postmaster.pid", false},
220 : {"postmaster.opts", false},
221 :
222 : /* end of list */
223 : {NULL, false}
224 : };
226 :
227 : /*
228 : * Actually do a base backup for the specified tablespaces.
229 : *
230 : * This is split out mainly to avoid complaints about "variable might be
231 : * clobbered by longjmp" from stupider versions of gcc.
232 : */
233 : static void
234 312 : perform_base_backup(basebackup_options *opt, bbsink *sink,
235 : IncrementalBackupInfo *ib)
236 : {
237 : bbsink_state state;
238 : XLogRecPtr endptr;
239 : TimeLineID endtli;
240 : backup_manifest_info manifest;
241 : BackupState *backup_state;
242 : StringInfo tablespace_map;
243 :
244 : /* Initial backup state, insofar as we know it now. */
245 312 : state.tablespaces = NIL;
246 312 : state.tablespace_num = 0;
247 312 : state.bytes_done = 0;
248 312 : state.bytes_total = 0;
249 312 : state.bytes_total_is_valid = false;
250 :
251 : /* we're going to use a BufFile, so we need a ResourceOwner */
252 : Assert(AuxProcessResourceOwner != NULL);
253 : Assert(CurrentResourceOwner == AuxProcessResourceOwner ||
254 : CurrentResourceOwner == NULL);
255 312 : CurrentResourceOwner = AuxProcessResourceOwner;
256 :
257 312 : backup_started_in_recovery = RecoveryInProgress();
258 :
259 312 : InitializeBackupManifest(&manifest, opt->manifest,
260 : opt->manifest_checksum_type);
261 :
262 312 : total_checksum_failures = 0;
263 :
264 : /* Allocate backup related variables. */
265 312 : backup_state = (BackupState *) palloc0(sizeof(BackupState));
266 312 : tablespace_map = makeStringInfo();
267 :
268 312 : basebackup_progress_wait_checkpoint();
269 312 : do_pg_backup_start(opt->label, opt->fastcheckpoint, &state.tablespaces,
270 : backup_state, tablespace_map);
271 :
272 312 : state.startptr = backup_state->startpoint;
273 312 : state.starttli = backup_state->starttli;
274 :
275 : /*
276 : * Once do_pg_backup_start has been called, ensure that any failure causes
277 : * us to abort the backup so we don't "leak" a backup counter. For this
278 : * reason, *all* functionality between do_pg_backup_start() and the end of
279 : * do_pg_backup_stop() should be inside the error cleanup block!
280 : */
281 :
282 312 : PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
283 : {
284 : ListCell *lc;
285 : tablespaceinfo *newti;
286 :
287 : /* If this is an incremental backup, execute preparatory steps. */
288 312 : if (ib != NULL)
289 18 : PrepareForIncrementalBackup(ib, backup_state);
290 :
291 : /* Add a node for the base directory at the end */
292 312 : newti = palloc0(sizeof(tablespaceinfo));
293 312 : newti->size = -1;
294 312 : state.tablespaces = lappend(state.tablespaces, newti);
295 :
296 : /*
297 : * Calculate the total backup size by summing up the size of each
298 : * tablespace
299 : */
300 312 : if (opt->progress)
301 : {
302 310 : basebackup_progress_estimate_backup_size();
303 :
304 692 : foreach(lc, state.tablespaces)
305 : {
306 382 : tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);
307 :
308 382 : if (tmp->path == NULL)
309 310 : tmp->size = sendDir(sink, ".", 1, true, state.tablespaces,
310 : true, NULL, InvalidOid, NULL);
311 : else
312 72 : tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true,
313 : NULL, NULL);
314 382 : state.bytes_total += tmp->size;
315 : }
316 310 : state.bytes_total_is_valid = true;
317 : }
318 :
319 : /* notify basebackup sink about start of backup */
320 312 : bbsink_begin_backup(sink, &state, SINK_BUFFER_LENGTH);
321 :
322 : /* Send off our tablespaces one by one */
323 682 : foreach(lc, state.tablespaces)
324 : {
325 384 : tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
326 :
327 384 : if (ti->path == NULL)
328 : {
329 : struct stat statbuf;
330 312 : bool sendtblspclinks = true;
331 : char *backup_label;
332 :
333 312 : bbsink_begin_archive(sink, "base.tar");
334 :
335 : /* In the main tar, include the backup_label first... */
336 312 : backup_label = build_backup_content(backup_state, false);
337 312 : sendFileWithContent(sink, BACKUP_LABEL_FILE,
338 : backup_label, -1, &manifest);
339 312 : pfree(backup_label);
340 :
341 : /* Then the tablespace_map file, if required... */
342 312 : if (opt->sendtblspcmapfile)
343 : {
344 50 : sendFileWithContent(sink, TABLESPACE_MAP,
345 50 : tablespace_map->data, -1, &manifest);
346 50 : sendtblspclinks = false;
347 : }
348 :
349 : /* Then the bulk of the files... */
350 312 : sendDir(sink, ".", 1, false, state.tablespaces,
351 : sendtblspclinks, &manifest, InvalidOid, ib);
352 :
353 : /* ... and pg_control after everything else. */
354 298 : if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
355 0 : ereport(ERROR,
356 : (errcode_for_file_access(),
357 : errmsg("could not stat file \"%s\": %m",
358 : XLOG_CONTROL_FILE)));
359 298 : sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
360 : false, InvalidOid, InvalidOid,
361 : InvalidRelFileNumber, 0, &manifest, 0, NULL, 0);
362 : }
363 : else
364 : {
365 72 : char *archive_name = psprintf("%u.tar", ti->oid);
366 :
367 72 : bbsink_begin_archive(sink, archive_name);
368 :
369 72 : sendTablespace(sink, ti->path, ti->oid, false, &manifest, ib);
370 : }
371 :
372 : /*
373 : * If we're including WAL, and this is the main data directory we
374 : * don't treat this as the end of the tablespace. Instead, we will
375 : * include the xlog files below and stop afterwards. This is safe
376 : * since the main data directory is always sent *last*.
377 : */
378 370 : if (opt->includewal && ti->path == NULL)
379 : {
380 : Assert(lnext(state.tablespaces, lc) == NULL);
381 : }
382 : else
383 : {
384 : /* Properly terminate the tarfile. */
385 : StaticAssertDecl(2 * TAR_BLOCK_SIZE <= BLCKSZ,
386 : "BLCKSZ too small for 2 tar blocks");
387 344 : memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
388 344 : bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
389 :
390 : /* OK, that's the end of the archive. */
391 344 : bbsink_end_archive(sink);
392 : }
393 : }
394 :
395 298 : basebackup_progress_wait_wal_archive(&state);
396 298 : do_pg_backup_stop(backup_state, !opt->nowait);
397 :
398 298 : endptr = backup_state->stoppoint;
399 298 : endtli = backup_state->stoptli;
400 :
401 : /* Deallocate backup-related variables. */
402 298 : destroyStringInfo(tablespace_map);
403 298 : pfree(backup_state);
404 : }
405 302 : PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
406 :
407 :
408 298 : if (opt->includewal)
409 : {
410 : /*
411 : * We've left the last tar file "open", so we can now append the
412 : * required WAL files to it.
413 : */
414 : char pathbuf[MAXPGPATH];
415 : XLogSegNo segno;
416 : XLogSegNo startsegno;
417 : XLogSegNo endsegno;
418 : struct stat statbuf;
419 26 : List *historyFileList = NIL;
420 26 : List *walFileList = NIL;
421 : char firstoff[MAXFNAMELEN];
422 : char lastoff[MAXFNAMELEN];
423 : DIR *dir;
424 : struct dirent *de;
425 : ListCell *lc;
426 : TimeLineID tli;
427 :
428 26 : basebackup_progress_transfer_wal();
429 :
430 : /*
431 : * I'd rather not worry about timelines here, so scan pg_wal and
432 : * include all WAL files in the range between 'startptr' and 'endptr',
433 : * regardless of the timeline the file is stamped with. If there are
434 : * some spurious WAL files belonging to timelines that don't belong in
435 : * this server's history, they will be included too. Normally there
436 : * shouldn't be such files, but if there are, there's little harm in
437 : * including them.
438 : */
439 26 : XLByteToSeg(state.startptr, startsegno, wal_segment_size);
440 26 : XLogFileName(firstoff, state.starttli, startsegno, wal_segment_size);
441 26 : XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
442 26 : XLogFileName(lastoff, endtli, endsegno, wal_segment_size);
443 :
444 26 : dir = AllocateDir("pg_wal");
445 188 : while ((de = ReadDir(dir, "pg_wal")) != NULL)
446 : {
447 : /* Does it look like a WAL segment, and is it in the range? */
448 162 : if (IsXLogFileName(de->d_name) &&
449 58 : strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
450 58 : strcmp(de->d_name + 8, lastoff + 8) <= 0)
451 : {
452 26 : walFileList = lappend(walFileList, pstrdup(de->d_name));
453 : }
454 : /* Does it look like a timeline history file? */
455 136 : else if (IsTLHistoryFileName(de->d_name))
456 : {
457 0 : historyFileList = lappend(historyFileList, pstrdup(de->d_name));
458 : }
459 : }
460 26 : FreeDir(dir);
461 :
462 : /*
463 : * Before we go any further, check that none of the WAL segments we
464 : * need were removed.
465 : */
466 26 : CheckXLogRemoved(startsegno, state.starttli);
467 :
468 : /*
469 : * Sort the WAL filenames. We want to send the files in order from
470 : * oldest to newest, to reduce the chance that a file is recycled
471 : * before we get a chance to send it over.
472 : */
473 26 : list_sort(walFileList, compareWalFileNames);
474 :
475 : /*
476 : * There must be at least one xlog file in the pg_wal directory, since
477 : * we are doing backup-including-xlog.
478 : */
479 26 : if (walFileList == NIL)
480 0 : ereport(ERROR,
481 : (errmsg("could not find any WAL files")));
482 :
483 : /*
484 : * Sanity check: the first and last segment should cover startptr and
485 : * endptr, with no gaps in between.
486 : */
487 26 : XLogFromFileName((char *) linitial(walFileList),
488 : &tli, &segno, wal_segment_size);
489 26 : if (segno != startsegno)
490 : {
491 : char startfname[MAXFNAMELEN];
492 :
493 0 : XLogFileName(startfname, state.starttli, startsegno,
494 : wal_segment_size);
495 0 : ereport(ERROR,
496 : (errmsg("could not find WAL file \"%s\"", startfname)));
497 : }
498 52 : foreach(lc, walFileList)
499 : {
500 26 : char *walFileName = (char *) lfirst(lc);
501 26 : XLogSegNo currsegno = segno;
502 26 : XLogSegNo nextsegno = segno + 1;
503 :
504 26 : XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
505 26 : if (!(nextsegno == segno || currsegno == segno))
506 : {
507 : char nextfname[MAXFNAMELEN];
508 :
509 0 : XLogFileName(nextfname, tli, nextsegno, wal_segment_size);
510 0 : ereport(ERROR,
511 : (errmsg("could not find WAL file \"%s\"", nextfname)));
512 : }
513 : }
514 26 : if (segno != endsegno)
515 : {
516 : char endfname[MAXFNAMELEN];
517 :
518 0 : XLogFileName(endfname, endtli, endsegno, wal_segment_size);
519 0 : ereport(ERROR,
520 : (errmsg("could not find WAL file \"%s\"", endfname)));
521 : }
522 :
523 : /* Ok, we have everything we need. Send the WAL files. */
524 52 : foreach(lc, walFileList)
525 : {
526 26 : char *walFileName = (char *) lfirst(lc);
527 : int fd;
528 : ssize_t cnt;
529 26 : pgoff_t len = 0;
530 :
531 26 : snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName);
532 26 : XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
533 :
534 26 : fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY);
535 26 : if (fd < 0)
536 : {
537 0 : int save_errno = errno;
538 :
539 : /*
540 : * Most likely reason for this is that the file was already
541 : * removed by a checkpoint, so check for that to get a better
542 : * error message.
543 : */
544 0 : CheckXLogRemoved(segno, tli);
545 :
546 0 : errno = save_errno;
547 0 : ereport(ERROR,
548 : (errcode_for_file_access(),
549 : errmsg("could not open file \"%s\": %m", pathbuf)));
550 : }
551 :
552 26 : if (fstat(fd, &statbuf) != 0)
553 0 : ereport(ERROR,
554 : (errcode_for_file_access(),
555 : errmsg("could not stat file \"%s\": %m",
556 : pathbuf)));
557 26 : if (statbuf.st_size != wal_segment_size)
558 : {
559 0 : CheckXLogRemoved(segno, tli);
560 0 : ereport(ERROR,
561 : (errcode_for_file_access(),
562 : errmsg("unexpected WAL file size \"%s\"", walFileName)));
563 : }
564 :
565 : /* send the WAL file itself */
566 26 : _tarWriteHeader(sink, pathbuf, NULL, &statbuf, false);
567 :
568 13338 : while ((cnt = basebackup_read_file(fd, sink->bbs_buffer,
569 13312 : Min(sink->bbs_buffer_length,
570 : wal_segment_size - len),
571 : len, pathbuf, true)) > 0)
572 : {
573 13312 : CheckXLogRemoved(segno, tli);
574 13312 : bbsink_archive_contents(sink, cnt);
575 :
576 13312 : len += cnt;
577 :
578 13312 : if (len == wal_segment_size)
579 26 : break;
580 : }
581 :
582 26 : if (len != wal_segment_size)
583 : {
584 0 : CheckXLogRemoved(segno, tli);
585 0 : ereport(ERROR,
586 : (errcode_for_file_access(),
587 : errmsg("unexpected WAL file size \"%s\"", walFileName)));
588 : }
589 :
590 : /*
591 : * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need
592 : * for padding.
593 : */
594 : Assert(wal_segment_size % TAR_BLOCK_SIZE == 0);
595 :
596 26 : CloseTransientFile(fd);
597 :
598 : /*
599 : * Mark file as archived, otherwise files can get archived again
600 : * after promotion of a new node. This is in line with
601 : * walreceiver.c always doing an XLogArchiveForceDone() after a
602 : * complete segment.
603 : */
604 26 : StatusFilePath(pathbuf, walFileName, ".done");
605 26 : sendFileWithContent(sink, pathbuf, "", -1, &manifest);
606 : }
607 :
608 : /*
609 : * Send timeline history files too. Only the latest timeline history
610 : * file is required for recovery, and even that only if there happens
611 : * to be a timeline switch in the first WAL segment that contains the
612 : * checkpoint record, or if we're taking a base backup from a standby
613 : * server and the target timeline changes while the backup is taken.
614 : * But they are small and highly useful for debugging purposes, so
615 : * better include them all, always.
616 : */
617 26 : foreach(lc, historyFileList)
618 : {
619 0 : char *fname = lfirst(lc);
620 :
621 0 : snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);
622 :
623 0 : if (lstat(pathbuf, &statbuf) != 0)
624 0 : ereport(ERROR,
625 : (errcode_for_file_access(),
626 : errmsg("could not stat file \"%s\": %m", pathbuf)));
627 :
628 0 : sendFile(sink, pathbuf, pathbuf, &statbuf, false,
629 : InvalidOid, InvalidOid, InvalidRelFileNumber, 0,
630 : &manifest, 0, NULL, 0);
631 :
632 : /* unconditionally mark file as archived */
633 0 : StatusFilePath(pathbuf, fname, ".done");
634 0 : sendFileWithContent(sink, pathbuf, "", -1, &manifest);
635 : }
636 :
637 : /* Properly terminate the tar file. */
638 : StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ,
639 : "BLCKSZ too small for 2 tar blocks");
640 26 : memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
641 26 : bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
642 :
643 : /* OK, that's the end of the archive. */
644 26 : bbsink_end_archive(sink);
645 : }
646 :
647 298 : AddWALInfoToBackupManifest(&manifest, state.startptr, state.starttli,
648 : endptr, endtli);
649 :
650 298 : SendBackupManifest(&manifest, sink);
651 :
652 298 : bbsink_end_backup(sink, endptr, endtli);
653 :
654 298 : if (total_checksum_failures)
655 : {
656 6 : if (total_checksum_failures > 1)
657 4 : ereport(WARNING,
658 : (errmsg_plural("%lld total checksum verification failure",
659 : "%lld total checksum verification failures",
660 : total_checksum_failures,
661 : total_checksum_failures)));
662 :
663 6 : ereport(ERROR,
664 : (errcode(ERRCODE_DATA_CORRUPTED),
665 : errmsg("checksum verification failure during base backup")));
666 : }
667 :
668 : /*
669 : * Make sure to free the manifest before the resource owners as manifests
670 : * use cryptohash contexts that may depend on resource owners (like
671 : * OpenSSL).
672 : */
673 292 : FreeBackupManifest(&manifest);
674 :
675 : /* clean up the resource owner we created */
676 292 : ReleaseAuxProcessResources(true);
677 :
678 292 : basebackup_progress_done();
679 292 : }
680 :
681 : /*
682 : * list_sort comparison function, to compare log/seg portion of WAL segment
683 : * filenames, ignoring the timeline portion.
684 : */
685 : static int
686 0 : compareWalFileNames(const ListCell *a, const ListCell *b)
687 : {
688 0 : char *fna = (char *) lfirst(a);
689 0 : char *fnb = (char *) lfirst(b);
690 :
691 0 : return strcmp(fna + 8, fnb + 8);
692 : }
693 :
694 : /*
695 : * Parse the base backup options passed down by the parser
696 : */
697 : static void
698 346 : parse_basebackup_options(List *options, basebackup_options *opt)
699 : {
700 : ListCell *lopt;
701 346 : bool o_label = false;
702 346 : bool o_progress = false;
703 346 : bool o_checkpoint = false;
704 346 : bool o_nowait = false;
705 346 : bool o_wal = false;
706 346 : bool o_incremental = false;
707 346 : bool o_maxrate = false;
708 346 : bool o_tablespace_map = false;
709 346 : bool o_noverify_checksums = false;
710 346 : bool o_manifest = false;
711 346 : bool o_manifest_checksums = false;
712 346 : bool o_target = false;
713 346 : bool o_target_detail = false;
714 346 : char *target_str = NULL;
715 346 : char *target_detail_str = NULL;
716 346 : bool o_compression = false;
717 346 : bool o_compression_detail = false;
718 346 : char *compression_detail_str = NULL;
719 :
720 3806 : MemSet(opt, 0, sizeof(*opt));
721 346 : opt->manifest = MANIFEST_OPTION_NO;
722 346 : opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C;
723 346 : opt->compression = PG_COMPRESSION_NONE;
724 346 : opt->compression_specification.algorithm = PG_COMPRESSION_NONE;
725 :
726 2600 : foreach(lopt, options)
727 : {
728 2260 : DefElem *defel = (DefElem *) lfirst(lopt);
729 :
730 2260 : if (strcmp(defel->defname, "label") == 0)
731 : {
732 344 : if (o_label)
733 0 : ereport(ERROR,
734 : (errcode(ERRCODE_SYNTAX_ERROR),
735 : errmsg("duplicate option \"%s\"", defel->defname)));
736 344 : opt->label = defGetString(defel);
737 344 : o_label = true;
738 : }
739 1916 : else if (strcmp(defel->defname, "progress") == 0)
740 : {
741 344 : if (o_progress)
742 0 : ereport(ERROR,
743 : (errcode(ERRCODE_SYNTAX_ERROR),
744 : errmsg("duplicate option \"%s\"", defel->defname)));
745 344 : opt->progress = defGetBoolean(defel);
746 344 : o_progress = true;
747 : }
748 1572 : else if (strcmp(defel->defname, "checkpoint") == 0)
749 : {
750 326 : char *optval = defGetString(defel);
751 :
752 326 : if (o_checkpoint)
753 0 : ereport(ERROR,
754 : (errcode(ERRCODE_SYNTAX_ERROR),
755 : errmsg("duplicate option \"%s\"", defel->defname)));
756 326 : if (pg_strcasecmp(optval, "fast") == 0)
757 326 : opt->fastcheckpoint = true;
758 0 : else if (pg_strcasecmp(optval, "spread") == 0)
759 0 : opt->fastcheckpoint = false;
760 : else
761 0 : ereport(ERROR,
762 : (errcode(ERRCODE_SYNTAX_ERROR),
763 : errmsg("unrecognized checkpoint type: \"%s\"",
764 : optval)));
765 326 : o_checkpoint = true;
766 : }
767 1246 : else if (strcmp(defel->defname, "wait") == 0)
768 : {
769 330 : if (o_nowait)
770 0 : ereport(ERROR,
771 : (errcode(ERRCODE_SYNTAX_ERROR),
772 : errmsg("duplicate option \"%s\"", defel->defname)));
773 330 : opt->nowait = !defGetBoolean(defel);
774 330 : o_nowait = true;
775 : }
776 916 : else if (strcmp(defel->defname, "wal") == 0)
777 : {
778 34 : if (o_wal)
779 0 : ereport(ERROR,
780 : (errcode(ERRCODE_SYNTAX_ERROR),
781 : errmsg("duplicate option \"%s\"", defel->defname)));
782 34 : opt->includewal = defGetBoolean(defel);
783 34 : o_wal = true;
784 : }
785 882 : else if (strcmp(defel->defname, "incremental") == 0)
786 : {
787 18 : if (o_incremental)
788 0 : ereport(ERROR,
789 : (errcode(ERRCODE_SYNTAX_ERROR),
790 : errmsg("duplicate option \"%s\"", defel->defname)));
791 18 : opt->incremental = defGetBoolean(defel);
792 18 : if (opt->incremental && !summarize_wal)
793 0 : ereport(ERROR,
794 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
795 : errmsg("incremental backups cannot be taken unless WAL summarization is enabled")));
796 18 : o_incremental = true;
797 : }
798 864 : else if (strcmp(defel->defname, "max_rate") == 0)
799 : {
800 : int64 maxrate;
801 :
802 4 : if (o_maxrate)
803 0 : ereport(ERROR,
804 : (errcode(ERRCODE_SYNTAX_ERROR),
805 : errmsg("duplicate option \"%s\"", defel->defname)));
806 :
807 4 : maxrate = defGetInt64(defel);
808 4 : if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
809 0 : ereport(ERROR,
810 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
811 : errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
812 : (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));
813 :
814 4 : opt->maxrate = (uint32) maxrate;
815 4 : o_maxrate = true;
816 : }
817 860 : else if (strcmp(defel->defname, "tablespace_map") == 0)
818 : {
819 62 : if (o_tablespace_map)
820 0 : ereport(ERROR,
821 : (errcode(ERRCODE_SYNTAX_ERROR),
822 : errmsg("duplicate option \"%s\"", defel->defname)));
823 62 : opt->sendtblspcmapfile = defGetBoolean(defel);
824 62 : o_tablespace_map = true;
825 : }
826 798 : else if (strcmp(defel->defname, "verify_checksums") == 0)
827 : {
828 2 : if (o_noverify_checksums)
829 0 : ereport(ERROR,
830 : (errcode(ERRCODE_SYNTAX_ERROR),
831 : errmsg("duplicate option \"%s\"", defel->defname)));
832 2 : noverify_checksums = !defGetBoolean(defel);
833 2 : o_noverify_checksums = true;
834 : }
835 796 : else if (strcmp(defel->defname, "manifest") == 0)
836 : {
837 342 : char *optval = defGetString(defel);
838 : bool manifest_bool;
839 :
840 342 : if (o_manifest)
841 0 : ereport(ERROR,
842 : (errcode(ERRCODE_SYNTAX_ERROR),
843 : errmsg("duplicate option \"%s\"", defel->defname)));
844 342 : if (parse_bool(optval, &manifest_bool))
845 : {
846 340 : if (manifest_bool)
847 340 : opt->manifest = MANIFEST_OPTION_YES;
848 : else
849 0 : opt->manifest = MANIFEST_OPTION_NO;
850 : }
851 2 : else if (pg_strcasecmp(optval, "force-encode") == 0)
852 2 : opt->manifest = MANIFEST_OPTION_FORCE_ENCODE;
853 : else
854 0 : ereport(ERROR,
855 : (errcode(ERRCODE_SYNTAX_ERROR),
856 : errmsg("unrecognized manifest option: \"%s\"",
857 : optval)));
858 342 : o_manifest = true;
859 : }
860 454 : else if (strcmp(defel->defname, "manifest_checksums") == 0)
861 : {
862 28 : char *optval = defGetString(defel);
863 :
864 28 : if (o_manifest_checksums)
865 0 : ereport(ERROR,
866 : (errcode(ERRCODE_SYNTAX_ERROR),
867 : errmsg("duplicate option \"%s\"", defel->defname)));
868 28 : if (!pg_checksum_parse_type(optval,
869 : &opt->manifest_checksum_type))
870 4 : ereport(ERROR,
871 : (errcode(ERRCODE_SYNTAX_ERROR),
872 : errmsg("unrecognized checksum algorithm: \"%s\"",
873 : optval)));
874 24 : o_manifest_checksums = true;
875 : }
876 426 : else if (strcmp(defel->defname, "target") == 0)
877 : {
878 340 : if (o_target)
879 0 : ereport(ERROR,
880 : (errcode(ERRCODE_SYNTAX_ERROR),
881 : errmsg("duplicate option \"%s\"", defel->defname)));
882 340 : target_str = defGetString(defel);
883 340 : o_target = true;
884 : }
885 86 : else if (strcmp(defel->defname, "target_detail") == 0)
886 : {
887 14 : char *optval = defGetString(defel);
888 :
889 14 : if (o_target_detail)
890 0 : ereport(ERROR,
891 : (errcode(ERRCODE_SYNTAX_ERROR),
892 : errmsg("duplicate option \"%s\"", defel->defname)));
893 14 : target_detail_str = optval;
894 14 : o_target_detail = true;
895 : }
896 72 : else if (strcmp(defel->defname, "compression") == 0)
897 : {
898 50 : char *optval = defGetString(defel);
899 :
900 50 : if (o_compression)
901 0 : ereport(ERROR,
902 : (errcode(ERRCODE_SYNTAX_ERROR),
903 : errmsg("duplicate option \"%s\"", defel->defname)));
904 50 : if (!parse_compress_algorithm(optval, &opt->compression))
905 2 : ereport(ERROR,
906 : (errcode(ERRCODE_SYNTAX_ERROR),
907 : errmsg("unrecognized compression algorithm: \"%s\"",
908 : optval)));
909 48 : o_compression = true;
910 : }
911 22 : else if (strcmp(defel->defname, "compression_detail") == 0)
912 : {
913 22 : if (o_compression_detail)
914 0 : ereport(ERROR,
915 : (errcode(ERRCODE_SYNTAX_ERROR),
916 : errmsg("duplicate option \"%s\"", defel->defname)));
917 22 : compression_detail_str = defGetString(defel);
918 22 : o_compression_detail = true;
919 : }
920 : else
921 0 : ereport(ERROR,
922 : (errcode(ERRCODE_SYNTAX_ERROR),
923 : errmsg("unrecognized base backup option: \"%s\"",
924 : defel->defname)));
925 : }
926 :
927 340 : if (opt->label == NULL)
928 2 : opt->label = "base backup";
929 340 : if (opt->manifest == MANIFEST_OPTION_NO)
930 : {
931 4 : if (o_manifest_checksums)
932 0 : ereport(ERROR,
933 : (errcode(ERRCODE_SYNTAX_ERROR),
934 : errmsg("manifest checksums require a backup manifest")));
935 4 : opt->manifest_checksum_type = CHECKSUM_TYPE_NONE;
936 : }
937 :
938 340 : if (target_str == NULL)
939 : {
940 2 : if (target_detail_str != NULL)
941 0 : ereport(ERROR,
942 : (errcode(ERRCODE_SYNTAX_ERROR),
943 : errmsg("target detail cannot be used without target")));
944 2 : opt->use_copytblspc = true;
945 2 : opt->send_to_client = true;
946 : }
947 338 : else if (strcmp(target_str, "client") == 0)
948 : {
949 312 : if (target_detail_str != NULL)
950 0 : ereport(ERROR,
951 : (errcode(ERRCODE_SYNTAX_ERROR),
952 : errmsg("target \"%s\" does not accept a target detail",
953 : target_str)));
954 312 : opt->send_to_client = true;
955 : }
956 : else
957 22 : opt->target_handle =
958 26 : BaseBackupGetTargetHandle(target_str, target_detail_str);
959 :
960 336 : if (o_compression_detail && !o_compression)
961 0 : ereport(ERROR,
962 : (errcode(ERRCODE_SYNTAX_ERROR),
963 : errmsg("compression detail cannot be specified unless compression is enabled")));
964 :
965 336 : if (o_compression)
966 : {
967 : char *error_detail;
968 :
969 44 : parse_compress_specification(opt->compression, compression_detail_str,
970 : &opt->compression_specification);
971 : error_detail =
972 44 : validate_compress_specification(&opt->compression_specification);
973 44 : if (error_detail != NULL)
974 18 : ereport(ERROR,
975 : errcode(ERRCODE_SYNTAX_ERROR),
976 : errmsg("invalid compression specification: %s",
977 : error_detail));
978 : }
979 318 : }
980 :
981 :
982 : /*
983 : * SendBaseBackup() - send a complete base backup.
984 : *
985 : * The function will put the system into backup mode like pg_backup_start()
986 : * does, so that the backup is consistent even though we read directly from
987 : * the filesystem, bypassing the buffer cache.
988 : */
989 : void
990 348 : SendBaseBackup(BaseBackupCmd *cmd, IncrementalBackupInfo *ib)
991 : {
992 : basebackup_options opt;
993 : bbsink *sink;
994 348 : SessionBackupState status = get_backup_status();
995 :
996 348 : if (status == SESSION_BACKUP_RUNNING)
997 2 : ereport(ERROR,
998 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
999 : errmsg("a backup is already in progress in this session")));
1000 :
1001 346 : parse_basebackup_options(cmd->options, &opt);
1002 :
1003 318 : WalSndSetState(WALSNDSTATE_BACKUP);
1004 :
1005 318 : if (update_process_title)
1006 : {
1007 : char activitymsg[50];
1008 :
1009 318 : snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
1010 : opt.label);
1011 318 : set_ps_display(activitymsg);
1012 : }
1013 :
1014 : /*
1015 : * If we're asked to perform an incremental backup and the user has not
1016 : * supplied a manifest, that's an ERROR.
1017 : *
1018 : * If we're asked to perform a full backup and the user did supply a
1019 : * manifest, just ignore it.
1020 : */
1021 318 : if (!opt.incremental)
1022 300 : ib = NULL;
1023 18 : else if (ib == NULL)
1024 0 : ereport(ERROR,
1025 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1026 : errmsg("must UPLOAD_MANIFEST before performing an incremental BASE_BACKUP")));
1027 :
1028 : /*
1029 : * If the target is specifically 'client' then set up to stream the backup
1030 : * to the client; otherwise, it's being sent someplace else and should not
1031 : * be sent to the client. BaseBackupGetSink has the job of setting up a
1032 : * sink to send the backup data wherever it needs to go.
1033 : */
1034 318 : sink = bbsink_copystream_new(opt.send_to_client);
1035 318 : if (opt.target_handle != NULL)
1036 22 : sink = BaseBackupGetSink(opt.target_handle, sink);
1037 :
1038 : /* Set up network throttling, if client requested it */
1039 312 : if (opt.maxrate > 0)
1040 4 : sink = bbsink_throttle_new(sink, opt.maxrate);
1041 :
1042 : /* Set up server-side compression, if client requested it */
1043 312 : if (opt.compression == PG_COMPRESSION_GZIP)
1044 4 : sink = bbsink_gzip_new(sink, &opt.compression_specification);
1045 308 : else if (opt.compression == PG_COMPRESSION_LZ4)
1046 4 : sink = bbsink_lz4_new(sink, &opt.compression_specification);
1047 304 : else if (opt.compression == PG_COMPRESSION_ZSTD)
1048 0 : sink = bbsink_zstd_new(sink, &opt.compression_specification);
1049 :
1050 : /* Set up progress reporting. */
1051 312 : sink = bbsink_progress_new(sink, opt.progress);
1052 :
1053 : /*
1054 : * Perform the base backup, but make sure we clean up the bbsink even if
1055 : * an error occurs.
1056 : */
1057 312 : PG_TRY();
1058 : {
1059 312 : perform_base_backup(&opt, sink, ib);
1060 : }
1061 10 : PG_FINALLY();
1062 : {
1063 302 : bbsink_cleanup(sink);
1064 : }
1065 302 : PG_END_TRY();
1066 292 : }
1067 :
1068 : /*
1069 : * Inject a file with given name and content in the output tar stream.
1070 : *
1071 : * "len" can optionally be set to an arbitrary length of data sent. If set
1072 : * to -1, the content sent is treated as a string with strlen() as length.
1073 : */
1074 : static void
1075 388 : sendFileWithContent(bbsink *sink, const char *filename, const char *content,
1076 : int len, backup_manifest_info *manifest)
1077 : {
1078 : struct stat statbuf;
1079 388 : int bytes_done = 0;
1080 : pg_checksum_context checksum_ctx;
1081 :
1082 388 : if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1083 0 : elog(ERROR, "could not initialize checksum of file \"%s\"",
1084 : filename);
1085 :
1086 388 : if (len < 0)
1087 388 : len = strlen(content);
1088 :
1089 : /*
1090 : * Construct a stat struct for the file we're injecting in the tar.
1091 : */
1092 :
1093 : /* Windows doesn't have the concept of uid and gid */
1094 : #ifdef WIN32
1095 : statbuf.st_uid = 0;
1096 : statbuf.st_gid = 0;
1097 : #else
1098 388 : statbuf.st_uid = geteuid();
1099 388 : statbuf.st_gid = getegid();
1100 : #endif
1101 388 : statbuf.st_mtime = time(NULL);
1102 388 : statbuf.st_mode = pg_file_create_mode;
1103 388 : statbuf.st_size = len;
1104 :
1105 388 : _tarWriteHeader(sink, filename, NULL, &statbuf, false);
1106 :
1107 388 : if (pg_checksum_update(&checksum_ctx, (uint8 *) content, len) < 0)
1108 0 : elog(ERROR, "could not update checksum of file \"%s\"",
1109 : filename);
1110 :
1111 710 : while (bytes_done < len)
1112 : {
1113 322 : size_t remaining = len - bytes_done;
1114 322 : size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1115 :
1116 322 : memcpy(sink->bbs_buffer, content, nbytes);
1117 322 : bbsink_archive_contents(sink, nbytes);
1118 322 : bytes_done += nbytes;
1119 322 : content += nbytes;
1120 : }
1121 :
1122 388 : _tarWritePadding(sink, len);
1123 :
1124 388 : AddFileToBackupManifest(manifest, InvalidOid, filename, len,
1125 388 : (pg_time_t) statbuf.st_mtime, &checksum_ctx);
1126 388 : }
1127 :
1128 : /*
1129 : * Include the tablespace directory pointed to by 'path' in the output tar
1130 : * stream. If 'sizeonly' is true, we just calculate a total length and return
1131 : * it, without actually sending anything.
1132 : *
1133 : * Only used to send auxiliary tablespaces, not PGDATA.
1134 : */
1135 : static int64
1136 144 : sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly,
1137 : backup_manifest_info *manifest, IncrementalBackupInfo *ib)
1138 : {
1139 : int64 size;
1140 : char pathbuf[MAXPGPATH];
1141 : struct stat statbuf;
1142 :
1143 : /*
1144 : * 'path' points to the tablespace location, but we only want to include
1145 : * the version directory in it that belongs to us.
1146 : */
1147 144 : snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
1148 : TABLESPACE_VERSION_DIRECTORY);
1149 :
1150 : /*
1151 : * Store a directory entry in the tar file so we get the permissions
1152 : * right.
1153 : */
1154 144 : if (lstat(pathbuf, &statbuf) != 0)
1155 : {
1156 0 : if (errno != ENOENT)
1157 0 : ereport(ERROR,
1158 : (errcode_for_file_access(),
1159 : errmsg("could not stat file or directory \"%s\": %m",
1160 : pathbuf)));
1161 :
1162 : /* If the tablespace went away while scanning, it's no error. */
1163 0 : return 0;
1164 : }
1165 :
1166 144 : size = _tarWriteHeader(sink, TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
1167 : sizeonly);
1168 :
1169 : /* Send all the files in the tablespace version directory */
1170 144 : size += sendDir(sink, pathbuf, strlen(path), sizeonly, NIL, true, manifest,
1171 : spcoid, ib);
1172 :
1173 144 : return size;
1174 : }
1175 :
/*
 * Include all files from the given directory in the output tar stream. If
 * 'sizeonly' is true, we just calculate a total length and return it, without
 * actually sending anything.
 *
 * 'path' is the directory to scan.  The tar member name of each entry is
 * formed by skipping the first 'basepathlen' + 1 characters of its full
 * path.
 *
 * Omit any directory in the tablespaces list, to avoid backing up
 * tablespaces twice when they were created inside PGDATA.
 *
 * If sendtblspclinks is true, we need to include symlink
 * information in the tar file. If not, we can skip that
 * as it will be sent separately in the tablespace_map file.
 *
 * 'manifest' and 'spcoid' are handed on to sendFile() and to recursive
 * calls.  'ib' is NULL unless this is an incremental backup, in which case
 * relation files are checked with GetFileBackupMethod() to decide whether
 * they are sent in full or incrementally.
 *
 * Returns the accumulated size of everything sent (or, with 'sizeonly',
 * everything that would be sent), including headers and padding.
 */
static int64
sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
		List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest,
		Oid spcoid, IncrementalBackupInfo *ib)
{
	DIR		   *dir;
	struct dirent *de;
	char		pathbuf[MAXPGPATH * 2];
	struct stat statbuf;
	int64		size = 0;
	const char *lastDir;		/* Split last dir from parent path. */
	bool		isRelationDir = false;	/* Does directory contain relations? */
	bool		isGlobalDir = false;
	Oid			dboid = InvalidOid;
	BlockNumber *relative_block_numbers = NULL;

	/*
	 * Since this array is relatively large, avoid putting it on the stack.
	 * But we don't need it at all if this is not an incremental backup.
	 */
	if (ib != NULL)
		relative_block_numbers = palloc(sizeof(BlockNumber) * RELSEG_SIZE);

	/*
	 * Determine if the current path is a database directory that can contain
	 * relations.
	 *
	 * Start by finding the location of the delimiter between the parent path
	 * and the current path.
	 */
	lastDir = last_dir_separator(path);

	/* Does this path look like a database path (i.e. all digits)? */
	if (lastDir != NULL &&
		strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
	{
		/* Part of path that contains the parent directory. */
		int			parentPathLen = lastDir - path;

		/*
		 * Mark path as a database directory if the parent path is either
		 * $PGDATA/base or a tablespace version path.
		 */
		if (strncmp(path, "./base", parentPathLen) == 0 ||
			(parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
			 strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
					 TABLESPACE_VERSION_DIRECTORY,
					 sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
		{
			isRelationDir = true;
			/* The last path component is the database OID. */
			dboid = atooid(lastDir + 1);
		}
	}
	else if (strcmp(path, "./global") == 0)
	{
		/* The global directory holds relations but has no database OID. */
		isRelationDir = true;
		isGlobalDir = true;
	}

	dir = AllocateDir(path);
	while ((de = ReadDir(dir, path)) != NULL)
	{
		int			excludeIdx;
		bool		excludeFound;
		RelFileNumber relfilenumber = InvalidRelFileNumber;
		ForkNumber	relForkNum = InvalidForkNumber;
		unsigned	segno = 0;
		bool		isRelationFile = false;

		/* Skip special stuff */
		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
			continue;

		/* Skip temporary files */
		if (strncmp(de->d_name,
					PG_TEMP_FILE_PREFIX,
					strlen(PG_TEMP_FILE_PREFIX)) == 0)
			continue;

		/* Skip macOS system files */
		if (strcmp(de->d_name, ".DS_Store") == 0)
			continue;

		/*
		 * Check if the postmaster has signaled us to exit, and abort with an
		 * error in that case. The error handler further up will call
		 * do_pg_abort_backup() for us. Also check that if the backup was
		 * started while still in recovery, the server wasn't promoted.
		 * do_pg_backup_stop() will check that too, but it's better to stop
		 * the backup early than continue to the end and fail there.
		 */
		CHECK_FOR_INTERRUPTS();
		if (RecoveryInProgress() != backup_started_in_recovery)
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("the standby was promoted during online backup"),
					 errhint("This means that the backup being taken is corrupt "
							 "and should not be used. "
							 "Try taking another online backup.")));

		/* Scan for files that should be excluded */
		excludeFound = false;
		for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++)
		{
			int			cmplen = strlen(excludeFiles[excludeIdx].name);

			/*
			 * For a non-prefix entry, extend the comparison by one byte so
			 * that the terminating NUL is compared too, which makes the
			 * strncmp() below an exact-name match.
			 */
			if (!excludeFiles[excludeIdx].match_prefix)
				cmplen++;
			if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0)
			{
				elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
				excludeFound = true;
				break;
			}
		}

		if (excludeFound)
			continue;

		/*
		 * If there could be non-temporary relation files in this directory,
		 * try to parse the filename.
		 */
		if (isRelationDir)
			isRelationFile =
				parse_filename_for_nontemp_relation(de->d_name,
													&relfilenumber,
													&relForkNum, &segno);

		/* Exclude all forks for unlogged tables except the init fork */
		if (isRelationFile && relForkNum != INIT_FORKNUM)
		{
			char		initForkFile[MAXPGPATH];

			/*
			 * If any other type of fork, check if there is an init fork with
			 * the same RelFileNumber. If so, the file can be excluded.
			 */
			snprintf(initForkFile, sizeof(initForkFile), "%s/%u_init",
					 path, relfilenumber);

			/* statbuf is only scratch space here; existence is what counts. */
			if (lstat(initForkFile, &statbuf) == 0)
			{
				elog(DEBUG2,
					 "unlogged relation file \"%s\" excluded from backup",
					 de->d_name);

				continue;
			}
		}

		/* Exclude temporary relations */
		if (OidIsValid(dboid) && looks_like_temp_rel_name(de->d_name))
		{
			elog(DEBUG2,
				 "temporary relation file \"%s\" excluded from backup",
				 de->d_name);

			continue;
		}

		/* Build the full path of this entry; used for everything below. */
		snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);

		/* Skip pg_control here so that it can be backed up last */
		if (strcmp(pathbuf, "./global/pg_control") == 0)
			continue;

		if (lstat(pathbuf, &statbuf) != 0)
		{
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file or directory \"%s\": %m",
								pathbuf)));

			/* If the file went away while scanning, it's not an error. */
			continue;
		}

		/*
		 * Scan for directories whose contents should be excluded.  The
		 * directory entry itself is still written, just without recursing
		 * into it.
		 */
		excludeFound = false;
		for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
		{
			if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
			{
				elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
				convert_link_to_directory(pathbuf, &statbuf);
				size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
										&statbuf, sizeonly);
				excludeFound = true;
				break;
			}
		}

		if (excludeFound)
			continue;

		/*
		 * We can skip pg_wal, the WAL segments need to be fetched from the
		 * WAL archive anyway. But include it as an empty directory anyway, so
		 * we get permissions right.
		 */
		if (strcmp(pathbuf, "./pg_wal") == 0)
		{
			/* If pg_wal is a symlink, write it as a directory anyway */
			convert_link_to_directory(pathbuf, &statbuf);
			size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
									&statbuf, sizeonly);

			/*
			 * Also send archive_status and summaries directories (by
			 * hackishly reusing statbuf from above ...).
			 */
			size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL,
									&statbuf, sizeonly);
			size += _tarWriteHeader(sink, "./pg_wal/summaries", NULL,
									&statbuf, sizeonly);

			continue;			/* don't recurse into pg_wal */
		}

		/* Allow symbolic links in pg_tblspc only */
		if (strcmp(path, "./pg_tblspc") == 0 && S_ISLNK(statbuf.st_mode))
		{
			char		linkpath[MAXPGPATH];
			int			rllen;

			rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
			if (rllen < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read symbolic link \"%s\": %m",
								pathbuf)));
			/* A completely filled buffer leaves no room for the terminator. */
			if (rllen >= sizeof(linkpath))
				ereport(ERROR,
						(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
						 errmsg("symbolic link \"%s\" target is too long",
								pathbuf)));
			/* readlink() does not NUL-terminate its result. */
			linkpath[rllen] = '\0';

			size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, linkpath,
									&statbuf, sizeonly);
		}
		else if (S_ISDIR(statbuf.st_mode))
		{
			bool		skip_this_dir = false;
			ListCell   *lc;

			/*
			 * Store a directory entry in the tar file so we can get the
			 * permissions right.
			 */
			size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL, &statbuf,
									sizeonly);

			/*
			 * Call ourselves recursively for a directory, unless it happens
			 * to be a separate tablespace located within PGDATA.
			 */
			foreach(lc, tablespaces)
			{
				tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);

				/*
				 * ti->rpath is the tablespace relative path within PGDATA, or
				 * NULL if the tablespace has been properly located somewhere
				 * else.
				 *
				 * Skip past the leading "./" in pathbuf when comparing.
				 */
				if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
				{
					skip_this_dir = true;
					break;
				}
			}

			/*
			 * skip sending directories inside pg_tblspc, if not required.
			 */
			if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
				skip_this_dir = true;

			if (!skip_this_dir)
				size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces,
								sendtblspclinks, manifest, spcoid, ib);
		}
		else if (S_ISREG(statbuf.st_mode))
		{
			bool		sent = false;
			unsigned	num_blocks_required = 0;
			unsigned	truncation_block_length = 0;
			char		tarfilenamebuf[MAXPGPATH * 2];
			char	   *tarfilename = pathbuf + basepathlen + 1;
			FileBackupMethod method = BACK_UP_FILE_FULLY;

			/*
			 * For an incremental backup, ask whether this relation file is
			 * to be sent in full or incrementally.
			 */
			if (ib != NULL && isRelationFile)
			{
				Oid			relspcoid;
				char	   *lookup_path;

				if (OidIsValid(spcoid))
				{
					/* File lives in a tablespace other than PGDATA's own. */
					relspcoid = spcoid;
					lookup_path = psprintf("%s/%u/%s", PG_TBLSPC_DIR, spcoid,
										   tarfilename);
				}
				else
				{
					if (isGlobalDir)
						relspcoid = GLOBALTABLESPACE_OID;
					else
						relspcoid = DEFAULTTABLESPACE_OID;
					lookup_path = pstrdup(tarfilename);
				}

				method = GetFileBackupMethod(ib, lookup_path, dboid, relspcoid,
											 relfilenumber, relForkNum,
											 segno, statbuf.st_size,
											 &num_blocks_required,
											 relative_block_numbers,
											 &truncation_block_length);
				if (method == BACK_UP_FILE_INCREMENTALLY)
				{
					/*
					 * The tar member is the incremental file, not the
					 * relation file itself: replace the size with that of
					 * the incremental file and prefix the file name with
					 * "INCREMENTAL.".
					 */
					statbuf.st_size =
						GetIncrementalFileSize(num_blocks_required);
					snprintf(tarfilenamebuf, sizeof(tarfilenamebuf),
							 "%s/INCREMENTAL.%s",
							 path + basepathlen + 1,
							 de->d_name);
					tarfilename = tarfilenamebuf;
				}

				pfree(lookup_path);
			}

			/*
			 * Pass the block array only when sending incrementally; a NULL
			 * array tells sendFile() to send the whole file.
			 */
			if (!sizeonly)
				sent = sendFile(sink, pathbuf, tarfilename, &statbuf,
								true, dboid, spcoid,
								relfilenumber, segno, manifest,
								num_blocks_required,
								method == BACK_UP_FILE_INCREMENTALLY ? relative_block_numbers : NULL,
								truncation_block_length);

			if (sent || sizeonly)
			{
				/* Add size. */
				size += statbuf.st_size;

				/* Pad to a multiple of the tar block size. */
				size += tarPaddingBytesRequired(statbuf.st_size);

				/* Size of the header for the file. */
				size += TAR_BLOCK_SIZE;
			}
		}
		else
			ereport(WARNING,
					(errmsg("skipping special file \"%s\"", pathbuf)));
	}

	/* Release the incremental-backup scratch array, if we allocated one. */
	if (relative_block_numbers != NULL)
		pfree(relative_block_numbers);

	FreeDir(dir);
	return size;
}
1555 :
1556 : /*
1557 : * Given the member, write the TAR header & send the file.
1558 : *
1559 : * If 'missing_ok' is true, will not throw an error if the file is not found.
1560 : *
1561 : * If dboid is anything other than InvalidOid then any checksum failures
1562 : * detected will get reported to the cumulative stats system.
1563 : *
1564 : * If the file is to be sent incrementally, then num_incremental_blocks
1565 : * should be the number of blocks to be sent, and incremental_blocks
1566 : * an array of block numbers relative to the start of the current segment.
1567 : * If the whole file is to be sent, then incremental_blocks should be NULL,
1568 : * and num_incremental_blocks can have any value, as it will be ignored.
1569 : *
1570 : * Returns true if the file was successfully sent, false if 'missing_ok',
1571 : * and the file did not exist.
1572 : */
1573 : static bool
1574 299678 : sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1575 : struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid,
1576 : RelFileNumber relfilenumber, unsigned segno,
1577 : backup_manifest_info *manifest, unsigned num_incremental_blocks,
1578 : BlockNumber *incremental_blocks, unsigned truncation_block_length)
1579 : {
1580 : int fd;
1581 299678 : BlockNumber blkno = 0;
1582 299678 : int checksum_failures = 0;
1583 : off_t cnt;
1584 299678 : pgoff_t bytes_done = 0;
1585 299678 : bool verify_checksum = false;
1586 : pg_checksum_context checksum_ctx;
1587 299678 : int ibindex = 0;
1588 :
1589 299678 : if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1590 0 : elog(ERROR, "could not initialize checksum of file \"%s\"",
1591 : readfilename);
1592 :
1593 299678 : fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
1594 299678 : if (fd < 0)
1595 : {
1596 0 : if (errno == ENOENT && missing_ok)
1597 0 : return false;
1598 0 : ereport(ERROR,
1599 : (errcode_for_file_access(),
1600 : errmsg("could not open file \"%s\": %m", readfilename)));
1601 : }
1602 :
1603 299678 : _tarWriteHeader(sink, tarfilename, NULL, statbuf, false);
1604 :
1605 : /*
1606 : * Checksums are verified in multiples of BLCKSZ, so the buffer length
1607 : * should be a multiple of the block size as well.
1608 : */
1609 : Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
1610 :
1611 : /*
1612 : * If we weren't told not to verify checksums, and if checksums are
1613 : * enabled for this cluster, and if this is a relation file, then verify
1614 : * the checksum.
1615 : */
1616 299676 : if (!noverify_checksums && DataChecksumsEnabled() &&
1617 : RelFileNumberIsValid(relfilenumber))
1618 292434 : verify_checksum = true;
1619 :
1620 : /*
1621 : * If we're sending an incremental file, write the file header.
1622 : */
1623 299676 : if (incremental_blocks != NULL)
1624 : {
1625 12002 : unsigned magic = INCREMENTAL_MAGIC;
1626 12002 : size_t header_bytes_done = 0;
1627 : char padding[BLCKSZ];
1628 : size_t paddinglen;
1629 :
1630 : /* Emit header data. */
1631 12002 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1632 : &magic, sizeof(magic));
1633 12002 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1634 : &num_incremental_blocks, sizeof(num_incremental_blocks));
1635 12002 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1636 : &truncation_block_length, sizeof(truncation_block_length));
1637 12002 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1638 : incremental_blocks,
1639 : sizeof(BlockNumber) * num_incremental_blocks);
1640 :
1641 : /*
1642 : * Add padding to align header to a multiple of BLCKSZ, but only if
1643 : * the incremental file has some blocks, and the alignment is actually
1644 : * needed (i.e. header is not already a multiple of BLCKSZ). If there
1645 : * are no blocks we don't want to make the file unnecessarily large,
1646 : * as that might make some filesystem optimizations impossible.
1647 : */
1648 12002 : if ((num_incremental_blocks > 0) && (header_bytes_done % BLCKSZ != 0))
1649 : {
1650 46 : paddinglen = (BLCKSZ - (header_bytes_done % BLCKSZ));
1651 :
1652 46 : memset(padding, 0, paddinglen);
1653 46 : bytes_done += paddinglen;
1654 :
1655 46 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1656 : padding, paddinglen);
1657 : }
1658 :
1659 : /* Flush out any data still in the buffer so it's again empty. */
1660 12002 : if (header_bytes_done > 0)
1661 : {
1662 12002 : bbsink_archive_contents(sink, header_bytes_done);
1663 12002 : if (pg_checksum_update(&checksum_ctx,
1664 12002 : (uint8 *) sink->bbs_buffer,
1665 : header_bytes_done) < 0)
1666 0 : elog(ERROR, "could not update checksum of base backup");
1667 : }
1668 :
1669 : /* Update our notion of file position. */
1670 12002 : bytes_done += sizeof(magic);
1671 12002 : bytes_done += sizeof(num_incremental_blocks);
1672 12002 : bytes_done += sizeof(truncation_block_length);
1673 12002 : bytes_done += sizeof(BlockNumber) * num_incremental_blocks;
1674 : }
1675 :
1676 : /*
1677 : * Loop until we read the amount of data the caller told us to expect. The
1678 : * file could be longer, if it was extended while we were sending it, but
1679 : * for a base backup we can ignore such extended data. It will be restored
1680 : * from WAL.
1681 : */
1682 : while (1)
1683 : {
1684 : /*
1685 : * Determine whether we've read all the data that we need, and if not,
1686 : * read some more.
1687 : */
1688 644964 : if (incremental_blocks == NULL)
1689 : {
1690 632886 : size_t remaining = statbuf->st_size - bytes_done;
1691 :
1692 : /*
1693 : * If we've read the required number of bytes, then it's time to
1694 : * stop.
1695 : */
1696 632886 : if (bytes_done >= statbuf->st_size)
1697 287674 : break;
1698 :
1699 : /*
1700 : * Read as many bytes as will fit in the buffer, or however many
1701 : * are left to read, whichever is less.
1702 : */
1703 345212 : cnt = read_file_data_into_buffer(sink, readfilename, fd,
1704 : bytes_done, remaining,
1705 345212 : blkno + segno * RELSEG_SIZE,
1706 : verify_checksum,
1707 : &checksum_failures);
1708 : }
1709 : else
1710 : {
1711 : BlockNumber relative_blkno;
1712 :
1713 : /*
1714 : * If we've read all the blocks, then it's time to stop.
1715 : */
1716 12078 : if (ibindex >= num_incremental_blocks)
1717 12002 : break;
1718 :
1719 : /*
1720 : * Read just one block, whichever one is the next that we're
1721 : * supposed to include.
1722 : */
1723 76 : relative_blkno = incremental_blocks[ibindex++];
1724 76 : cnt = read_file_data_into_buffer(sink, readfilename, fd,
1725 76 : relative_blkno * BLCKSZ,
1726 : BLCKSZ,
1727 76 : relative_blkno + segno * RELSEG_SIZE,
1728 : verify_checksum,
1729 : &checksum_failures);
1730 :
1731 : /*
1732 : * If we get a partial read, that must mean that the relation is
1733 : * being truncated. Ultimately, it should be truncated to a
1734 : * multiple of BLCKSZ, since this path should only be reached for
1735 : * relation files, but we might transiently observe an
1736 : * intermediate value.
1737 : *
1738 : * It should be fine to treat this just as if the entire block had
1739 : * been truncated away - i.e. fill this and all later blocks with
1740 : * zeroes. WAL replay will fix things up.
1741 : */
1742 76 : if (cnt < BLCKSZ)
1743 0 : break;
1744 : }
1745 :
1746 : /*
1747 : * If the amount of data we were able to read was not a multiple of
1748 : * BLCKSZ, we cannot verify checksums, which are block-level.
1749 : */
1750 345288 : if (verify_checksum && (cnt % BLCKSZ != 0))
1751 : {
1752 0 : ereport(WARNING,
1753 : (errmsg("could not verify checksum in file \"%s\", block "
1754 : "%u: read buffer size %d and page size %d "
1755 : "differ",
1756 : readfilename, blkno, (int) cnt, BLCKSZ)));
1757 0 : verify_checksum = false;
1758 : }
1759 :
1760 : /*
1761 : * If we hit end-of-file, a concurrent truncation must have occurred.
1762 : * That's not an error condition, because WAL replay will fix things
1763 : * up.
1764 : */
1765 345288 : if (cnt == 0)
1766 0 : break;
1767 :
1768 : /* Update block number and # of bytes done for next loop iteration. */
1769 345288 : blkno += cnt / BLCKSZ;
1770 345288 : bytes_done += cnt;
1771 :
1772 : /*
1773 : * Make sure incremental files with block data are properly aligned
1774 : * (header is a multiple of BLCKSZ, blocks are BLCKSZ too).
1775 : */
1776 : Assert(!((incremental_blocks != NULL && num_incremental_blocks > 0) &&
1777 : (bytes_done % BLCKSZ != 0)));
1778 :
1779 : /* Archive the data we just read. */
1780 345288 : bbsink_archive_contents(sink, cnt);
1781 :
1782 : /* Also feed it to the checksum machinery. */
1783 345288 : if (pg_checksum_update(&checksum_ctx,
1784 345288 : (uint8 *) sink->bbs_buffer, cnt) < 0)
1785 0 : elog(ERROR, "could not update checksum of base backup");
1786 : }
1787 :
1788 : /* If the file was truncated while we were sending it, pad it with zeros */
1789 299676 : while (bytes_done < statbuf->st_size)
1790 : {
1791 0 : size_t remaining = statbuf->st_size - bytes_done;
1792 0 : size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1793 :
1794 0 : MemSet(sink->bbs_buffer, 0, nbytes);
1795 0 : if (pg_checksum_update(&checksum_ctx,
1796 0 : (uint8 *) sink->bbs_buffer,
1797 : nbytes) < 0)
1798 0 : elog(ERROR, "could not update checksum of base backup");
1799 0 : bbsink_archive_contents(sink, nbytes);
1800 0 : bytes_done += nbytes;
1801 : }
1802 :
1803 : /*
1804 : * Pad to a block boundary, per tar format requirements. (This small piece
1805 : * of data is probably not worth throttling, and is not checksummed
1806 : * because it's not actually part of the file.)
1807 : */
1808 299676 : _tarWritePadding(sink, bytes_done);
1809 :
1810 299676 : CloseTransientFile(fd);
1811 :
1812 299676 : if (checksum_failures > 1)
1813 : {
1814 4 : ereport(WARNING,
1815 : (errmsg_plural("file \"%s\" has a total of %d checksum verification failure",
1816 : "file \"%s\" has a total of %d checksum verification failures",
1817 : checksum_failures,
1818 : readfilename, checksum_failures)));
1819 :
1820 4 : pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
1821 : }
1822 :
1823 299676 : total_checksum_failures += checksum_failures;
1824 :
1825 299676 : AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size,
1826 299676 : (pg_time_t) statbuf->st_mtime, &checksum_ctx);
1827 :
1828 299676 : return true;
1829 : }
1830 :
1831 : /*
1832 : * Read some more data from the file into the bbsink's buffer, verifying
1833 : * checksums as required.
1834 : *
1835 : * 'offset' is the file offset from which we should begin to read, and
1836 : * 'length' is the amount of data that should be read. The actual amount
1837 : * of data read will be less than the requested amount if the bbsink's
1838 : * buffer isn't big enough to hold it all, or if the underlying file has
1839 : * been truncated. The return value is the number of bytes actually read.
1840 : *
1841 : * 'blkno' is the block number of the first page in the bbsink's buffer
1842 : * relative to the start of the relation.
1843 : *
1844 : * 'verify_checksum' indicates whether we should try to verify checksums
1845 : * for the blocks we read. If we do this, we'll update *checksum_failures
1846 : * and issue warnings as appropriate.
1847 : */
1848 : static off_t
1849 345288 : read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
1850 : off_t offset, size_t length, BlockNumber blkno,
1851 : bool verify_checksum, int *checksum_failures)
1852 : {
1853 : off_t cnt;
1854 : int i;
1855 : char *page;
1856 :
1857 : /* Try to read some more data. */
1858 345288 : cnt = basebackup_read_file(fd, sink->bbs_buffer,
1859 345288 : Min(sink->bbs_buffer_length, length),
1860 : offset, readfilename, true);
1861 :
1862 : /* Can't verify checksums if read length is not a multiple of BLCKSZ. */
1863 345288 : if (!verify_checksum || (cnt % BLCKSZ) != 0)
1864 7680 : return cnt;
1865 :
1866 : /* Verify checksum for each block. */
1867 1170824 : for (i = 0; i < cnt / BLCKSZ; i++)
1868 : {
1869 : int reread_cnt;
1870 : uint16 expected_checksum;
1871 :
1872 833216 : page = sink->bbs_buffer + BLCKSZ * i;
1873 :
1874 : /* If the page is OK, go on to the next one. */
1875 833216 : if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1876 : &expected_checksum))
1877 833188 : continue;
1878 :
1879 : /*
1880 : * Retry the block on the first failure. It's possible that we read
1881 : * the first 4K page of the block just before postgres updated the
1882 : * entire block so it ends up looking torn to us. If, before we retry
1883 : * the read, the concurrent write of the block finishes, the page LSN
1884 : * will be updated and we'll realize that we should ignore this block.
1885 : *
1886 : * There's no guarantee that this will actually happen, though: the
1887 : * torn write could take an arbitrarily long time to complete.
1888 : * Retrying multiple times wouldn't fix this problem, either, though
1889 : * it would reduce the chances of it happening in practice. The only
1890 : * real fix here seems to be to have some kind of interlock that
1891 : * allows us to wait until we can be certain that no write to the
1892 : * block is in progress. Since we don't have any such thing right now,
1893 : * we just do this and hope for the best.
1894 : */
1895 28 : reread_cnt =
1896 28 : basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i,
1897 28 : BLCKSZ, offset + BLCKSZ * i,
1898 : readfilename, false);
1899 28 : if (reread_cnt == 0)
1900 : {
1901 : /*
1902 : * If we hit end-of-file, a concurrent truncation must have
1903 : * occurred, so reduce cnt to reflect only the blocks already
1904 : * processed and break out of this loop.
1905 : */
1906 0 : cnt = BLCKSZ * i;
1907 0 : break;
1908 : }
1909 :
1910 : /* If the page now looks OK, go on to the next one. */
1911 28 : if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1912 : &expected_checksum))
1913 0 : continue;
1914 :
1915 : /* Handle checksum failure. */
1916 28 : (*checksum_failures)++;
1917 28 : if (*checksum_failures <= 5)
1918 24 : ereport(WARNING,
1919 : (errmsg("checksum verification failed in "
1920 : "file \"%s\", block %u: calculated "
1921 : "%X but expected %X",
1922 : readfilename, blkno + i, expected_checksum,
1923 : ((PageHeader) page)->pd_checksum)));
1924 28 : if (*checksum_failures == 5)
1925 4 : ereport(WARNING,
1926 : (errmsg("further checksum verification "
1927 : "failures in file \"%s\" will not "
1928 : "be reported", readfilename)));
1929 : }
1930 :
1931 337608 : return cnt;
1932 : }
1933 :
1934 : /*
1935 : * Push data into a bbsink.
1936 : *
1937 : * It's better, when possible, to read data directly into the bbsink's buffer,
1938 : * rather than using this function to copy it into the buffer; this function is
1939 : * for cases where that approach is not practical.
1940 : *
1941 : * bytes_done should point to a count of the number of bytes that are
1942 : * currently used in the bbsink's buffer. Upon return, the bytes identified by
1943 : * data and length will have been copied into the bbsink's buffer, flushing
1944 : * as required, and *bytes_done will have been updated accordingly. If the
1945 : * buffer was flushed, the previous contents will also have been fed to
1946 : * checksum_ctx.
1947 : *
1948 : * Note that after one or more calls to this function it is the caller's
1949 : * responsibility to perform any required final flush.
1950 : */
1951 : static void
1952 48054 : push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx,
1953 : size_t *bytes_done, void *data, size_t length)
1954 : {
1955 48054 : while (length > 0)
1956 : {
1957 : size_t bytes_to_copy;
1958 :
1959 : /*
1960 : * We use < here rather than <= so that if the data exactly fills the
1961 : * remaining buffer space, we trigger a flush now.
1962 : */
1963 36098 : if (length < sink->bbs_buffer_length - *bytes_done)
1964 : {
1965 : /* Append remaining data to buffer. */
1966 36098 : memcpy(sink->bbs_buffer + *bytes_done, data, length);
1967 36098 : *bytes_done += length;
1968 36098 : return;
1969 : }
1970 :
1971 : /* Copy until buffer is full and flush it. */
1972 0 : bytes_to_copy = sink->bbs_buffer_length - *bytes_done;
1973 0 : memcpy(sink->bbs_buffer + *bytes_done, data, bytes_to_copy);
1974 0 : data = ((char *) data) + bytes_to_copy;
1975 0 : length -= bytes_to_copy;
1976 0 : bbsink_archive_contents(sink, sink->bbs_buffer_length);
1977 0 : if (pg_checksum_update(checksum_ctx, (uint8 *) sink->bbs_buffer,
1978 : sink->bbs_buffer_length) < 0)
1979 0 : elog(ERROR, "could not update checksum");
1980 0 : *bytes_done = 0;
1981 : }
1982 : }
1983 :
1984 : /*
1985 : * Try to verify the checksum for the provided page, if it seems appropriate
1986 : * to do so.
1987 : *
1988 : * Returns true if verification succeeds or if we decide not to check it,
1989 : * and false if verification fails. When return false, it also sets
1990 : * *expected_checksum to the computed value.
1991 : */
1992 : static bool
1993 833244 : verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno,
1994 : uint16 *expected_checksum)
1995 : {
1996 : PageHeader phdr;
1997 : uint16 checksum;
1998 :
1999 : /*
2000 : * Only check pages which have not been modified since the start of the
2001 : * base backup. Otherwise, they might have been written only halfway and
2002 : * the checksum would not be valid. However, replaying WAL would
2003 : * reinstate the correct page in this case. We also skip completely new
2004 : * pages, since they don't have a checksum yet.
2005 : */
2006 833244 : if (PageIsNew(page) || PageGetLSN(page) >= start_lsn)
2007 2032 : return true;
2008 :
2009 : /* Perform the actual checksum calculation. */
2010 831212 : checksum = pg_checksum_page(page, blkno);
2011 :
2012 : /* See whether it matches the value from the page. */
2013 831212 : phdr = (PageHeader) page;
2014 831212 : if (phdr->pd_checksum == checksum)
2015 831156 : return true;
2016 56 : *expected_checksum = checksum;
2017 56 : return false;
2018 : }
2019 :
2020 : static int64
2021 316522 : _tarWriteHeader(bbsink *sink, const char *filename, const char *linktarget,
2022 : struct stat *statbuf, bool sizeonly)
2023 : {
2024 : enum tarError rc;
2025 :
2026 316522 : if (!sizeonly)
2027 : {
2028 : /*
2029 : * As of this writing, the smallest supported block size is 1kB, which
2030 : * is twice TAR_BLOCK_SIZE. Since the buffer size is required to be a
2031 : * multiple of BLCKSZ, it should be safe to assume that the buffer is
2032 : * large enough to fit an entire tar block. We double-check by means
2033 : * of these assertions.
2034 : */
2035 : StaticAssertDecl(TAR_BLOCK_SIZE <= BLCKSZ,
2036 : "BLCKSZ too small for tar block");
2037 : Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
2038 :
2039 308230 : rc = tarCreateHeader(sink->bbs_buffer, filename, linktarget,
2040 : statbuf->st_size, statbuf->st_mode,
2041 : statbuf->st_uid, statbuf->st_gid,
2042 : statbuf->st_mtime);
2043 :
2044 308230 : switch (rc)
2045 : {
2046 308228 : case TAR_OK:
2047 308228 : break;
2048 2 : case TAR_NAME_TOO_LONG:
2049 2 : ereport(ERROR,
2050 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2051 : errmsg("file name too long for tar format: \"%s\"",
2052 : filename)));
2053 : break;
2054 0 : case TAR_SYMLINK_TOO_LONG:
2055 0 : ereport(ERROR,
2056 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2057 : errmsg("symbolic link target too long for tar format: "
2058 : "file name \"%s\", target \"%s\"",
2059 : filename, linktarget)));
2060 : break;
2061 0 : default:
2062 0 : elog(ERROR, "unrecognized tar error: %d", rc);
2063 : }
2064 :
2065 308228 : bbsink_archive_contents(sink, TAR_BLOCK_SIZE);
2066 : }
2067 :
2068 316518 : return TAR_BLOCK_SIZE;
2069 : }
2070 :
2071 : /*
2072 : * Pad with zero bytes out to a multiple of TAR_BLOCK_SIZE.
2073 : */
2074 : static void
2075 300064 : _tarWritePadding(bbsink *sink, int len)
2076 : {
2077 300064 : int pad = tarPaddingBytesRequired(len);
2078 :
2079 : /*
2080 : * As in _tarWriteHeader, it should be safe to assume that the buffer is
2081 : * large enough that we don't need to do this in multiple chunks.
2082 : */
2083 : Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
2084 : Assert(pad <= TAR_BLOCK_SIZE);
2085 :
2086 300064 : if (pad > 0)
2087 : {
2088 69664 : MemSet(sink->bbs_buffer, 0, pad);
2089 16378 : bbsink_archive_contents(sink, pad);
2090 : }
2091 300064 : }
2092 :
2093 : /*
2094 : * If the entry in statbuf is a link, then adjust statbuf to make it look like a
2095 : * directory, so that it will be written that way.
2096 : */
2097 : static void
2098 4912 : convert_link_to_directory(const char *pathbuf, struct stat *statbuf)
2099 : {
2100 : /* If symlink, write it as a directory anyway */
2101 4912 : if (S_ISLNK(statbuf->st_mode))
2102 136 : statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
2103 4912 : }
2104 :
2105 : /*
2106 : * Read some data from a file, setting a wait event and reporting any error
2107 : * encountered.
2108 : *
2109 : * If partial_read_ok is false, also report an error if the number of bytes
2110 : * read is not equal to the number of bytes requested.
2111 : *
2112 : * Returns the number of bytes read.
2113 : */
2114 : static ssize_t
2115 358628 : basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
2116 : const char *filename, bool partial_read_ok)
2117 : {
2118 : ssize_t rc;
2119 :
2120 358628 : pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ);
2121 358628 : rc = pg_pread(fd, buf, nbytes, offset);
2122 358628 : pgstat_report_wait_end();
2123 :
2124 358628 : if (rc < 0)
2125 0 : ereport(ERROR,
2126 : (errcode_for_file_access(),
2127 : errmsg("could not read file \"%s\": %m", filename)));
2128 358628 : if (!partial_read_ok && rc > 0 && rc != nbytes)
2129 0 : ereport(ERROR,
2130 : (errcode_for_file_access(),
2131 : errmsg("could not read file \"%s\": read %zd of %zu",
2132 : filename, rc, nbytes)));
2133 :
2134 358628 : return rc;
2135 : }
|