Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * basebackup.c
4 : * code for taking a base backup and streaming it to a standby
5 : *
6 : * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/backend/backup/basebackup.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "postgres.h"
14 :
15 : #include <sys/stat.h>
16 : #include <unistd.h>
17 : #include <time.h>
18 :
19 : #include "access/xlog_internal.h"
20 : #include "access/xlogbackup.h"
21 : #include "backup/backup_manifest.h"
22 : #include "backup/basebackup.h"
23 : #include "backup/basebackup_incremental.h"
24 : #include "backup/basebackup_sink.h"
25 : #include "backup/basebackup_target.h"
26 : #include "catalog/pg_tablespace_d.h"
27 : #include "commands/defrem.h"
28 : #include "common/compression.h"
29 : #include "common/file_perm.h"
30 : #include "common/file_utils.h"
31 : #include "lib/stringinfo.h"
32 : #include "miscadmin.h"
33 : #include "nodes/pg_list.h"
34 : #include "pgstat.h"
35 : #include "pgtar.h"
36 : #include "port.h"
37 : #include "postmaster/syslogger.h"
38 : #include "postmaster/walsummarizer.h"
39 : #include "replication/walsender.h"
40 : #include "replication/walsender_private.h"
41 : #include "storage/bufpage.h"
42 : #include "storage/checksum.h"
43 : #include "storage/dsm_impl.h"
44 : #include "storage/ipc.h"
45 : #include "storage/reinit.h"
46 : #include "utils/builtins.h"
47 : #include "utils/guc.h"
48 : #include "utils/ps_status.h"
49 : #include "utils/relcache.h"
50 : #include "utils/resowner.h"
51 :
52 : /*
53 : * How much data do we want to send in one CopyData message? Note that
54 : * this may also result in reading the underlying files in chunks of this
55 : * size.
56 : *
57 : * NB: The buffer size is required to be a multiple of the system block
58 : * size, so use that value instead if it's bigger than our preference.
59 : * perform_base_backup() passes this value to bbsink_begin_backup(). */
60 : #define SINK_BUFFER_LENGTH Max(32768, BLCKSZ)
61 :
62 : typedef struct /* parsed options of one base backup command; filled in by parse_basebackup_options() */
63 : {
64 : const char *label; /* "label" option; "base backup" when not given */
65 : bool progress; /* "progress" option; enables the up-front size estimate */
66 : bool fastcheckpoint; /* "checkpoint" option: true for "fast", false for "spread" */
67 : bool nowait; /* negation of the "wait" option */
68 : bool includewal; /* "wal" option; WAL files are appended to the last archive */
69 : bool incremental; /* "incremental" option; rejected unless summarize_wal is set */
70 : uint32 maxrate; /* "max_rate" option, range-checked by the parser; 0 when not given */
71 : bool sendtblspcmapfile; /* "tablespace_map" option */
72 : bool send_to_client; /* set when no target is given or the target is "client" */
73 : bool use_copytblspc; /* set only when no "target" option is given at all */
74 : BaseBackupTargetHandle *target_handle; /* set for any target other than "client" */
75 : backup_manifest_option manifest; /* "manifest" option; defaults to MANIFEST_OPTION_NO */
76 : pg_compress_algorithm compression; /* "compression" option; defaults to PG_COMPRESSION_NONE */
77 : pg_compress_specification compression_specification; /* parsed from "compression_detail" */
78 : pg_checksum_type manifest_checksum_type; /* "manifest_checksums"; CRC32C by default, NONE without a manifest */
79 : } basebackup_options;
80 :
81 : static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly,
82 : struct backup_manifest_info *manifest,
83 : IncrementalBackupInfo *ib); /* return value is used as the tablespace size when estimating progress */
84 : static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
85 : List *tablespaces, bool sendtblspclinks,
86 : backup_manifest_info *manifest, Oid spcoid,
87 : IncrementalBackupInfo *ib); /* return value is used as the directory size when estimating progress */
88 : static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
89 : struct stat *statbuf, bool missing_ok,
90 : Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
91 : unsigned segno,
92 : backup_manifest_info *manifest,
93 : unsigned num_incremental_blocks,
94 : BlockNumber *incremental_blocks,
95 : unsigned truncation_block_length);
96 : static off_t read_file_data_into_buffer(bbsink *sink,
97 : const char *readfilename, int fd,
98 : off_t offset, size_t length,
99 : BlockNumber blkno,
100 : bool verify_checksum,
101 : int *checksum_failures);
102 : static void push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx,
103 : size_t *bytes_done, void *data, size_t length);
104 : static bool verify_page_checksum(Page page, XLogRecPtr start_lsn,
105 : BlockNumber blkno,
106 : uint16 *expected_checksum);
107 : static void sendFileWithContent(bbsink *sink, const char *filename,
108 : const char *content, int len,
109 : backup_manifest_info *manifest); /* callers below pass len = -1 together with a NUL-terminated string */
110 : static int64 _tarWriteHeader(bbsink *sink, const char *filename,
111 : const char *linktarget, struct stat *statbuf,
112 : bool sizeonly);
113 : static void _tarWritePadding(bbsink *sink, int len);
114 : static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf);
115 : static void perform_base_backup(basebackup_options *opt, bbsink *sink,
116 : IncrementalBackupInfo *ib); /* does the actual backup work once options are parsed */
117 : static void parse_basebackup_options(List *options, basebackup_options *opt); /* fills *opt from the option list */
118 : static int compareWalFileNames(const ListCell *a, const ListCell *b); /* list_sort comparator for WAL file names */
119 : static ssize_t basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
120 : const char *filename, bool partial_read_ok); /* used below to read WAL segment data into the sink buffer */
121 :
122 : /* Was the backup currently in-progress initiated in recovery mode? Set from RecoveryInProgress() at the start of perform_base_backup(). */
123 : static bool backup_started_in_recovery = false;
124 :
125 : /* Total number of checksum failures during base backup. Reset to zero at the start of perform_base_backup(); a nonzero total at the end makes the backup fail with an error. */
126 : static long long int total_checksum_failures;
127 :
128 : /* Do not verify checksums. Holds the negation of the "verify_checksums" option. NOTE(review): the option parser only assigns this when the option is present; confirm whether it is reset anywhere between commands in one session. */
129 : static bool noverify_checksums = false;
130 :
131 : /*
132 : * Definition of one element of an exclusion list, used for paths that are
133 : * part of checksum validation or base backups. "name" is the name of the
134 : * file or path to check for exclusion. If "match_prefix" is true, any items
135 : * matching the name as prefix are excluded.
136 : */
137 : struct exclude_list_item
138 : {
139 : const char *name; /* file or path name to exclude; NULL in the end-of-list entry */
140 : bool match_prefix; /* true: "name" is matched as a prefix */
141 : };
142 :
143 : /*
144 : * The contents of these directories are removed or recreated during server
145 : * start so they are not included in backups. The directories themselves are
146 : * kept and included as empty to preserve access permissions.
147 : *
148 : * Note: this list should be kept in sync with the filter lists in pg_rewind's
149 : * filemap.c. The array is terminated by a NULL entry.
150 : */
151 : static const char *const excludeDirContents[] =
152 : {
153 : /*
154 : * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped
155 : * because extensions like pg_stat_statements store data there.
156 : */
157 : PG_STAT_TMP_DIR,
158 :
159 : /*
160 : * It is generally not useful to backup the contents of this directory
161 : * even if the intention is to restore to another primary. See backup.sgml
162 : * for a more detailed description.
163 : */
164 : "pg_replslot",
165 :
166 : /* Contents removed on startup, see dsm_cleanup_for_mmap(). */
167 : PG_DYNSHMEM_DIR,
168 :
169 : /* Contents removed on startup, see AsyncShmemInit(). */
170 : "pg_notify",
171 :
172 : /*
173 : * Old contents are loaded for possible debugging but are not required for
174 : * normal operation, see SerialInit().
175 : */
176 : "pg_serial",
177 :
178 : /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
179 : "pg_snapshots",
180 :
181 : /* Contents zeroed on startup, see StartupSUBTRANS(). */
182 : "pg_subtrans",
183 :
184 : /* end of list: the NULL sentinel must stay last */
185 : NULL
186 : };
187 :
188 : /*
189 : * List of files excluded from backups. Each entry is a {name, match_prefix} pair; the list ends with a {NULL, false} entry.
190 : */
191 : static const struct exclude_list_item excludeFiles[] =
192 : {
193 : /* Skip auto conf temporary file. */
194 : {PG_AUTOCONF_FILENAME ".tmp", false},
195 :
196 : /* Skip current log file temporary file */
197 : {LOG_METAINFO_DATAFILE_TMP, false},
198 :
199 : /*
200 : * Skip relation cache because it is rebuilt on startup. This includes
201 : * temporary files. (This is the only entry with match_prefix set.)
202 : */
203 : {RELCACHE_INIT_FILENAME, true},
204 :
205 : /*
206 : * backup_label and tablespace_map should not exist in a running cluster
207 : * capable of doing an online backup, but exclude them just in case.
208 : */
209 : {BACKUP_LABEL_FILE, false},
210 : {TABLESPACE_MAP, false},
211 :
212 : /*
213 : * If there's a backup_manifest, it belongs to a backup that was used to
214 : * start this server. It is *not* correct for this backup. Our
215 : * backup_manifest is injected into the backup separately if users want
216 : * it.
217 : */
218 : {"backup_manifest", false},
219 :
220 : {"postmaster.pid", false}, /* NOTE(review): presumably specific to the running server instance; confirm */
221 : {"postmaster.opts", false}, /* NOTE(review): presumably specific to the running server instance; confirm */
222 :
223 : /* end of list: the entry with a NULL name must stay last */
224 : {NULL, false}
225 : };
226 :
227 : /*
228 : * Actually do a base backup for the specified tablespaces. 'opt' holds the parsed options, 'sink' receives the archive data, and 'ib' is non-NULL only for an incremental backup.
229 : *
230 : * This is split out mainly to avoid complaints about "variable might be
231 : * clobbered by longjmp" from stupider versions of gcc.
232 : */
233 : static void
234 278 : perform_base_backup(basebackup_options *opt, bbsink *sink,
235 : IncrementalBackupInfo *ib)
236 : {
237 : bbsink_state state;
238 : XLogRecPtr endptr;
239 : TimeLineID endtli;
240 : backup_manifest_info manifest;
241 : BackupState *backup_state;
242 : StringInfo tablespace_map;
243 :
244 : /* Initial backup state, insofar as we know it now. */
245 278 : state.tablespaces = NIL;
246 278 : state.tablespace_num = 0;
247 278 : state.bytes_done = 0;
248 278 : state.bytes_total = 0;
249 278 : state.bytes_total_is_valid = false;
250 :
251 : /* we're going to use a BufFile, so we need a ResourceOwner */
252 : Assert(CurrentResourceOwner == NULL);
253 278 : CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup");
254 :
255 278 : backup_started_in_recovery = RecoveryInProgress(); /* remembered in a file-level variable */
256 :
257 278 : InitializeBackupManifest(&manifest, opt->manifest,
258 : opt->manifest_checksum_type);
259 :
260 278 : total_checksum_failures = 0; /* start each backup with a fresh failure count */
261 :
262 : /* Allocate backup related variables. */
263 278 : backup_state = (BackupState *) palloc0(sizeof(BackupState));
264 278 : tablespace_map = makeStringInfo();
265 :
266 278 : basebackup_progress_wait_checkpoint();
267 278 : do_pg_backup_start(opt->label, opt->fastcheckpoint, &state.tablespaces,
268 : backup_state, tablespace_map);
269 :
270 278 : state.startptr = backup_state->startpoint;
271 278 : state.starttli = backup_state->starttli;
272 :
273 : /*
274 : * Once do_pg_backup_start has been called, ensure that any failure causes
275 : * us to abort the backup so we don't "leak" a backup counter. For this
276 : * reason, *all* functionality between do_pg_backup_start() and the end of
277 : * do_pg_backup_stop() should be inside the error cleanup block!
278 : */
279 :
280 278 : PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
281 : {
282 : ListCell *lc;
283 : tablespaceinfo *newti;
284 :
285 : /* If this is an incremental backup, execute preparatory steps. */
286 278 : if (ib != NULL)
287 14 : PrepareForIncrementalBackup(ib, backup_state);
288 :
289 : /* Add a node for the base directory at the end */
290 278 : newti = palloc0(sizeof(tablespaceinfo));
291 278 : newti->size = -1; /* path stays NULL from palloc0; a NULL path is how the loops below recognize the base directory */
292 278 : state.tablespaces = lappend(state.tablespaces, newti);
293 :
294 : /*
295 : * Calculate the total backup size by summing up the size of each
296 : * tablespace
297 : */
298 278 : if (opt->progress)
299 : {
300 278 : basebackup_progress_estimate_backup_size();
301 :
302 622 : foreach(lc, state.tablespaces)
303 : {
304 344 : tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);
305 :
306 344 : if (tmp->path == NULL)
307 278 : tmp->size = sendDir(sink, ".", 1, true, state.tablespaces,
308 : true, NULL, InvalidOid, NULL); /* sizeonly = true; no manifest and no incremental info */
309 : else
310 66 : tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true,
311 : NULL, NULL); /* sizeonly = true; no manifest and no incremental info */
312 344 : state.bytes_total += tmp->size;
313 : }
314 278 : state.bytes_total_is_valid = true;
315 : }
316 :
317 : /* notify basebackup sink about start of backup */
318 278 : bbsink_begin_backup(sink, &state, SINK_BUFFER_LENGTH);
319 :
320 : /* Send off our tablespaces one by one */
321 610 : foreach(lc, state.tablespaces)
322 : {
323 344 : tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
324 :
325 344 : if (ti->path == NULL)
326 : {
327 : struct stat statbuf;
328 278 : bool sendtblspclinks = true;
329 : char *backup_label;
330 :
331 278 : bbsink_begin_archive(sink, "base.tar");
332 :
333 : /* In the main tar, include the backup_label first... */
334 278 : backup_label = build_backup_content(backup_state, false);
335 278 : sendFileWithContent(sink, BACKUP_LABEL_FILE,
336 : backup_label, -1, &manifest);
337 278 : pfree(backup_label);
338 :
339 : /* Then the tablespace_map file, if required... */
340 278 : if (opt->sendtblspcmapfile)
341 : {
342 34 : sendFileWithContent(sink, TABLESPACE_MAP,
343 34 : tablespace_map->data, -1, &manifest);
344 34 : sendtblspclinks = false; /* the map file was sent, so sendDir() gets sendtblspclinks = false */
345 : }
346 :
347 : /* Then the bulk of the files... */
348 278 : sendDir(sink, ".", 1, false, state.tablespaces,
349 : sendtblspclinks, &manifest, InvalidOid, ib);
350 :
351 : /* ... and pg_control after everything else. */
352 266 : if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
353 0 : ereport(ERROR,
354 : (errcode_for_file_access(),
355 : errmsg("could not stat file \"%s\": %m",
356 : XLOG_CONTROL_FILE)));
357 266 : sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
358 : false, InvalidOid, InvalidOid,
359 : InvalidRelFileNumber, 0, &manifest, 0, NULL, 0);
360 : }
361 : else
362 : {
363 66 : char *archive_name = psprintf("%u.tar", ti->oid); /* tablespace archives are named after the tablespace OID */
364 :
365 66 : bbsink_begin_archive(sink, archive_name);
366 :
367 66 : sendTablespace(sink, ti->path, ti->oid, false, &manifest, ib);
368 : }
369 :
370 : /*
371 : * If we're including WAL, and this is the main data directory we
372 : * don't treat this as the end of the tablespace. Instead, we will
373 : * include the xlog files below and stop afterwards. This is safe
374 : * since the main data directory is always sent *last*.
375 : */
376 332 : if (opt->includewal && ti->path == NULL)
377 : {
378 : Assert(lnext(state.tablespaces, lc) == NULL); /* archive is left open here; the WAL block further down terminates it */
379 : }
380 : else
381 : {
382 : /* Properly terminate the tarfile. */
383 : StaticAssertDecl(2 * TAR_BLOCK_SIZE <= BLCKSZ,
384 : "BLCKSZ too small for 2 tar blocks");
385 310 : memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE); /* the terminator is two all-zero tar blocks */
386 310 : bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
387 :
388 : /* OK, that's the end of the archive. */
389 310 : bbsink_end_archive(sink);
390 : }
391 : }
392 :
393 266 : basebackup_progress_wait_wal_archive(&state);
394 266 : do_pg_backup_stop(backup_state, !opt->nowait); /* second argument is true unless the "wait" option was given as false */
395 :
396 266 : endptr = backup_state->stoppoint;
397 266 : endtli = backup_state->stoptli;
398 :
399 : /* Deallocate backup-related variables. */
400 266 : destroyStringInfo(tablespace_map);
401 266 : pfree(backup_state);
402 : }
403 268 : PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
404 :
405 :
406 266 : if (opt->includewal)
407 : {
408 : /*
409 : * We've left the last tar file "open", so we can now append the
410 : * required WAL files to it.
411 : */
412 : char pathbuf[MAXPGPATH];
413 : XLogSegNo segno;
414 : XLogSegNo startsegno;
415 : XLogSegNo endsegno;
416 : struct stat statbuf;
417 22 : List *historyFileList = NIL;
418 22 : List *walFileList = NIL;
419 : char firstoff[MAXFNAMELEN];
420 : char lastoff[MAXFNAMELEN];
421 : DIR *dir;
422 : struct dirent *de;
423 : ListCell *lc;
424 : TimeLineID tli;
425 :
426 22 : basebackup_progress_transfer_wal();
427 :
428 : /*
429 : * I'd rather not worry about timelines here, so scan pg_wal and
430 : * include all WAL files in the range between 'startptr' and 'endptr',
431 : * regardless of the timeline the file is stamped with. If there are
432 : * some spurious WAL files belonging to timelines that don't belong in
433 : * this server's history, they will be included too. Normally there
434 : * shouldn't be such files, but if there are, there's little harm in
435 : * including them.
436 : */
437 22 : XLByteToSeg(state.startptr, startsegno, wal_segment_size);
438 22 : XLogFileName(firstoff, state.starttli, startsegno, wal_segment_size);
439 22 : XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
440 22 : XLogFileName(lastoff, endtli, endsegno, wal_segment_size);
441 :
442 22 : dir = AllocateDir("pg_wal");
443 160 : while ((de = ReadDir(dir, "pg_wal")) != NULL)
444 : {
445 : /* Does it look like a WAL segment, and is it in the range? */
446 138 : if (IsXLogFileName(de->d_name) &&
447 50 : strcmp(de->d_name + 8, firstoff + 8) >= 0 && /* + 8 skips the timeline part of the file name */
448 50 : strcmp(de->d_name + 8, lastoff + 8) <= 0)
449 : {
450 22 : walFileList = lappend(walFileList, pstrdup(de->d_name));
451 : }
452 : /* Does it look like a timeline history file? */
453 116 : else if (IsTLHistoryFileName(de->d_name))
454 : {
455 0 : historyFileList = lappend(historyFileList, pstrdup(de->d_name));
456 : }
457 : }
458 22 : FreeDir(dir);
459 :
460 : /*
461 : * Before we go any further, check that none of the WAL segments we
462 : * need were removed.
463 : */
464 22 : CheckXLogRemoved(startsegno, state.starttli);
465 :
466 : /*
467 : * Sort the WAL filenames. We want to send the files in order from
468 : * oldest to newest, to reduce the chance that a file is recycled
469 : * before we get a chance to send it over.
470 : */
471 22 : list_sort(walFileList, compareWalFileNames);
472 :
473 : /*
474 : * There must be at least one xlog file in the pg_wal directory, since
475 : * we are doing backup-including-xlog.
476 : */
477 22 : if (walFileList == NIL)
478 0 : ereport(ERROR,
479 : (errmsg("could not find any WAL files")));
480 :
481 : /*
482 : * Sanity check: the first and last segment should cover startptr and
483 : * endptr, with no gaps in between.
484 : */
485 22 : XLogFromFileName((char *) linitial(walFileList),
486 : &tli, &segno, wal_segment_size);
487 22 : if (segno != startsegno)
488 : {
489 : char startfname[MAXFNAMELEN];
490 :
491 0 : XLogFileName(startfname, state.starttli, startsegno,
492 : wal_segment_size);
493 0 : ereport(ERROR,
494 : (errmsg("could not find WAL file \"%s\"", startfname)));
495 : }
496 44 : foreach(lc, walFileList)
497 : {
498 22 : char *walFileName = (char *) lfirst(lc);
499 22 : XLogSegNo currsegno = segno;
500 22 : XLogSegNo nextsegno = segno + 1;
501 :
502 22 : XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
503 22 : if (!(nextsegno == segno || currsegno == segno)) /* each file must repeat the previous segment number (presumably on another timeline) or be the next one */
504 : {
505 : char nextfname[MAXFNAMELEN];
506 :
507 0 : XLogFileName(nextfname, tli, nextsegno, wal_segment_size);
508 0 : ereport(ERROR,
509 : (errmsg("could not find WAL file \"%s\"", nextfname)));
510 : }
511 : }
512 22 : if (segno != endsegno)
513 : {
514 : char endfname[MAXFNAMELEN];
515 :
516 0 : XLogFileName(endfname, endtli, endsegno, wal_segment_size);
517 0 : ereport(ERROR,
518 : (errmsg("could not find WAL file \"%s\"", endfname)));
519 : }
520 :
521 : /* Ok, we have everything we need. Send the WAL files. */
522 44 : foreach(lc, walFileList)
523 : {
524 22 : char *walFileName = (char *) lfirst(lc);
525 : int fd;
526 : ssize_t cnt;
527 22 : pgoff_t len = 0;
528 :
529 22 : snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName);
530 22 : XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
531 :
532 22 : fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY);
533 22 : if (fd < 0)
534 : {
535 0 : int save_errno = errno;
536 :
537 : /*
538 : * Most likely reason for this is that the file was already
539 : * removed by a checkpoint, so check for that to get a better
540 : * error message.
541 : */
542 0 : CheckXLogRemoved(segno, tli);
543 :
544 0 : errno = save_errno; /* restore errno so that %m below reports the open failure */
545 0 : ereport(ERROR,
546 : (errcode_for_file_access(),
547 : errmsg("could not open file \"%s\": %m", pathbuf)));
548 : }
549 :
550 22 : if (fstat(fd, &statbuf) != 0)
551 0 : ereport(ERROR,
552 : (errcode_for_file_access(),
553 : errmsg("could not stat file \"%s\": %m",
554 : pathbuf)));
555 22 : if (statbuf.st_size != wal_segment_size)
556 : {
557 0 : CheckXLogRemoved(segno, tli);
558 0 : ereport(ERROR,
559 : (errcode_for_file_access(),
560 : errmsg("unexpected WAL file size \"%s\"", walFileName)));
561 : }
562 :
563 : /* send the WAL file itself */
564 22 : _tarWriteHeader(sink, pathbuf, NULL, &statbuf, false);
565 :
566 11286 : while ((cnt = basebackup_read_file(fd, sink->bbs_buffer,
567 11264 : Min(sink->bbs_buffer_length,
568 : wal_segment_size - len),
569 : len, pathbuf, true)) > 0) /* partial_read_ok = true; a short file is caught by the length check after the loop */
570 : {
571 11264 : CheckXLogRemoved(segno, tli);
572 11264 : bbsink_archive_contents(sink, cnt);
573 :
574 11264 : len += cnt;
575 :
576 11264 : if (len == wal_segment_size)
577 22 : break;
578 : }
579 :
580 22 : if (len != wal_segment_size)
581 : {
582 0 : CheckXLogRemoved(segno, tli);
583 0 : ereport(ERROR,
584 : (errcode_for_file_access(),
585 : errmsg("unexpected WAL file size \"%s\"", walFileName)));
586 : }
587 :
588 : /*
589 : * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need
590 : * for padding.
591 : */
592 : Assert(wal_segment_size % TAR_BLOCK_SIZE == 0);
593 :
594 22 : CloseTransientFile(fd);
595 :
596 : /*
597 : * Mark file as archived, otherwise files can get archived again
598 : * after promotion of a new node. This is in line with
599 : * walreceiver.c always doing an XLogArchiveForceDone() after a
600 : * complete segment.
601 : */
602 22 : StatusFilePath(pathbuf, walFileName, ".done");
603 22 : sendFileWithContent(sink, pathbuf, "", -1, &manifest); /* the .done status file is sent with empty content */
604 : }
605 :
606 : /*
607 : * Send timeline history files too. Only the latest timeline history
608 : * file is required for recovery, and even that only if there happens
609 : * to be a timeline switch in the first WAL segment that contains the
610 : * checkpoint record, or if we're taking a base backup from a standby
611 : * server and the target timeline changes while the backup is taken.
612 : * But they are small and highly useful for debugging purposes, so
613 : * better include them all, always.
614 : */
615 22 : foreach(lc, historyFileList)
616 : {
617 0 : char *fname = lfirst(lc);
618 :
619 0 : snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);
620 :
621 0 : if (lstat(pathbuf, &statbuf) != 0)
622 0 : ereport(ERROR,
623 : (errcode_for_file_access(),
624 : errmsg("could not stat file \"%s\": %m", pathbuf)));
625 :
626 0 : sendFile(sink, pathbuf, pathbuf, &statbuf, false,
627 : InvalidOid, InvalidOid, InvalidRelFileNumber, 0,
628 : &manifest, 0, NULL, 0);
629 :
630 : /* unconditionally mark file as archived */
631 0 : StatusFilePath(pathbuf, fname, ".done");
632 0 : sendFileWithContent(sink, pathbuf, "", -1, &manifest);
633 : }
634 :
635 : /* Properly terminate the tar file. */
636 : StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ,
637 : "BLCKSZ too small for 2 tar blocks");
638 22 : memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
639 22 : bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
640 :
641 : /* OK, that's the end of the archive. */
642 22 : bbsink_end_archive(sink);
643 : }
644 :
645 266 : AddWALInfoToBackupManifest(&manifest, state.startptr, state.starttli,
646 : endptr, endtli);
647 :
648 266 : SendBackupManifest(&manifest, sink);
649 :
650 266 : bbsink_end_backup(sink, endptr, endtli);
651 :
652 266 : if (total_checksum_failures)
653 : {
654 6 : if (total_checksum_failures > 1) /* with more than one failure, a WARNING carrying the total precedes the ERROR */
655 4 : ereport(WARNING,
656 : (errmsg_plural("%lld total checksum verification failure",
657 : "%lld total checksum verification failures",
658 : total_checksum_failures,
659 : total_checksum_failures)));
660 :
661 6 : ereport(ERROR,
662 : (errcode(ERRCODE_DATA_CORRUPTED),
663 : errmsg("checksum verification failure during base backup")));
664 : }
665 :
666 : /*
667 : * Make sure to free the manifest before the resource owners as manifests
668 : * use cryptohash contexts that may depend on resource owners (like
669 : * OpenSSL).
670 : */
671 260 : FreeBackupManifest(&manifest);
672 :
673 : /* clean up the resource owner we created */
674 260 : WalSndResourceCleanup(true);
675 :
676 260 : basebackup_progress_done();
677 260 : }
678 :
679 : /*
680 : * list_sort comparison function, to compare log/seg portion of WAL segment
681 : * filenames, ignoring the timeline portion. Both cells must hold C strings longer than 8 characters; the result is the strcmp() result of the text after the first 8 characters.
682 : */
683 : static int
684 0 : compareWalFileNames(const ListCell *a, const ListCell *b)
685 : {
686 0 : char *fna = (char *) lfirst(a);
687 0 : char *fnb = (char *) lfirst(b);
688 :
689 0 : return strcmp(fna + 8, fnb + 8); /* + 8 skips the leading timeline part of each name */
690 : }
691 :
692 : /*
693 : * Parse the base backup options passed down by the parser. 'options' is a list of DefElem nodes; '*opt' is zeroed and then filled in. Duplicate, unrecognized or inconsistent options are reported with ereport(ERROR). The "verify_checksums" option is stored in the file-level variable noverify_checksums rather than in '*opt'.
694 : */
695 : static void
696 310 : parse_basebackup_options(List *options, basebackup_options *opt)
697 : {
698 : ListCell *lopt;
699 310 : bool o_label = false;
700 310 : bool o_progress = false;
701 310 : bool o_checkpoint = false;
702 310 : bool o_nowait = false;
703 310 : bool o_wal = false;
704 310 : bool o_incremental = false;
705 310 : bool o_maxrate = false;
706 310 : bool o_tablespace_map = false;
707 310 : bool o_noverify_checksums = false;
708 310 : bool o_manifest = false;
709 310 : bool o_manifest_checksums = false;
710 310 : bool o_target = false;
711 310 : bool o_target_detail = false;
712 310 : char *target_str = NULL;
713 310 : char *target_detail_str = NULL;
714 310 : bool o_compression = false;
715 310 : bool o_compression_detail = false;
716 310 : char *compression_detail_str = NULL;
717 :
718 3410 : MemSet(opt, 0, sizeof(*opt)); /* start from all-zero options, then set the defaults explicitly */
719 310 : opt->manifest = MANIFEST_OPTION_NO;
720 310 : opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C;
721 310 : opt->compression = PG_COMPRESSION_NONE;
722 310 : opt->compression_specification.algorithm = PG_COMPRESSION_NONE;
723 :
724 2316 : foreach(lopt, options)
725 : {
726 2010 : DefElem *defel = (DefElem *) lfirst(lopt);
727 :
728 2010 : if (strcmp(defel->defname, "label") == 0)
729 : {
730 310 : if (o_label)
731 0 : ereport(ERROR,
732 : (errcode(ERRCODE_SYNTAX_ERROR),
733 : errmsg("duplicate option \"%s\"", defel->defname)));
734 310 : opt->label = defGetString(defel);
735 310 : o_label = true;
736 : }
737 1700 : else if (strcmp(defel->defname, "progress") == 0)
738 : {
739 310 : if (o_progress)
740 0 : ereport(ERROR,
741 : (errcode(ERRCODE_SYNTAX_ERROR),
742 : errmsg("duplicate option \"%s\"", defel->defname)));
743 310 : opt->progress = defGetBoolean(defel);
744 310 : o_progress = true;
745 : }
746 1390 : else if (strcmp(defel->defname, "checkpoint") == 0)
747 : {
748 290 : char *optval = defGetString(defel);
749 :
750 290 : if (o_checkpoint)
751 0 : ereport(ERROR,
752 : (errcode(ERRCODE_SYNTAX_ERROR),
753 : errmsg("duplicate option \"%s\"", defel->defname)));
754 290 : if (pg_strcasecmp(optval, "fast") == 0) /* the checkpoint type is matched case-insensitively */
755 290 : opt->fastcheckpoint = true;
756 0 : else if (pg_strcasecmp(optval, "spread") == 0)
757 0 : opt->fastcheckpoint = false;
758 : else
759 0 : ereport(ERROR,
760 : (errcode(ERRCODE_SYNTAX_ERROR),
761 : errmsg("unrecognized checkpoint type: \"%s\"",
762 : optval)));
763 290 : o_checkpoint = true;
764 : }
765 1100 : else if (strcmp(defel->defname, "wait") == 0)
766 : {
767 296 : if (o_nowait)
768 0 : ereport(ERROR,
769 : (errcode(ERRCODE_SYNTAX_ERROR),
770 : errmsg("duplicate option \"%s\"", defel->defname)));
771 296 : opt->nowait = !defGetBoolean(defel); /* the option is "wait"; the struct stores its negation */
772 296 : o_nowait = true;
773 : }
774 804 : else if (strcmp(defel->defname, "wal") == 0)
775 : {
776 30 : if (o_wal)
777 0 : ereport(ERROR,
778 : (errcode(ERRCODE_SYNTAX_ERROR),
779 : errmsg("duplicate option \"%s\"", defel->defname)));
780 30 : opt->includewal = defGetBoolean(defel);
781 30 : o_wal = true;
782 : }
783 774 : else if (strcmp(defel->defname, "incremental") == 0)
784 : {
785 14 : if (o_incremental)
786 0 : ereport(ERROR,
787 : (errcode(ERRCODE_SYNTAX_ERROR),
788 : errmsg("duplicate option \"%s\"", defel->defname)));
789 14 : opt->incremental = defGetBoolean(defel);
790 14 : if (opt->incremental && !summarize_wal) /* an incremental backup is refused while summarize_wal is off */
791 0 : ereport(ERROR,
792 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
793 : errmsg("incremental backups cannot be taken unless WAL summarization is enabled")));
794 14 : o_incremental = true;
795 : }
796 760 : else if (strcmp(defel->defname, "max_rate") == 0)
797 : {
798 : int64 maxrate;
799 :
800 2 : if (o_maxrate)
801 0 : ereport(ERROR,
802 : (errcode(ERRCODE_SYNTAX_ERROR),
803 : errmsg("duplicate option \"%s\"", defel->defname)));
804 :
805 2 : maxrate = defGetInt64(defel);
806 2 : if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
807 0 : ereport(ERROR,
808 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
809 : errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
810 : (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER))); /* NOTE(review): the int64 value is cast to int for %d, so inputs beyond the int range would be misreported */
811 :
812 2 : opt->maxrate = (uint32) maxrate; /* presumably fits in uint32 after the range check; confirm MAX_RATE_UPPER */
813 2 : o_maxrate = true;
814 : }
815 758 : else if (strcmp(defel->defname, "tablespace_map") == 0)
816 : {
817 44 : if (o_tablespace_map)
818 0 : ereport(ERROR,
819 : (errcode(ERRCODE_SYNTAX_ERROR),
820 : errmsg("duplicate option \"%s\"", defel->defname)));
821 44 : opt->sendtblspcmapfile = defGetBoolean(defel);
822 44 : o_tablespace_map = true;
823 : }
824 714 : else if (strcmp(defel->defname, "verify_checksums") == 0)
825 : {
826 2 : if (o_noverify_checksums)
827 0 : ereport(ERROR,
828 : (errcode(ERRCODE_SYNTAX_ERROR),
829 : errmsg("duplicate option \"%s\"", defel->defname)));
830 2 : noverify_checksums = !defGetBoolean(defel); /* writes the file-level variable, not *opt; left untouched when the option is absent */
831 2 : o_noverify_checksums = true;
832 : }
833 712 : else if (strcmp(defel->defname, "manifest") == 0)
834 : {
835 308 : char *optval = defGetString(defel);
836 : bool manifest_bool;
837 :
838 308 : if (o_manifest)
839 0 : ereport(ERROR,
840 : (errcode(ERRCODE_SYNTAX_ERROR),
841 : errmsg("duplicate option \"%s\"", defel->defname)));
842 308 : if (parse_bool(optval, &manifest_bool)) /* boolean spellings are tried first, then "force-encode" */
843 : {
844 306 : if (manifest_bool)
845 306 : opt->manifest = MANIFEST_OPTION_YES;
846 : else
847 0 : opt->manifest = MANIFEST_OPTION_NO;
848 : }
849 2 : else if (pg_strcasecmp(optval, "force-encode") == 0)
850 2 : opt->manifest = MANIFEST_OPTION_FORCE_ENCODE;
851 : else
852 0 : ereport(ERROR,
853 : (errcode(ERRCODE_SYNTAX_ERROR),
854 : errmsg("unrecognized manifest option: \"%s\"",
855 : optval)));
856 308 : o_manifest = true;
857 : }
858 404 : else if (strcmp(defel->defname, "manifest_checksums") == 0)
859 : {
860 14 : char *optval = defGetString(defel);
861 :
862 14 : if (o_manifest_checksums)
863 0 : ereport(ERROR,
864 : (errcode(ERRCODE_SYNTAX_ERROR),
865 : errmsg("duplicate option \"%s\"", defel->defname)));
866 14 : if (!pg_checksum_parse_type(optval,
867 : &opt->manifest_checksum_type))
868 2 : ereport(ERROR,
869 : (errcode(ERRCODE_SYNTAX_ERROR),
870 : errmsg("unrecognized checksum algorithm: \"%s\"",
871 : optval)));
872 12 : o_manifest_checksums = true;
873 : }
874 390 : else if (strcmp(defel->defname, "target") == 0)
875 : {
876 308 : if (o_target)
877 0 : ereport(ERROR,
878 : (errcode(ERRCODE_SYNTAX_ERROR),
879 : errmsg("duplicate option \"%s\"", defel->defname)));
880 308 : target_str = defGetString(defel); /* only recorded here; interpreted after the loop */
881 308 : o_target = true;
882 : }
883 82 : else if (strcmp(defel->defname, "target_detail") == 0)
884 : {
885 12 : char *optval = defGetString(defel);
886 :
887 12 : if (o_target_detail)
888 0 : ereport(ERROR,
889 : (errcode(ERRCODE_SYNTAX_ERROR),
890 : errmsg("duplicate option \"%s\"", defel->defname)));
891 12 : target_detail_str = optval;
892 12 : o_target_detail = true;
893 : }
894 70 : else if (strcmp(defel->defname, "compression") == 0)
895 : {
896 48 : char *optval = defGetString(defel);
897 :
898 48 : if (o_compression)
899 0 : ereport(ERROR,
900 : (errcode(ERRCODE_SYNTAX_ERROR),
901 : errmsg("duplicate option \"%s\"", defel->defname)));
902 48 : if (!parse_compress_algorithm(optval, &opt->compression))
903 2 : ereport(ERROR,
904 : (errcode(ERRCODE_SYNTAX_ERROR),
905 : errmsg("unrecognized compression algorithm: \"%s\"",
906 : optval)));
907 46 : o_compression = true;
908 : }
909 22 : else if (strcmp(defel->defname, "compression_detail") == 0)
910 : {
911 22 : if (o_compression_detail)
912 0 : ereport(ERROR,
913 : (errcode(ERRCODE_SYNTAX_ERROR),
914 : errmsg("duplicate option \"%s\"", defel->defname)));
915 22 : compression_detail_str = defGetString(defel); /* only recorded here; parsed after the loop */
916 22 : o_compression_detail = true;
917 : }
918 : else
919 0 : ereport(ERROR,
920 : (errcode(ERRCODE_SYNTAX_ERROR),
921 : errmsg("unrecognized base backup option: \"%s\"",
922 : defel->defname)));
923 : }
924 :
925 306 : if (opt->label == NULL)
926 0 : opt->label = "base backup"; /* default label when none was supplied */
927 306 : if (opt->manifest == MANIFEST_OPTION_NO)
928 : {
929 2 : if (o_manifest_checksums)
930 0 : ereport(ERROR,
931 : (errcode(ERRCODE_SYNTAX_ERROR),
932 : errmsg("manifest checksums require a backup manifest")));
933 2 : opt->manifest_checksum_type = CHECKSUM_TYPE_NONE; /* without a manifest the CRC32C default is replaced by NONE */
934 : }
935 :
936 306 : if (target_str == NULL)
937 : {
938 0 : if (target_detail_str != NULL)
939 0 : ereport(ERROR,
940 : (errcode(ERRCODE_SYNTAX_ERROR),
941 : errmsg("target detail cannot be used without target")));
942 0 : opt->use_copytblspc = true; /* only this path (no "target" option at all) sets use_copytblspc */
943 0 : opt->send_to_client = true;
944 : }
945 306 : else if (strcmp(target_str, "client") == 0)
946 : {
947 282 : if (target_detail_str != NULL)
948 0 : ereport(ERROR,
949 : (errcode(ERRCODE_SYNTAX_ERROR),
950 : errmsg("target \"%s\" does not accept a target detail",
951 : target_str)));
952 282 : opt->send_to_client = true;
953 : }
954 : else
955 20 : opt->target_handle =
956 24 : BaseBackupGetTargetHandle(target_str, target_detail_str); /* any other target name is resolved by the target module */
957 :
958 302 : if (o_compression_detail && !o_compression)
959 0 : ereport(ERROR,
960 : (errcode(ERRCODE_SYNTAX_ERROR),
961 : errmsg("compression detail cannot be specified unless compression is enabled")));
962 :
963 302 : if (o_compression)
964 : {
965 : char *error_detail;
966 :
967 42 : parse_compress_specification(opt->compression, compression_detail_str,
968 : &opt->compression_specification); /* compression_detail_str is NULL when no "compression_detail" was given */
969 : error_detail =
970 42 : validate_compress_specification(&opt->compression_specification);
971 42 : if (error_detail != NULL)
972 18 : ereport(ERROR,
973 : errcode(ERRCODE_SYNTAX_ERROR),
974 : errmsg("invalid compression specification: %s",
975 : error_detail));
976 : }
977 284 : }
978 :
979 :
980 : /*
981 : * SendBaseBackup() - send a complete base backup.
982 : *
983 : * The function will put the system into backup mode like pg_backup_start()
984 : * does, so that the backup is consistent even though we read directly from
985 : * the filesystem, bypassing the buffer cache.
986 : */
987 : void
988 312 : SendBaseBackup(BaseBackupCmd *cmd, IncrementalBackupInfo *ib)
989 : {
990 : basebackup_options opt;
991 : bbsink *sink;
992 312 : SessionBackupState status = get_backup_status();
993 :
994 312 : if (status == SESSION_BACKUP_RUNNING)
995 2 : ereport(ERROR,
996 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
997 : errmsg("a backup is already in progress in this session")));
998 :
999 310 : parse_basebackup_options(cmd->options, &opt);
1000 :
1001 284 : WalSndSetState(WALSNDSTATE_BACKUP);
1002 :
1003 284 : if (update_process_title)
1004 : {
1005 : char activitymsg[50];
1006 :
1007 284 : snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
1008 : opt.label);
1009 284 : set_ps_display(activitymsg);
1010 : }
1011 :
1012 : /*
1013 : * If we're asked to perform an incremental backup and the user has not
1014 : * supplied a manifest, that's an ERROR.
1015 : *
1016 : * If we're asked to perform a full backup and the user did supply a
1017 : * manifest, just ignore it.
1018 : */
1019 284 : if (!opt.incremental)
1020 270 : ib = NULL;
1021 14 : else if (ib == NULL)
1022 0 : ereport(ERROR,
1023 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1024 : errmsg("must UPLOAD_MANIFEST before performing an incremental BASE_BACKUP")));
1025 :
1026 : /*
1027 : * If the target is specifically 'client' then set up to stream the backup
1028 : * to the client; otherwise, it's being sent someplace else and should not
1029 : * be sent to the client. BaseBackupGetSink has the job of setting up a
1030 : * sink to send the backup data wherever it needs to go.
1031 : */
1032 284 : sink = bbsink_copystream_new(opt.send_to_client);
1033 284 : if (opt.target_handle != NULL)
1034 20 : sink = BaseBackupGetSink(opt.target_handle, sink);
1035 :
1036 : /* Set up network throttling, if client requested it */
1037 278 : if (opt.maxrate > 0)
1038 2 : sink = bbsink_throttle_new(sink, opt.maxrate);
1039 :
1040 : /* Set up server-side compression, if client requested it */
1041 278 : if (opt.compression == PG_COMPRESSION_GZIP)
1042 4 : sink = bbsink_gzip_new(sink, &opt.compression_specification);
1043 274 : else if (opt.compression == PG_COMPRESSION_LZ4)
1044 2 : sink = bbsink_lz4_new(sink, &opt.compression_specification);
1045 272 : else if (opt.compression == PG_COMPRESSION_ZSTD)
1046 0 : sink = bbsink_zstd_new(sink, &opt.compression_specification);
1047 :
1048 : /* Set up progress reporting. */
1049 278 : sink = bbsink_progress_new(sink, opt.progress);
1050 :
1051 : /*
1052 : * Perform the base backup, but make sure we clean up the bbsink even if
1053 : * an error occurs.
1054 : */
1055 278 : PG_TRY();
1056 : {
1057 278 : perform_base_backup(&opt, sink, ib);
1058 : }
1059 8 : PG_FINALLY();
1060 : {
1061 268 : bbsink_cleanup(sink);
1062 : }
1063 268 : PG_END_TRY();
1064 260 : }
1065 :
1066 : /*
1067 : * Inject a file with given name and content in the output tar stream.
1068 : *
1069 : * "len" can optionally be set to an arbitrary length of data sent. If set
1070 : * to -1, the content sent is treated as a string with strlen() as length.
1071 : */
1072 : static void
1073 334 : sendFileWithContent(bbsink *sink, const char *filename, const char *content,
1074 : int len, backup_manifest_info *manifest)
1075 : {
1076 : struct stat statbuf;
1077 334 : int bytes_done = 0;
1078 : pg_checksum_context checksum_ctx;
1079 :
1080 334 : if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1081 0 : elog(ERROR, "could not initialize checksum of file \"%s\"",
1082 : filename);
1083 :
1084 334 : if (len < 0)
1085 334 : len = strlen(content);
1086 :
1087 : /*
1088 : * Construct a stat struct for the file we're injecting in the tar.
1089 : */
1090 :
1091 : /* Windows doesn't have the concept of uid and gid */
1092 : #ifdef WIN32
1093 : statbuf.st_uid = 0;
1094 : statbuf.st_gid = 0;
1095 : #else
1096 334 : statbuf.st_uid = geteuid();
1097 334 : statbuf.st_gid = getegid();
1098 : #endif
1099 334 : statbuf.st_mtime = time(NULL);
1100 334 : statbuf.st_mode = pg_file_create_mode;
1101 334 : statbuf.st_size = len;
1102 :
1103 334 : _tarWriteHeader(sink, filename, NULL, &statbuf, false);
1104 :
1105 334 : if (pg_checksum_update(&checksum_ctx, (uint8 *) content, len) < 0)
1106 0 : elog(ERROR, "could not update checksum of file \"%s\"",
1107 : filename);
1108 :
1109 616 : while (bytes_done < len)
1110 : {
1111 282 : size_t remaining = len - bytes_done;
1112 282 : size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1113 :
1114 282 : memcpy(sink->bbs_buffer, content, nbytes);
1115 282 : bbsink_archive_contents(sink, nbytes);
1116 282 : bytes_done += nbytes;
1117 282 : content += nbytes;
1118 : }
1119 :
1120 334 : _tarWritePadding(sink, len);
1121 :
1122 334 : AddFileToBackupManifest(manifest, InvalidOid, filename, len,
1123 334 : (pg_time_t) statbuf.st_mtime, &checksum_ctx);
1124 334 : }
1125 :
1126 : /*
1127 : * Include the tablespace directory pointed to by 'path' in the output tar
1128 : * stream. If 'sizeonly' is true, we just calculate a total length and return
1129 : * it, without actually sending anything.
1130 : *
1131 : * Only used to send auxiliary tablespaces, not PGDATA.
1132 : */
1133 : static int64
1134 132 : sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly,
1135 : backup_manifest_info *manifest, IncrementalBackupInfo *ib)
1136 : {
1137 : int64 size;
1138 : char pathbuf[MAXPGPATH];
1139 : struct stat statbuf;
1140 :
1141 : /*
1142 : * 'path' points to the tablespace location, but we only want to include
1143 : * the version directory in it that belongs to us.
1144 : */
1145 132 : snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
1146 : TABLESPACE_VERSION_DIRECTORY);
1147 :
1148 : /*
1149 : * Store a directory entry in the tar file so we get the permissions
1150 : * right.
1151 : */
1152 132 : if (lstat(pathbuf, &statbuf) != 0)
1153 : {
1154 0 : if (errno != ENOENT)
1155 0 : ereport(ERROR,
1156 : (errcode_for_file_access(),
1157 : errmsg("could not stat file or directory \"%s\": %m",
1158 : pathbuf)));
1159 :
1160 : /* If the tablespace went away while scanning, it's no error. */
1161 0 : return 0;
1162 : }
1163 :
1164 132 : size = _tarWriteHeader(sink, TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
1165 : sizeonly);
1166 :
1167 : /* Send all the files in the tablespace version directory */
1168 132 : size += sendDir(sink, pathbuf, strlen(path), sizeonly, NIL, true, manifest,
1169 : spcoid, ib);
1170 :
1171 132 : return size;
1172 : }
1173 :
1174 : /*
1175 : * Include all files from the given directory in the output tar stream. If
1176 : * 'sizeonly' is true, we just calculate a total length and return it, without
1177 : * actually sending anything.
1178 : *
1179 : * Omit any directory in the tablespaces list, to avoid backing up
1180 : * tablespaces twice when they were created inside PGDATA.
1181 : *
1182 : * If sendtblspclinks is true, we need to include symlink
1183 : * information in the tar file. If not, we can skip that
1184 : * as it will be sent separately in the tablespace_map file.
1185 : */
1186 : static int64
1187 9636 : sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
1188 : List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest,
1189 : Oid spcoid, IncrementalBackupInfo *ib)
1190 : {
1191 : DIR *dir;
1192 : struct dirent *de;
1193 : char pathbuf[MAXPGPATH * 2];
1194 : struct stat statbuf;
1195 9636 : int64 size = 0;
1196 : const char *lastDir; /* Split last dir from parent path. */
1197 9636 : bool isRelationDir = false; /* Does directory contain relations? */
1198 9636 : bool isGlobalDir = false;
1199 9636 : Oid dboid = InvalidOid;
1200 9636 : BlockNumber *relative_block_numbers = NULL;
1201 :
1202 : /*
1203 : * Since this array is relatively large, avoid putting it on the stack.
1204 : * But we don't need it at all if this is not an incremental backup.
1205 : */
1206 9636 : if (ib != NULL)
1207 246 : relative_block_numbers = palloc(sizeof(BlockNumber) * RELSEG_SIZE);
1208 :
1209 : /*
1210 : * Determine if the current path is a database directory that can contain
1211 : * relations.
1212 : *
1213 : * Start by finding the location of the delimiter between the parent path
1214 : * and the current path.
1215 : */
1216 9636 : lastDir = last_dir_separator(path);
1217 :
1218 : /* Does this path look like a database path (i.e. all digits)? */
1219 9636 : if (lastDir != NULL &&
1220 9080 : strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
1221 1784 : {
1222 : /* Part of path that contains the parent directory. */
1223 1784 : int parentPathLen = lastDir - path;
1224 :
1225 : /*
1226 : * Mark path as a database directory if the parent path is either
1227 : * $PGDATA/base or a tablespace version path.
1228 : */
1229 1784 : if (strncmp(path, "./base", parentPathLen) == 0 ||
1230 88 : (parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
1231 88 : strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
1232 : TABLESPACE_VERSION_DIRECTORY,
1233 : sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
1234 : {
1235 1784 : isRelationDir = true;
1236 1784 : dboid = atooid(lastDir + 1);
1237 : }
1238 : }
1239 7852 : else if (strcmp(path, "./global") == 0)
1240 : {
1241 550 : isRelationDir = true;
1242 550 : isGlobalDir = true;
1243 : }
1244 :
1245 9636 : dir = AllocateDir(path);
1246 591518 : while ((de = ReadDir(dir, path)) != NULL)
1247 : {
1248 : int excludeIdx;
1249 : bool excludeFound;
1250 581906 : RelFileNumber relfilenumber = InvalidRelFileNumber;
1251 581906 : ForkNumber relForkNum = InvalidForkNumber;
1252 581906 : unsigned segno = 0;
1253 581906 : bool isRelationFile = false;
1254 :
1255 : /* Skip special stuff */
1256 581906 : if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
1257 27334 : continue;
1258 :
1259 : /* Skip temporary files */
1260 562666 : if (strncmp(de->d_name,
1261 : PG_TEMP_FILE_PREFIX,
1262 : strlen(PG_TEMP_FILE_PREFIX)) == 0)
1263 544 : continue;
1264 :
1265 : /* Skip macOS system files */
1266 562122 : if (strcmp(de->d_name, ".DS_Store") == 0)
1267 142 : continue;
1268 :
1269 : /*
1270 : * Check if the postmaster has signaled us to exit, and abort with an
1271 : * error in that case. The error handler further up will call
1272 : * do_pg_abort_backup() for us. Also check that if the backup was
1273 : * started while still in recovery, the server wasn't promoted.
1274 : * do_pg_backup_stop() will check that too, but it's better to stop
1275 : * the backup early than continue to the end and fail there.
1276 : */
1277 561980 : CHECK_FOR_INTERRUPTS();
1278 561970 : if (RecoveryInProgress() != backup_started_in_recovery)
1279 0 : ereport(ERROR,
1280 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1281 : errmsg("the standby was promoted during online backup"),
1282 : errhint("This means that the backup being taken is corrupt "
1283 : "and should not be used. "
1284 : "Try taking another online backup.")));
1285 :
1286 : /* Scan for files that should be excluded */
1287 561970 : excludeFound = false;
1288 5048882 : for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++)
1289 : {
1290 4489162 : int cmplen = strlen(excludeFiles[excludeIdx].name);
1291 :
1292 4489162 : if (!excludeFiles[excludeIdx].match_prefix)
1293 3927464 : cmplen++;
1294 4489162 : if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0)
1295 : {
1296 2250 : elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
1297 2250 : excludeFound = true;
1298 2250 : break;
1299 : }
1300 : }
1301 :
1302 561970 : if (excludeFound)
1303 2250 : continue;
1304 :
1305 : /*
1306 : * If there could be non-temporary relation files in this directory,
1307 : * try to parse the filename.
1308 : */
1309 559720 : if (isRelationDir)
1310 : isRelationFile =
1311 540996 : parse_filename_for_nontemp_relation(de->d_name,
1312 : &relfilenumber,
1313 : &relForkNum, &segno);
1314 :
1315 : /* Exclude all forks for unlogged tables except the init fork */
1316 559720 : if (isRelationFile && relForkNum != INIT_FORKNUM)
1317 : {
1318 : char initForkFile[MAXPGPATH];
1319 :
1320 : /*
1321 : * If any other type of fork, check if there is an init fork with
1322 : * the same RelFileNumber. If so, the file can be excluded.
1323 : */
1324 536294 : snprintf(initForkFile, sizeof(initForkFile), "%s/%u_init",
1325 : path, relfilenumber);
1326 :
1327 536294 : if (lstat(initForkFile, &statbuf) == 0)
1328 : {
1329 140 : elog(DEBUG2,
1330 : "unlogged relation file \"%s\" excluded from backup",
1331 : de->d_name);
1332 :
1333 140 : continue;
1334 : }
1335 : }
1336 :
1337 : /* Exclude temporary relations */
1338 559580 : if (OidIsValid(dboid) && looks_like_temp_rel_name(de->d_name))
1339 : {
1340 72 : elog(DEBUG2,
1341 : "temporary relation file \"%s\" excluded from backup",
1342 : de->d_name);
1343 :
1344 72 : continue;
1345 : }
1346 :
1347 559508 : snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);
1348 :
1349 : /* Skip pg_control here to back up it last */
1350 559508 : if (strcmp(pathbuf, "./global/pg_control") == 0)
1351 548 : continue;
1352 :
1353 558960 : if (lstat(pathbuf, &statbuf) != 0)
1354 : {
1355 0 : if (errno != ENOENT)
1356 0 : ereport(ERROR,
1357 : (errcode_for_file_access(),
1358 : errmsg("could not stat file or directory \"%s\": %m",
1359 : pathbuf)));
1360 :
1361 : /* If the file went away while scanning, it's not an error. */
1362 0 : continue;
1363 : }
1364 :
1365 : /* Scan for directories whose contents should be excluded */
1366 558960 : excludeFound = false;
1367 4456258 : for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
1368 : {
1369 3901148 : if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
1370 : {
1371 3850 : elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
1372 3850 : convert_link_to_directory(pathbuf, &statbuf);
1373 3850 : size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
1374 : &statbuf, sizeonly);
1375 3850 : excludeFound = true;
1376 3850 : break;
1377 : }
1378 : }
1379 :
1380 558960 : if (excludeFound)
1381 3850 : continue;
1382 :
1383 : /*
1384 : * We can skip pg_wal, the WAL segments need to be fetched from the
1385 : * WAL archive anyway. But include it as an empty directory anyway, so
1386 : * we get permissions right.
1387 : */
1388 555110 : if (strcmp(pathbuf, "./pg_wal") == 0)
1389 : {
1390 : /* If pg_wal is a symlink, write it as a directory anyway */
1391 548 : convert_link_to_directory(pathbuf, &statbuf);
1392 548 : size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL,
1393 : &statbuf, sizeonly);
1394 :
1395 : /*
1396 : * Also send archive_status and summaries directories (by
1397 : * hackishly reusing statbuf from above ...).
1398 : */
1399 548 : size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL,
1400 : &statbuf, sizeonly);
1401 548 : size += _tarWriteHeader(sink, "./pg_wal/summaries", NULL,
1402 : &statbuf, sizeonly);
1403 :
1404 548 : continue; /* don't recurse into pg_wal */
1405 : }
1406 :
1407 : /* Allow symbolic links in pg_tblspc only */
1408 554562 : if (strcmp(path, "./pg_tblspc") == 0 && S_ISLNK(statbuf.st_mode))
1409 70 : {
1410 : char linkpath[MAXPGPATH];
1411 : int rllen;
1412 :
1413 70 : rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
1414 70 : if (rllen < 0)
1415 0 : ereport(ERROR,
1416 : (errcode_for_file_access(),
1417 : errmsg("could not read symbolic link \"%s\": %m",
1418 : pathbuf)));
1419 70 : if (rllen >= sizeof(linkpath))
1420 0 : ereport(ERROR,
1421 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1422 : errmsg("symbolic link \"%s\" target is too long",
1423 : pathbuf)));
1424 70 : linkpath[rllen] = '\0';
1425 :
1426 70 : size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, linkpath,
1427 : &statbuf, sizeonly);
1428 : }
1429 554492 : else if (S_ISDIR(statbuf.st_mode))
1430 : {
1431 9036 : bool skip_this_dir = false;
1432 : ListCell *lc;
1433 :
1434 : /*
1435 : * Store a directory entry in the tar file so we can get the
1436 : * permissions right.
1437 : */
1438 9036 : size += _tarWriteHeader(sink, pathbuf + basepathlen + 1, NULL, &statbuf,
1439 : sizeonly);
1440 :
1441 : /*
1442 : * Call ourselves recursively for a directory, unless it happens
1443 : * to be a separate tablespace located within PGDATA.
1444 : */
1445 20106 : foreach(lc, tablespaces)
1446 : {
1447 11126 : tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
1448 :
1449 : /*
1450 : * ti->rpath is the tablespace relative path within PGDATA, or
1451 : * NULL if the tablespace has been properly located somewhere
1452 : * else.
1453 : *
1454 : * Skip past the leading "./" in pathbuf when comparing.
1455 : */
1456 11126 : if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
1457 : {
1458 56 : skip_this_dir = true;
1459 56 : break;
1460 : }
1461 : }
1462 :
1463 : /*
1464 : * skip sending directories inside pg_tblspc, if not required.
1465 : */
1466 9036 : if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
1467 32 : skip_this_dir = true;
1468 :
1469 9036 : if (!skip_this_dir)
1470 8948 : size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces,
1471 : sendtblspclinks, manifest, spcoid, ib);
1472 : }
1473 545456 : else if (S_ISREG(statbuf.st_mode))
1474 : {
1475 545456 : bool sent = false;
1476 545456 : unsigned num_blocks_required = 0;
1477 545456 : unsigned truncation_block_length = 0;
1478 : char tarfilenamebuf[MAXPGPATH * 2];
1479 545456 : char *tarfilename = pathbuf + basepathlen + 1;
1480 545456 : FileBackupMethod method = BACK_UP_FILE_FULLY;
1481 :
1482 545456 : if (ib != NULL && isRelationFile)
1483 : {
1484 : Oid relspcoid;
1485 : char *lookup_path;
1486 :
1487 14528 : if (OidIsValid(spcoid))
1488 : {
1489 18 : relspcoid = spcoid;
1490 18 : lookup_path = psprintf("pg_tblspc/%u/%s", spcoid,
1491 : tarfilename);
1492 : }
1493 : else
1494 : {
1495 14510 : if (isGlobalDir)
1496 840 : relspcoid = GLOBALTABLESPACE_OID;
1497 : else
1498 13670 : relspcoid = DEFAULTTABLESPACE_OID;
1499 14510 : lookup_path = pstrdup(tarfilename);
1500 : }
1501 :
1502 14528 : method = GetFileBackupMethod(ib, lookup_path, dboid, relspcoid,
1503 : relfilenumber, relForkNum,
1504 14528 : segno, statbuf.st_size,
1505 : &num_blocks_required,
1506 : relative_block_numbers,
1507 : &truncation_block_length);
1508 14528 : if (method == BACK_UP_FILE_INCREMENTALLY)
1509 : {
1510 9308 : statbuf.st_size =
1511 9308 : GetIncrementalFileSize(num_blocks_required);
1512 9308 : snprintf(tarfilenamebuf, sizeof(tarfilenamebuf),
1513 : "%s/INCREMENTAL.%s",
1514 9308 : path + basepathlen + 1,
1515 9308 : de->d_name);
1516 9308 : tarfilename = tarfilenamebuf;
1517 : }
1518 :
1519 14528 : pfree(lookup_path);
1520 : }
1521 :
1522 545456 : if (!sizeonly)
1523 267326 : sent = sendFile(sink, pathbuf, tarfilename, &statbuf,
1524 : true, dboid, spcoid,
1525 : relfilenumber, segno, manifest,
1526 : num_blocks_required,
1527 : method == BACK_UP_FILE_INCREMENTALLY ? relative_block_numbers : NULL,
1528 : truncation_block_length);
1529 :
1530 545454 : if (sent || sizeonly)
1531 : {
1532 : /* Add size. */
1533 545454 : size += statbuf.st_size;
1534 :
1535 : /* Pad to a multiple of the tar block size. */
1536 545454 : size += tarPaddingBytesRequired(statbuf.st_size);
1537 :
1538 : /* Size of the header for the file. */
1539 545454 : size += TAR_BLOCK_SIZE;
1540 : }
1541 : }
1542 : else
1543 0 : ereport(WARNING,
1544 : (errmsg("skipping special file \"%s\"", pathbuf)));
1545 : }
1546 :
1547 9612 : if (relative_block_numbers != NULL)
1548 246 : pfree(relative_block_numbers);
1549 :
1550 9612 : FreeDir(dir);
1551 9612 : return size;
1552 : }
1553 :
1554 : /*
1555 : * Given the member, write the TAR header & send the file.
1556 : *
1557 : * If 'missing_ok' is true, will not throw an error if the file is not found.
1558 : *
1559 : * If dboid is anything other than InvalidOid then any checksum failures
1560 : * detected will get reported to the cumulative stats system.
1561 : *
1562 : * If the file is to be sent incrementally, then num_incremental_blocks
1563 : * should be the number of blocks to be sent, and incremental_blocks
1564 : * an array of block numbers relative to the start of the current segment.
1565 : * If the whole file is to be sent, then incremental_blocks should be NULL,
1566 : * and num_incremental_blocks can have any value, as it will be ignored.
1567 : *
1568 : * Returns true if the file was successfully sent, false if 'missing_ok',
1569 : * and the file did not exist.
1570 : */
1571 : static bool
1572 267592 : sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
1573 : struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid,
1574 : RelFileNumber relfilenumber, unsigned segno,
1575 : backup_manifest_info *manifest, unsigned num_incremental_blocks,
1576 : BlockNumber *incremental_blocks, unsigned truncation_block_length)
1577 : {
1578 : int fd;
1579 267592 : BlockNumber blkno = 0;
1580 267592 : int checksum_failures = 0;
1581 : off_t cnt;
1582 267592 : pgoff_t bytes_done = 0;
1583 267592 : bool verify_checksum = false;
1584 : pg_checksum_context checksum_ctx;
1585 267592 : int ibindex = 0;
1586 :
1587 267592 : if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
1588 0 : elog(ERROR, "could not initialize checksum of file \"%s\"",
1589 : readfilename);
1590 :
1591 267592 : fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
1592 267592 : if (fd < 0)
1593 : {
1594 0 : if (errno == ENOENT && missing_ok)
1595 0 : return false;
1596 0 : ereport(ERROR,
1597 : (errcode_for_file_access(),
1598 : errmsg("could not open file \"%s\": %m", readfilename)));
1599 : }
1600 :
1601 267592 : _tarWriteHeader(sink, tarfilename, NULL, statbuf, false);
1602 :
1603 : /*
1604 : * Checksums are verified in multiples of BLCKSZ, so the buffer length
1605 : * should be a multiple of the block size as well.
1606 : */
1607 : Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
1608 :
1609 : /*
1610 : * If we weren't told not to verify checksums, and if checksums are
1611 : * enabled for this cluster, and if this is a relation file, then verify
1612 : * the checksum.
1613 : */
1614 267590 : if (!noverify_checksums && DataChecksumsEnabled() &&
1615 : RelFileNumberIsValid(relfilenumber))
1616 55772 : verify_checksum = true;
1617 :
1618 : /*
1619 : * If we're sending an incremental file, write the file header.
1620 : */
1621 267590 : if (incremental_blocks != NULL)
1622 : {
1623 9308 : unsigned magic = INCREMENTAL_MAGIC;
1624 9308 : size_t header_bytes_done = 0;
1625 : char padding[BLCKSZ];
1626 : size_t paddinglen;
1627 :
1628 : /* Emit header data. */
1629 9308 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1630 : &magic, sizeof(magic));
1631 9308 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1632 : &num_incremental_blocks, sizeof(num_incremental_blocks));
1633 9308 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1634 : &truncation_block_length, sizeof(truncation_block_length));
1635 9308 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1636 : incremental_blocks,
1637 : sizeof(BlockNumber) * num_incremental_blocks);
1638 :
1639 : /*
1640 : * Add padding to align header to a multiple of BLCKSZ, but only if
1641 : * the incremental file has some blocks, and the alignment is actually
1642 : * needed (i.e. header is not already a multiple of BLCKSZ). If there
1643 : * are no blocks we don't want to make the file unnecessarily large,
1644 : * as that might make some filesystem optimizations impossible.
1645 : */
1646 9308 : if ((num_incremental_blocks > 0) && (header_bytes_done % BLCKSZ != 0))
1647 : {
1648 46 : paddinglen = (BLCKSZ - (header_bytes_done % BLCKSZ));
1649 :
1650 46 : memset(padding, 0, paddinglen);
1651 46 : bytes_done += paddinglen;
1652 :
1653 46 : push_to_sink(sink, &checksum_ctx, &header_bytes_done,
1654 : padding, paddinglen);
1655 : }
1656 :
1657 : /* Flush out any data still in the buffer so it's again empty. */
1658 9308 : if (header_bytes_done > 0)
1659 : {
1660 9308 : bbsink_archive_contents(sink, header_bytes_done);
1661 9308 : if (pg_checksum_update(&checksum_ctx,
1662 9308 : (uint8 *) sink->bbs_buffer,
1663 : header_bytes_done) < 0)
1664 0 : elog(ERROR, "could not update checksum of base backup");
1665 : }
1666 :
1667 : /* Update our notion of file position. */
1668 9308 : bytes_done += sizeof(magic);
1669 9308 : bytes_done += sizeof(num_incremental_blocks);
1670 9308 : bytes_done += sizeof(truncation_block_length);
1671 9308 : bytes_done += sizeof(BlockNumber) * num_incremental_blocks;
1672 : }
1673 :
1674 : /*
1675 : * Loop until we read the amount of data the caller told us to expect. The
1676 : * file could be longer, if it was extended while we were sending it, but
1677 : * for a base backup we can ignore such extended data. It will be restored
1678 : * from WAL.
1679 : */
1680 : while (1)
1681 : {
1682 : /*
1683 : * Determine whether we've read all the data that we need, and if not,
1684 : * read some more.
1685 : */
1686 574994 : if (incremental_blocks == NULL)
1687 : {
1688 565608 : size_t remaining = statbuf->st_size - bytes_done;
1689 :
1690 : /*
1691 : * If we've read the required number of bytes, then it's time to
1692 : * stop.
1693 : */
1694 565608 : if (bytes_done >= statbuf->st_size)
1695 258282 : break;
1696 :
1697 : /*
1698 : * Read as many bytes as will fit in the buffer, or however many
1699 : * are left to read, whichever is less.
1700 : */
1701 307326 : cnt = read_file_data_into_buffer(sink, readfilename, fd,
1702 : bytes_done, remaining,
1703 307326 : blkno + segno * RELSEG_SIZE,
1704 : verify_checksum,
1705 : &checksum_failures);
1706 : }
1707 : else
1708 : {
1709 : BlockNumber relative_blkno;
1710 :
1711 : /*
1712 : * If we've read all the blocks, then it's time to stop.
1713 : */
1714 9386 : if (ibindex >= num_incremental_blocks)
1715 9308 : break;
1716 :
1717 : /*
1718 : * Read just one block, whichever one is the next that we're
1719 : * supposed to include.
1720 : */
1721 78 : relative_blkno = incremental_blocks[ibindex++];
1722 78 : cnt = read_file_data_into_buffer(sink, readfilename, fd,
1723 78 : relative_blkno * BLCKSZ,
1724 : BLCKSZ,
1725 78 : relative_blkno + segno * RELSEG_SIZE,
1726 : verify_checksum,
1727 : &checksum_failures);
1728 :
1729 : /*
1730 : * If we get a partial read, that must mean that the relation is
1731 : * being truncated. Ultimately, it should be truncated to a
1732 : * multiple of BLCKSZ, since this path should only be reached for
1733 : * relation files, but we might transiently observe an
1734 : * intermediate value.
1735 : *
1736 : * It should be fine to treat this just as if the entire block had
1737 : * been truncated away - i.e. fill this and all later blocks with
1738 : * zeroes. WAL replay will fix things up.
1739 : */
1740 78 : if (cnt < BLCKSZ)
1741 0 : break;
1742 : }
1743 :
1744 : /*
1745 : * If the amount of data we were able to read was not a multiple of
1746 : * BLCKSZ, we cannot verify checksums, which are block-level.
1747 : */
1748 307404 : if (verify_checksum && (cnt % BLCKSZ != 0))
1749 : {
1750 0 : ereport(WARNING,
1751 : (errmsg("could not verify checksum in file \"%s\", block "
1752 : "%u: read buffer size %d and page size %d "
1753 : "differ",
1754 : readfilename, blkno, (int) cnt, BLCKSZ)));
1755 0 : verify_checksum = false;
1756 : }
1757 :
1758 : /*
1759 : * If we hit end-of-file, a concurrent truncation must have occurred.
1760 : * That's not an error condition, because WAL replay will fix things
1761 : * up.
1762 : */
1763 307404 : if (cnt == 0)
1764 0 : break;
1765 :
1766 : /* Update block number and # of bytes done for next loop iteration. */
1767 307404 : blkno += cnt / BLCKSZ;
1768 307404 : bytes_done += cnt;
1769 :
1770 : /*
1771 : * Make sure incremental files with block data are properly aligned
1772 : * (header is a multiple of BLCKSZ, blocks are BLCKSZ too).
1773 : */
1774 : Assert(!((incremental_blocks != NULL && num_incremental_blocks > 0) &&
1775 : (bytes_done % BLCKSZ != 0)));
1776 :
1777 : /* Archive the data we just read. */
1778 307404 : bbsink_archive_contents(sink, cnt);
1779 :
1780 : /* Also feed it to the checksum machinery. */
1781 307404 : if (pg_checksum_update(&checksum_ctx,
1782 307404 : (uint8 *) sink->bbs_buffer, cnt) < 0)
1783 0 : elog(ERROR, "could not update checksum of base backup");
1784 : }
1785 :
1786 : /* If the file was truncated while we were sending it, pad it with zeros */
1787 267590 : while (bytes_done < statbuf->st_size)
1788 : {
1789 0 : size_t remaining = statbuf->st_size - bytes_done;
1790 0 : size_t nbytes = Min(sink->bbs_buffer_length, remaining);
1791 :
1792 0 : MemSet(sink->bbs_buffer, 0, nbytes);
1793 0 : if (pg_checksum_update(&checksum_ctx,
1794 0 : (uint8 *) sink->bbs_buffer,
1795 : nbytes) < 0)
1796 0 : elog(ERROR, "could not update checksum of base backup");
1797 0 : bbsink_archive_contents(sink, nbytes);
1798 0 : bytes_done += nbytes;
1799 : }
1800 :
1801 : /*
1802 : * Pad to a block boundary, per tar format requirements. (This small piece
1803 : * of data is probably not worth throttling, and is not checksummed
1804 : * because it's not actually part of the file.)
1805 : */
1806 267590 : _tarWritePadding(sink, bytes_done);
1807 :
1808 267590 : CloseTransientFile(fd);
1809 :
1810 267590 : if (checksum_failures > 1)
1811 : {
1812 4 : ereport(WARNING,
1813 : (errmsg_plural("file \"%s\" has a total of %d checksum verification failure",
1814 : "file \"%s\" has a total of %d checksum verification failures",
1815 : checksum_failures,
1816 : readfilename, checksum_failures)));
1817 :
1818 4 : pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
1819 : }
1820 :
1821 267590 : total_checksum_failures += checksum_failures;
1822 :
1823 267590 : AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size,
1824 267590 : (pg_time_t) statbuf->st_mtime, &checksum_ctx);
1825 :
1826 267590 : return true;
1827 : }
1828 :
1829 : /*
1830 : * Read some more data from the file into the bbsink's buffer, verifying
1831 : * checksums as required.
1832 : *
1833 : * 'offset' is the file offset from which we should begin to read, and
1834 : * 'length' is the amount of data that should be read. The actual amount
1835 : * of data read will be less than the requested amount if the bbsink's
1836 : * buffer isn't big enough to hold it all, or if the underlying file has
1837 : * been truncated. The return value is the number of bytes actually read.
1838 : *
1839 : * 'blkno' is the block number of the first page in the bbsink's buffer
1840 : * relative to the start of the relation.
1841 : *
1842 : * 'verify_checksum' indicates whether we should try to verify checksums
1843 : * for the blocks we read. If we do this, we'll update *checksum_failures
1844 : * and issue warnings as appropriate.
1845 : */
1846 : static off_t
1847 307404 : read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
1848 : off_t offset, size_t length, BlockNumber blkno,
1849 : bool verify_checksum, int *checksum_failures)
1850 : {
1851 : off_t cnt;
1852 : int i;
1853 : char *page;
1854 :
1855 : /* Try to read some more data. */
1856 307404 : cnt = basebackup_read_file(fd, sink->bbs_buffer,
1857 307404 : Min(sink->bbs_buffer_length, length),
1858 : offset, readfilename, true);
1859 :
1860 : /* Can't verify checksums if read length is not a multiple of BLCKSZ. */
1861 307404 : if (!verify_checksum || (cnt % BLCKSZ) != 0)
1862 240250 : return cnt;
1863 :
1864 : /* Verify checksum for each block. */
1865 232244 : for (i = 0; i < cnt / BLCKSZ; i++)
1866 : {
1867 : int reread_cnt;
1868 : uint16 expected_checksum;
1869 :
1870 165090 : page = sink->bbs_buffer + BLCKSZ * i;
1871 :
1872 : /* If the page is OK, go on to the next one. */
1873 165090 : if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1874 : &expected_checksum))
1875 165062 : continue;
1876 :
1877 : /*
1878 : * Retry the block on the first failure. It's possible that we read
1879 : * the first 4K page of the block just before postgres updated the
1880 : * entire block so it ends up looking torn to us. If, before we retry
1881 : * the read, the concurrent write of the block finishes, the page LSN
1882 : * will be updated and we'll realize that we should ignore this block.
1883 : *
1884 : * There's no guarantee that this will actually happen, though: the
1885 : * torn write could take an arbitrarily long time to complete.
1886 : * Retrying multiple times wouldn't fix this problem, either, though
1887 : * it would reduce the chances of it happening in practice. The only
1888 : * real fix here seems to be to have some kind of interlock that
1889 : * allows us to wait until we can be certain that no write to the
1890 : * block is in progress. Since we don't have any such thing right now,
1891 : * we just do this and hope for the best.
1892 : */
1893 28 : reread_cnt =
1894 28 : basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i,
1895 28 : BLCKSZ, offset + BLCKSZ * i,
1896 : readfilename, false);
1897 28 : if (reread_cnt == 0)
1898 : {
1899 : /*
1900 : * If we hit end-of-file, a concurrent truncation must have
1901 : * occurred, so reduce cnt to reflect only the blocks already
1902 : * processed and break out of this loop.
1903 : */
1904 0 : cnt = BLCKSZ * i;
1905 0 : break;
1906 : }
1907 :
1908 : /* If the page now looks OK, go on to the next one. */
1909 28 : if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1910 : &expected_checksum))
1911 0 : continue;
1912 :
1913 : /* Handle checksum failure. */
1914 28 : (*checksum_failures)++;
1915 28 : if (*checksum_failures <= 5)
1916 24 : ereport(WARNING,
1917 : (errmsg("checksum verification failed in "
1918 : "file \"%s\", block %u: calculated "
1919 : "%X but expected %X",
1920 : readfilename, blkno + i, expected_checksum,
1921 : ((PageHeader) page)->pd_checksum)));
1922 28 : if (*checksum_failures == 5)
1923 4 : ereport(WARNING,
1924 : (errmsg("further checksum verification "
1925 : "failures in file \"%s\" will not "
1926 : "be reported", readfilename)));
1927 : }
1928 :
1929 67154 : return cnt;
1930 : }
1931 :
1932 : /*
1933 : * Push data into a bbsink.
1934 : *
1935 : * It's better, when possible, to read data directly into the bbsink's buffer,
1936 : * rather than using this function to copy it into the buffer; this function is
1937 : * for cases where that approach is not practical.
1938 : *
1939 : * bytes_done should point to a count of the number of bytes that are
1940 : * currently used in the bbsink's buffer. Upon return, the bytes identified by
1941 : * data and length will have been copied into the bbsink's buffer, flushing
1942 : * as required, and *bytes_done will have been updated accordingly. If the
1943 : * buffer was flushed, the previous contents will also have been fed to
1944 : * checksum_ctx.
1945 : *
1946 : * Note that after one or more calls to this function it is the caller's
1947 : * responsibility to perform any required final flush.
1948 : */
1949 : static void
1950 37278 : push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx,
1951 : size_t *bytes_done, void *data, size_t length)
1952 : {
1953 37278 : while (length > 0)
1954 : {
1955 : size_t bytes_to_copy;
1956 :
1957 : /*
1958 : * We use < here rather than <= so that if the data exactly fills the
1959 : * remaining buffer space, we trigger a flush now.
1960 : */
1961 28016 : if (length < sink->bbs_buffer_length - *bytes_done)
1962 : {
1963 : /* Append remaining data to buffer. */
1964 28016 : memcpy(sink->bbs_buffer + *bytes_done, data, length);
1965 28016 : *bytes_done += length;
1966 28016 : return;
1967 : }
1968 :
1969 : /* Copy until buffer is full and flush it. */
1970 0 : bytes_to_copy = sink->bbs_buffer_length - *bytes_done;
1971 0 : memcpy(sink->bbs_buffer + *bytes_done, data, bytes_to_copy);
1972 0 : data = ((char *) data) + bytes_to_copy;
1973 0 : length -= bytes_to_copy;
1974 0 : bbsink_archive_contents(sink, sink->bbs_buffer_length);
1975 0 : if (pg_checksum_update(checksum_ctx, (uint8 *) sink->bbs_buffer,
1976 : sink->bbs_buffer_length) < 0)
1977 0 : elog(ERROR, "could not update checksum");
1978 0 : *bytes_done = 0;
1979 : }
1980 : }
1981 :
1982 : /*
1983 : * Try to verify the checksum for the provided page, if it seems appropriate
1984 : * to do so.
1985 : *
1986 : * Returns true if verification succeeds or if we decide not to check it,
1987 : * and false if verification fails. When return false, it also sets
1988 : * *expected_checksum to the computed value.
1989 : */
1990 : static bool
1991 165118 : verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno,
1992 : uint16 *expected_checksum)
1993 : {
1994 : PageHeader phdr;
1995 : uint16 checksum;
1996 :
1997 : /*
1998 : * Only check pages which have not been modified since the start of the
1999 : * base backup. Otherwise, they might have been written only halfway and
2000 : * the checksum would not be valid. However, replaying WAL would
2001 : * reinstate the correct page in this case. We also skip completely new
2002 : * pages, since they don't have a checksum yet.
2003 : */
2004 165118 : if (PageIsNew(page) || PageGetLSN(page) >= start_lsn)
2005 152 : return true;
2006 :
2007 : /* Perform the actual checksum calculation. */
2008 164966 : checksum = pg_checksum_page(page, blkno);
2009 :
2010 : /* See whether it matches the value from the page. */
2011 164966 : phdr = (PageHeader) page;
2012 164966 : if (phdr->pd_checksum == checksum)
2013 164910 : return true;
2014 56 : *expected_checksum = checksum;
2015 56 : return false;
2016 : }
2017 :
2018 : static int64
2019 282680 : _tarWriteHeader(bbsink *sink, const char *filename, const char *linktarget,
2020 : struct stat *statbuf, bool sizeonly)
2021 : {
2022 : enum tarError rc;
2023 :
2024 282680 : if (!sizeonly)
2025 : {
2026 : /*
2027 : * As of this writing, the smallest supported block size is 1kB, which
2028 : * is twice TAR_BLOCK_SIZE. Since the buffer size is required to be a
2029 : * multiple of BLCKSZ, it should be safe to assume that the buffer is
2030 : * large enough to fit an entire tar block. We double-check by means
2031 : * of these assertions.
2032 : */
2033 : StaticAssertDecl(TAR_BLOCK_SIZE <= BLCKSZ,
2034 : "BLCKSZ too small for tar block");
2035 : Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
2036 :
2037 275238 : rc = tarCreateHeader(sink->bbs_buffer, filename, linktarget,
2038 : statbuf->st_size, statbuf->st_mode,
2039 : statbuf->st_uid, statbuf->st_gid,
2040 : statbuf->st_mtime);
2041 :
2042 275238 : switch (rc)
2043 : {
2044 275236 : case TAR_OK:
2045 275236 : break;
2046 2 : case TAR_NAME_TOO_LONG:
2047 2 : ereport(ERROR,
2048 : (errmsg("file name too long for tar format: \"%s\"",
2049 : filename)));
2050 : break;
2051 0 : case TAR_SYMLINK_TOO_LONG:
2052 0 : ereport(ERROR,
2053 : (errmsg("symbolic link target too long for tar format: "
2054 : "file name \"%s\", target \"%s\"",
2055 : filename, linktarget)));
2056 : break;
2057 0 : default:
2058 0 : elog(ERROR, "unrecognized tar error: %d", rc);
2059 : }
2060 :
2061 275236 : bbsink_archive_contents(sink, TAR_BLOCK_SIZE);
2062 : }
2063 :
2064 282678 : return TAR_BLOCK_SIZE;
2065 : }
2066 :
2067 : /*
2068 : * Pad with zero bytes out to a multiple of TAR_BLOCK_SIZE.
2069 : */
2070 : static void
2071 267924 : _tarWritePadding(bbsink *sink, int len)
2072 : {
2073 267924 : int pad = tarPaddingBytesRequired(len);
2074 :
2075 : /*
2076 : * As in _tarWriteHeader, it should be safe to assume that the buffer is
2077 : * large enough that we don't need to do this in multiple chunks.
2078 : */
2079 : Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE);
2080 : Assert(pad <= TAR_BLOCK_SIZE);
2081 :
2082 267924 : if (pad > 0)
2083 : {
2084 62414 : MemSet(sink->bbs_buffer, 0, pad);
2085 13224 : bbsink_archive_contents(sink, pad);
2086 : }
2087 267924 : }
2088 :
2089 : /*
2090 : * If the entry in statbuf is a link, then adjust statbuf to make it look like a
2091 : * directory, so that it will be written that way.
2092 : */
2093 : static void
2094 4398 : convert_link_to_directory(const char *pathbuf, struct stat *statbuf)
2095 : {
2096 : /* If symlink, write it as a directory anyway */
2097 4398 : if (S_ISLNK(statbuf->st_mode))
2098 136 : statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
2099 4398 : }
2100 :
2101 : /*
2102 : * Read some data from a file, setting a wait event and reporting any error
2103 : * encountered.
2104 : *
2105 : * If partial_read_ok is false, also report an error if the number of bytes
2106 : * read is not equal to the number of bytes requested.
2107 : *
2108 : * Returns the number of bytes read.
2109 : */
2110 : static ssize_t
2111 318696 : basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
2112 : const char *filename, bool partial_read_ok)
2113 : {
2114 : ssize_t rc;
2115 :
2116 318696 : pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ);
2117 318696 : rc = pg_pread(fd, buf, nbytes, offset);
2118 318696 : pgstat_report_wait_end();
2119 :
2120 318696 : if (rc < 0)
2121 0 : ereport(ERROR,
2122 : (errcode_for_file_access(),
2123 : errmsg("could not read file \"%s\": %m", filename)));
2124 318696 : if (!partial_read_ok && rc > 0 && rc != nbytes)
2125 0 : ereport(ERROR,
2126 : (errcode_for_file_access(),
2127 : errmsg("could not read file \"%s\": read %zd of %zu",
2128 : filename, rc, nbytes)));
2129 :
2130 318696 : return rc;
2131 : }
|