Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * datachecksum_state.c
4 : * Background worker for enabling or disabling data checksums online as
5 : * well as functionality for manipulating data checksum state
6 : *
7 : * When enabling data checksums on a cluster at initdb time or when shut down
8 : * with pg_checksums, no extra process is required as each page is checksummed,
9 : * and verified, when accessed. When enabling checksums on an already running
10 : * cluster, this worker will ensure that all pages are checksummed before
11 : * verification of the checksums is turned on. In the case of disabling
12 : * checksums, the state transition is performed only in the control file, no
13 : * changes are performed on the data pages.
14 : *
15 : * Checksums can be either enabled or disabled cluster-wide, with on/off being
16 : * the end state for data_checksums.
17 : *
18 : * 1. Enabling checksums
19 : * ---------------------
20 : * When enabling checksums in an online cluster, data_checksums will be set to
21 : * "inprogress-on" which signals that write operations MUST compute and write
22 : * the checksum on the data page, but during reading the checksum SHALL NOT be
23 : * verified. This ensures that all objects created while checksums are
24 : * being enabled will have checksums set, but reads won't fail due to missing or
25 : * invalid checksums. Invalid checksums can be present in case the cluster had
26 : * checksums enabled, then disabled them and updated the page while they were
27 : * disabled.
28 : *
29 : * The DataChecksumsWorker will compile a list of all databases at the start,
30 : * any databases created concurrently will see the in-progress state and will
31 : * be checksummed automatically. All databases from the original list MUST BE
32 : * successfully processed in order for data checksums to be enabled, the only
33 : * exceptions are databases which are dropped before having been processed.
34 : *
35 : * For each database, all relations which have storage are read and every data
36 : * page is marked dirty to force a write with the checksum. This will generate
37 : * a lot of WAL as the entire database is read and written.
38 : *
39 : * If the processing is interrupted by a cluster crash or restart, it needs to
40 : * be restarted from the beginning again as state isn't persisted.
41 : *
42 : * 2. Disabling checksums
43 : * ----------------------
44 : * When disabling checksums, data_checksums will be set to "inprogress-off"
45 : * which signals that checksums are written but no longer need to be verified.
46 : * This ensures that backends which have not yet transitioned to the
47 : * "inprogress-off" state will still see valid checksums on pages.
48 : *
49 : * 3. Synchronization and Correctness
50 : * ----------------------------------
51 : * The processes involved in enabling or disabling data checksums in an
52 : * online cluster must be properly synchronized with the normal backends
53 : * serving concurrent queries to ensure correctness. Correctness is defined
54 : * as the following:
55 : *
56 : * - Backends SHALL NOT violate the data_checksums state they have agreed to
57 : * by acknowledging the procsignalbarrier: This means that all backends
58 : * MUST calculate and write data checksums during all states except off;
59 : * MUST validate checksums only in the 'on' state.
60 : * - Data checksums SHALL NOT be considered enabled cluster-wide until all
61 : * currently connected backends have state "on": This means that all
62 : * backends must wait on the procsignalbarrier to be acknowledged by all
63 : * before proceeding to validate data checksums.
64 : *
65 : * There are two steps of synchronization required for changing data_checksums
66 : * in an online cluster: (i) changing state in the active backends ("on",
67 : * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no
68 : * incompatible objects and processes are left in a database when workers end.
69 : * The former deals with cluster-wide agreement on data checksum state and the
70 : * latter with ensuring that any concurrent activity cannot break the data
71 : * checksum contract during processing.
72 : *
73 : * Synchronizing the state change is done with procsignal barriers. Before
74 : * updating the data_checksums state in the control file, all other backends must absorb the
75 : * barrier. Barrier absorption will happen during interrupt processing, which
76 : * means that connected backends will change state at different times. If
77 : * waiting for a barrier is done during startup, for example during replay, it
78 : * is important to realize that any locks held by the startup process might
79 : * cause deadlocks if backends end up waiting for those locks while startup
80 : * is waiting for a procsignalbarrier.
81 : *
82 : * 3.1 When Enabling Data Checksums
83 : * --------------------------------
84 : * A process which fails to observe data checksums being enabled can induce two
85 : * types of errors: failing to write the checksum when modifying the page and
86 : * failing to validate the data checksum on the page when reading it.
87 : *
88 : * When processing starts all backends belong to one of the below sets, with
89 : * one of Bd and Bi being empty:
90 : *
91 : * Bg: Backend updating the global state and emitting the procsignalbarrier
92 : * Bd: Backends in "off" state
93 : * Bi: Backends in "inprogress-on" state
94 : *
95 : * If processing is started in an online cluster then all backends are in Bd.
96 : * If processing was halted by the cluster shutting down (due to a crash or
97 : * intentional restart), the controlfile state "inprogress-on" will be observed
98 : * on system startup and all backends will be placed in Bd. The controlfile
99 : * state will also be set to "off".
100 : *
101 : * Backends transition Bd -> Bi via a procsignalbarrier which is emitted by the
102 : * DataChecksumsWorkerLauncherMain. When all backends have acknowledged the
103 : * barrier then Bd will be empty and the next phase can begin: calculating and
104 : * writing data checksums with DataChecksumsWorkers. When the
105 : * DataChecksumsWorker processes have finished writing checksums on all pages,
106 : * data checksums are enabled cluster-wide via another procsignalbarrier.
107 : * There are four sets of backends where Bd shall be an empty set:
108 : *
109 : * Bg: Backend updating the global state and emitting the procsignalbarrier
110 : * Bd: Backends in "off" state
111 : * Be: Backends in "on" state
112 : * Bi: Backends in "inprogress-on" state
113 : *
114 : * Backends in Bi and Be will write checksums when modifying a page, but only
115 : * backends in Be will verify the checksum during reading. The Bg backend is
116 : * blocked waiting for all backends in Bi to process interrupts and move to
117 : * Be. Any backend starting while Bg is waiting on the procsignalbarrier will
118 : * observe the global state being "on" and will thus automatically belong to
119 : * Be. Checksums are enabled cluster-wide when Bi is an empty set. Bi and Be
120 : * are compatible sets while still operating based on their local state as
121 : * both write data checksums.
122 : *
123 : * 3.2 When Disabling Data Checksums
124 : * ---------------------------------
125 : * A process which fails to observe that data checksums have been disabled
126 : * can induce two types of errors: writing the checksum when modifying the
127 : * page and validating a data checksum which is no longer correct due to
128 : * modifications to the page. The former is not an error per se as data
129 : * integrity is maintained, but it is wasteful. The latter will cause errors
130 : * in user operations. Assuming the following sets of backends:
131 : *
132 : * Bg: Backend updating the global state and emitting the procsignalbarrier
133 : * Bd: Backends in "off" state
134 : * Be: Backends in "on" state
135 : * Bo: Backends in "inprogress-off" state
136 : * Bi: Backends in "inprogress-on" state
137 : *
138 : * Backends transition from the Be state to Bd like so: Be -> Bo -> Bd. From
139 : * all other states, the transition can be straight to Bd.
140 : *
141 : * The goal is to transition all backends to Bd making the others empty sets.
142 : * Backends in Bo write data checksums, but don't validate them, such that
143 : * backends still in Be can continue to validate pages until the barrier has
144 : * been absorbed such that they are in Bo. Once all backends are in Bo, the
145 : * barrier to transition to "off" can be raised and all backends can safely
146 : * stop writing data checksums as no backend is enforcing data checksum
147 : * validation any longer.
148 : *
149 : * 4. Future opportunities for optimizations
150 : * -----------------------------------------
151 : * Below are some potential optimizations and improvements which were brought
152 : * up during reviews of this feature, but which weren't implemented in the
153 : * initial version. These are ideas listed without any validation on their
154 : * feasibility or potential payoff. More discussion on (most of) these can be
155 : * found on the -hackers threads linked to in the commit message of this
156 : * feature.
157 : *
158 : * * Launching datachecksumsworker for resuming operation from the startup
159 : * process: Currently users have to restart processing manually after a
160 : * restart since dynamic background worker cannot be started from the
161 : * postmaster. Changing the startup process could make restarting the
162 : * processing automatic on cluster restart.
163 : * * Avoid dirtying the page when checksums already match: If the checksum
164 : * on the page happens to already match we still dirty the page. It should
165 : * be enough to only do the log_newpage_buffer() call in that case.
166 : * * Teach pg_checksums to avoid checksummed pages when pg_checksums is used
167 : * to enable checksums on a cluster which is in inprogress-on state and
168 : * may have checksummed pages (make pg_checksums be able to resume an
169 : * online operation). This should only be attempted for wal_level minimal.
170 : * * Restartability (not necessarily with page granularity).
171 : * * Avoid processing databases which were created during inprogress-on.
172 : * Right now all databases are processed regardless to be safe.
173 : * * Teach CREATE DATABASE to calculate checksums for databases created
174 : * during inprogress-on with a template database which has yet to be
175 : * processed.
176 : *
177 : *
178 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
179 : * Portions Copyright (c) 1994, Regents of the University of California
180 : *
181 : *
182 : * IDENTIFICATION
183 : * src/backend/postmaster/datachecksum_state.c
184 : *
185 : *-------------------------------------------------------------------------
186 : */
187 : #include "postgres.h"
188 :
189 : #include "access/genam.h"
190 : #include "access/heapam.h"
191 : #include "access/htup_details.h"
192 : #include "access/xact.h"
193 : #include "access/xlog.h"
194 : #include "access/xloginsert.h"
195 : #include "catalog/indexing.h"
196 : #include "catalog/pg_class.h"
197 : #include "catalog/pg_database.h"
198 : #include "commands/progress.h"
199 : #include "commands/vacuum.h"
200 : #include "common/relpath.h"
201 : #include "miscadmin.h"
202 : #include "pgstat.h"
203 : #include "postmaster/bgworker.h"
204 : #include "postmaster/bgwriter.h"
205 : #include "postmaster/datachecksum_state.h"
206 : #include "storage/bufmgr.h"
207 : #include "storage/checksum.h"
208 : #include "storage/ipc.h"
209 : #include "storage/latch.h"
210 : #include "storage/lmgr.h"
211 : #include "storage/lwlock.h"
212 : #include "storage/procarray.h"
213 : #include "storage/smgr.h"
214 : #include "storage/subsystems.h"
215 : #include "tcop/tcopprot.h"
216 : #include "utils/builtins.h"
217 : #include "utils/fmgroids.h"
218 : #include "utils/injection_point.h"
219 : #include "utils/lsyscache.h"
220 : #include "utils/ps_status.h"
221 : #include "utils/syscache.h"
222 : #include "utils/wait_event.h"
223 :
/*
 * ChecksumBarrierCondition
 *		One permitted data checksum state transition.
 *
 * Every procsignal barrier used by the data checksum enable/disable
 * operations is absorbed by a single function.  That function accepts a state
 * change only when the current and the target state are listed together as a
 * from/to pair in the checksum_barriers array.
 */
typedef struct ChecksumBarrierCondition
{
	int			from;			/* current state of data checksums */
	int			to;				/* target state for data checksums */
} ChecksumBarrierCondition;
237 :
238 : static const ChecksumBarrierCondition checksum_barriers[9] =
239 : {
240 : /*
241 : * Disabling checksums: If checksums are currently enabled, disabling must
242 : * go through the 'inprogress-off' state.
243 : */
244 : {PG_DATA_CHECKSUM_VERSION, PG_DATA_CHECKSUM_INPROGRESS_OFF},
245 : {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_OFF},
246 :
247 : /*
248 : * If checksums are in the process of being enabled, but are not yet being
249 : * verified, we can abort by going back to 'off' state.
250 : */
251 : {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_OFF},
252 :
253 : /*
254 : * Enabling checksums must normally go through the 'inprogress-on' state.
255 : */
256 : {PG_DATA_CHECKSUM_OFF, PG_DATA_CHECKSUM_INPROGRESS_ON},
257 : {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_VERSION},
258 :
259 : /*
260 : * If checksums are being disabled but all backends are still computing
261 : * checksums, we can go straight back to 'on'
262 : */
263 : {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_VERSION},
264 :
265 : /*
266 : * If checksums are being enabled when launcher_exit is executed, state is
267 : * set to off since we cannot reach on at that point.
268 : */
269 : {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_INPROGRESS_OFF},
270 :
271 : /*
272 : * Transitions that can happen when a new request is made while another is
273 : * currently being processed.
274 : */
275 : {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_INPROGRESS_ON},
276 : {PG_DATA_CHECKSUM_OFF, PG_DATA_CHECKSUM_INPROGRESS_OFF},
277 : };
278 :
279 : /*
280 : * Signaling between backends calling pg_enable/disable_data_checksums, the
281 : * checksums launcher process, and the checksums worker process.
282 : *
283 : * This struct is protected by DataChecksumsWorkerLock
284 : */
285 : typedef struct DataChecksumsStateStruct
286 : {
287 : /*
288 : * These are set by pg_{enable|disable}_data_checksums, to tell the
289 : * launcher what the target state is.
290 : */
291 : DataChecksumsWorkerOperation launch_operation;
292 : int launch_cost_delay;
293 : int launch_cost_limit;
294 :
295 : /*
296 : * Is a launcher process currently running? This is set by the main
297 : * launcher process, after it has read the above launch_* parameters.
298 : */
299 : bool launcher_running;
300 :
301 : /*
302 : * Is a worker process currently running? This is set by the worker
303 : * launcher when it starts waiting for a worker process to finish.
304 : */
305 : int worker_pid;
306 :
307 : /*
308 : * These fields indicate the target state that the launcher is currently
309 : * working towards. They can be different from the corresponding launch_*
310 : * fields, if a new pg_enable/disable_data_checksums() call was made while
311 : * the launcher/worker was already running.
312 : *
313 : * The below members are set when the launcher starts, and are only
314 : * accessed read-only by the single worker. Thus, we can access these
315 : * without a lock. If multiple workers, or dynamic cost parameters, are
316 : * supported at some point then this would need to be revisited.
317 : */
318 : DataChecksumsWorkerOperation operation;
319 : int cost_delay;
320 : int cost_limit;
321 :
322 : /*
323 : * Signaling between the launcher and the worker process.
324 : *
325 : * As there is only a single worker, and the launcher won't read these
326 : * until the worker exits, they can be accessed without the need for a
327 : * lock. If multiple workers are supported then this will have to be
328 : * revisited.
329 : */
330 :
331 : /* result, set by worker before exiting */
332 : DataChecksumsWorkerResult success;
333 :
334 : /*
335 : * Tells the worker process whether it should also process the shared
336 : * catalogs
337 : */
338 : bool process_shared_catalogs;
339 : } DataChecksumsStateStruct;
340 :
341 : /* Shared memory segment for datachecksumsworker */
342 : static DataChecksumsStateStruct *DataChecksumState;
343 :
344 : typedef struct DataChecksumsWorkerDatabase
345 : {
346 : Oid dboid;
347 : char *dbname;
348 : } DataChecksumsWorkerDatabase;
349 :
350 : /* Flag set by the interrupt handler */
351 : static volatile sig_atomic_t abort_requested = false;
352 :
353 : /*
354 : * Have we set the DataChecksumsStateStruct->launcher_running flag?
355 : * If we have, we need to clear it before exiting!
356 : */
357 : static volatile sig_atomic_t launcher_running = false;
358 :
359 : /* Are we enabling data checksums, or disabling them? */
360 : static DataChecksumsWorkerOperation operation;
361 :
362 : /* Prototypes */
363 : static void DataChecksumsShmemRequest(void *arg);
364 : static bool DatabaseExists(Oid dboid);
365 : static List *BuildDatabaseList(void);
366 : static List *BuildRelationList(bool temp_relations, bool include_shared);
367 : static void FreeDatabaseList(List *dblist);
368 : static DataChecksumsWorkerResult ProcessDatabase(DataChecksumsWorkerDatabase *db);
369 : static bool ProcessAllDatabases(void);
370 : static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy);
371 : static void launcher_cancel_handler(SIGNAL_ARGS);
372 : static void WaitForAllTransactionsToFinish(void);
373 :
374 : const ShmemCallbacks DataChecksumsShmemCallbacks = {
375 : .request_fn = DataChecksumsShmemRequest,
376 : };
377 :
/*
 * CHECK_FOR_ABORT_REQUEST
 *		Flag an abort if the requested operation has changed.
 *
 * While holding DataChecksumsWorkerLock in shared mode, compare the
 * launch_operation requested in shared memory with the operation of this
 * process and set abort_requested when the two differ.  The flag is only
 * ever set here, never cleared.
 */
#define CHECK_FOR_ABORT_REQUEST() \
	do { \
		LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED); \
		if (operation != DataChecksumState->launch_operation) \
			abort_requested = true; \
		LWLockRelease(DataChecksumsWorkerLock); \
	} while (0)
385 :
386 :
387 : /*****************************************************************************
388 : * Functionality for manipulating the data checksum state in the cluster
389 : */
390 :
391 : void
392 6 : EmitAndWaitDataChecksumsBarrier(uint32 state)
393 : {
394 : uint64 barrier;
395 :
396 6 : switch (state)
397 : {
398 2 : case PG_DATA_CHECKSUM_INPROGRESS_ON:
399 2 : barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
400 2 : WaitForProcSignalBarrier(barrier);
401 2 : break;
402 :
403 1 : case PG_DATA_CHECKSUM_INPROGRESS_OFF:
404 1 : barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
405 1 : WaitForProcSignalBarrier(barrier);
406 1 : break;
407 :
408 2 : case PG_DATA_CHECKSUM_VERSION:
409 2 : barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
410 2 : WaitForProcSignalBarrier(barrier);
411 2 : break;
412 :
413 1 : case PG_DATA_CHECKSUM_OFF:
414 1 : barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
415 1 : WaitForProcSignalBarrier(barrier);
416 1 : break;
417 :
418 6 : default:
419 : Assert(false);
420 : }
421 6 : }
422 :
423 : /*
424 : * AbsorbDataChecksumsBarrier
425 : * Generic function for absorbing data checksum state changes
426 : *
427 : * All procsignalbarriers regarding data checksum state changes are absorbed
428 : * with this function. The set of conditions required for the state change to
429 : * be accepted are listed in the checksum_barriers struct, target_state is
430 : * used to look up the relevant entry.
431 : */
432 : bool
433 264 : AbsorbDataChecksumsBarrier(ProcSignalBarrierType barrier)
434 : {
435 : uint32 target_state;
436 264 : int current = data_checksums;
437 264 : bool found = false;
438 :
439 : /*
440 : * Translate the barrier condition to the target state, doing it here
441 : * instead of in the procsignal code saves the latter from knowing about
442 : * checksum states.
443 : */
444 264 : switch (barrier)
445 : {
446 88 : case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON:
447 88 : target_state = PG_DATA_CHECKSUM_INPROGRESS_ON;
448 88 : break;
449 70 : case PROCSIGNAL_BARRIER_CHECKSUM_ON:
450 70 : target_state = PG_DATA_CHECKSUM_VERSION;
451 70 : break;
452 56 : case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF:
453 56 : target_state = PG_DATA_CHECKSUM_INPROGRESS_OFF;
454 56 : break;
455 50 : case PROCSIGNAL_BARRIER_CHECKSUM_OFF:
456 50 : target_state = PG_DATA_CHECKSUM_OFF;
457 50 : break;
458 0 : default:
459 0 : elog(ERROR, "incorrect barrier \"%i\" received", barrier);
460 : }
461 :
462 : /*
463 : * If the target state matches the current state then the barrier has been
464 : * repeated.
465 : */
466 264 : if (current == target_state)
467 0 : return true;
468 :
469 : /*
470 : * If the cluster is in recovery we skip the validation of current state
471 : * since the replay is trusted.
472 : */
473 264 : if (RecoveryInProgress())
474 : {
475 36 : SetLocalDataChecksumState(target_state);
476 36 : return true;
477 : }
478 :
479 : /*
480 : * Find the barrier condition definition for the target state. Not finding
481 : * a condition would be a grave programmer error as the states are a
482 : * discrete set.
483 : */
484 1056 : for (int i = 0; i < lengthof(checksum_barriers) && !found; i++)
485 : {
486 828 : if (checksum_barriers[i].from == current && checksum_barriers[i].to == target_state)
487 228 : found = true;
488 : }
489 :
490 : /*
491 : * If the relevant state criteria aren't satisfied, throw an error which
492 : * will be caught by the procsignal machinery for a later retry.
493 : */
494 228 : if (!found)
495 0 : ereport(ERROR,
496 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
497 : errmsg("incorrect data checksum state %i for target state %i",
498 : current, target_state));
499 :
500 228 : SetLocalDataChecksumState(target_state);
501 228 : return true;
502 : }
503 :
504 :
505 : /*
506 : * Disables data checksums for the cluster, if applicable. Starts a background
507 : * worker which turns off the data checksums.
508 : */
509 : Datum
510 6 : disable_data_checksums(PG_FUNCTION_ARGS)
511 : {
512 6 : PreventCommandDuringRecovery("pg_disable_data_checksums()");
513 :
514 6 : if (!superuser())
515 0 : ereport(ERROR,
516 : errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
517 : errmsg("must be superuser to change data checksum state"));
518 :
519 6 : StartDataChecksumsWorkerLauncher(DISABLE_DATACHECKSUMS, 0, 0);
520 6 : PG_RETURN_VOID();
521 : }
522 :
523 : /*
524 : * Enables data checksums for the cluster, if applicable. Supports vacuum-
525 : * like cost based throttling to limit system load. Starts a background worker
526 : * which updates data checksums on existing data.
527 : */
528 : Datum
529 10 : enable_data_checksums(PG_FUNCTION_ARGS)
530 : {
531 10 : int cost_delay = PG_GETARG_INT32(0);
532 10 : int cost_limit = PG_GETARG_INT32(1);
533 :
534 10 : PreventCommandDuringRecovery("pg_enable_data_checksums()");
535 :
536 10 : if (!superuser())
537 0 : ereport(ERROR,
538 : errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
539 : errmsg("must be superuser to change data checksum state"));
540 :
541 10 : if (cost_delay < 0)
542 0 : ereport(ERROR,
543 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
544 : errmsg("cost delay cannot be a negative value"));
545 :
546 10 : if (cost_limit <= 0)
547 0 : ereport(ERROR,
548 : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
549 : errmsg("cost limit must be greater than zero"));
550 :
551 10 : StartDataChecksumsWorkerLauncher(ENABLE_DATACHECKSUMS, cost_delay, cost_limit);
552 :
553 10 : PG_RETURN_VOID();
554 : }
555 :
556 :
557 : /*****************************************************************************
558 : * Functionality for running the datachecksumsworker and associated launcher
559 : */
560 :
561 : /*
562 : * StartDataChecksumsWorkerLauncher
563 : * Main entry point for datachecksumsworker launcher process
564 : *
565 : * The main entrypoint for starting data checksums processing for enabling as
566 : * well as disabling.
567 : */
568 : void
569 16 : StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
570 : int cost_delay,
571 : int cost_limit)
572 : {
573 : BackgroundWorker bgw;
574 : BackgroundWorkerHandle *bgw_handle;
575 : bool running;
576 :
577 : #ifdef USE_ASSERT_CHECKING
578 : /* The cost delay settings have no effect when disabling */
579 : if (op == DISABLE_DATACHECKSUMS)
580 : Assert(cost_delay == 0 && cost_limit == 0);
581 : #endif
582 :
583 16 : INJECTION_POINT("datachecksumsworker-startup-delay", NULL);
584 :
585 : /* Store the desired state in shared memory */
586 16 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
587 :
588 16 : DataChecksumState->launch_operation = op;
589 16 : DataChecksumState->launch_cost_delay = cost_delay;
590 16 : DataChecksumState->launch_cost_limit = cost_limit;
591 :
592 : /* Is the launcher already running? If so, what is it doing? */
593 16 : running = DataChecksumState->launcher_running;
594 :
595 16 : LWLockRelease(DataChecksumsWorkerLock);
596 :
597 : /*
598 : * Launch a new launcher process, if it's not running already.
599 : *
600 : * If the launcher is currently busy enabling the checksums, and we want
601 : * them disabled (or vice versa), the launcher will notice that at latest
602 : * when it's about to exit, and will loop back process the new request. So
603 : * if the launcher is already running, we don't need to do anything more
604 : * here to abort it.
605 : *
606 : * If you call pg_enable/disable_data_checksums() twice in a row, before
607 : * the launcher has had a chance to start up, we still end up launching it
608 : * twice. That's OK, the second invocation will see that a launcher is
609 : * already running and exit quickly.
610 : */
611 16 : if (!running)
612 : {
613 16 : if ((op == ENABLE_DATACHECKSUMS && DataChecksumsOn()) ||
614 6 : (op == DISABLE_DATACHECKSUMS && DataChecksumsOff()))
615 : {
616 3 : ereport(LOG,
617 : errmsg("data checksums already in desired state, exiting"));
618 3 : return;
619 : }
620 :
621 : /*
622 : * Prepare the BackgroundWorker and launch it.
623 : */
624 13 : memset(&bgw, 0, sizeof(bgw));
625 13 : bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
626 13 : bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
627 13 : snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
628 13 : snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DataChecksumsWorkerLauncherMain");
629 13 : snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum launcher");
630 13 : snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum launcher");
631 13 : bgw.bgw_restart_time = BGW_NEVER_RESTART;
632 13 : bgw.bgw_notify_pid = MyProcPid;
633 13 : bgw.bgw_main_arg = (Datum) 0;
634 :
635 13 : if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
636 0 : ereport(ERROR,
637 : errcode(ERRCODE_INSUFFICIENT_RESOURCES),
638 : errmsg("failed to start background worker to process data checksums"));
639 : }
640 : else
641 : {
642 0 : ereport(LOG,
643 : errmsg("data checksum processing already running"));
644 : }
645 : }
646 :
647 : /*
648 : * ProcessSingleRelationFork
649 : * Enable data checksums in a single relation/fork.
650 : *
651 : * Returns true if successful, and false if *aborted*. On error, an actual
652 : * error is raised in the lower levels.
653 : */
654 : static bool
655 7598 : ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy)
656 : {
657 7598 : BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum);
658 : char activity[NAMEDATALEN * 2 + 128];
659 : char *relns;
660 :
661 7598 : relns = get_namespace_name(RelationGetNamespace(reln));
662 :
663 : /* Report the current relation to pg_stat_activity */
664 7598 : snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s, %u blocks)",
665 7598 : (relns ? relns : ""), RelationGetRelationName(reln), forkNames[forkNum], numblocks);
666 7598 : pgstat_report_activity(STATE_RUNNING, activity);
667 7598 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL, numblocks);
668 7598 : if (relns)
669 7598 : pfree(relns);
670 :
671 : /*
672 : * We are looping over the blocks which existed at the time of process
673 : * start, which is safe since new blocks are created with checksums set
674 : * already due to the state being "inprogress-on".
675 : */
676 47916 : for (BlockNumber blknum = 0; blknum < numblocks; blknum++)
677 : {
678 40319 : Buffer buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy);
679 :
680 : /* Need to get an exclusive lock to mark the buffer as dirty */
681 40319 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
682 :
683 : /*
684 : * Mark the buffer as dirty and force a full page write. We have to
685 : * re-write the page to WAL even if the checksum hasn't changed,
686 : * because if there is a replica it might have a slightly different
687 : * version of the page with an invalid checksum, caused by unlogged
688 : * changes (e.g. hint bits) on the primary happening while checksums
689 : * were off. This can happen if there was a valid checksum on the page
690 : * at one point in the past, so only when checksums are first on, then
691 : * off, and then turned on again. TODO: investigate if this could be
692 : * avoided if the checksum is calculated to be correct and wal_level
693 : * is set to "minimal".
694 : *
695 : * Unlogged relations don't need WAL since they are reset to their
696 : * init fork on recovery. We still dirty the buffer so that the
697 : * checksum is written to disk at the next checkpoint.
698 : *
699 : * The init fork is an exception: it is WAL-logged so the standby can
700 : * materialize the relation after promotion (see
701 : * ResetUnloggedRelations()). Skipping it here would leave the
702 : * standby with a stale init fork that, once copied to the main fork
703 : * on promotion, would fail checksum verification on every read.
704 : */
705 40319 : START_CRIT_SECTION();
706 40319 : MarkBufferDirty(buf);
707 40319 : if (RelationNeedsWAL(reln) || forkNum == INIT_FORKNUM)
708 40285 : log_newpage_buffer(buf, false);
709 40319 : END_CRIT_SECTION();
710 :
711 40319 : UnlockReleaseBuffer(buf);
712 :
713 : /*
714 : * This is the only place where we check if we are asked to abort, the
715 : * abortion will bubble up from here.
716 : */
717 : Assert(operation == ENABLE_DATACHECKSUMS);
718 40319 : LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
719 40319 : if (DataChecksumState->launch_operation == DISABLE_DATACHECKSUMS)
720 0 : abort_requested = true;
721 40319 : LWLockRelease(DataChecksumsWorkerLock);
722 :
723 40319 : if (abort_requested)
724 0 : return false;
725 :
726 : /* update the block counter */
727 40319 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
728 40319 : (blknum + 1));
729 :
730 : /*
731 : * Processing is re-using the vacuum cost delay for process
732 : * throttling, hence why we call vacuum APIs here.
733 : */
734 40319 : vacuum_delay_point(false);
735 : }
736 :
737 7597 : return true;
738 : }
739 :
740 : /*
741 : * ProcessSingleRelationByOid
742 : * Process a single relation based on oid.
743 : *
744 : * Returns true if successful, and false if *aborted*. On error, an actual
745 : * error is raised in the lower levels.
746 : */
747 : static bool
748 5866 : ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy)
749 : {
750 : Relation rel;
751 5866 : bool aborted = false;
752 :
753 5866 : StartTransactionCommand();
754 :
755 5866 : rel = try_relation_open(relationId, AccessShareLock);
756 5866 : if (rel == NULL)
757 : {
758 : /*
759 : * Relation no longer exists. We don't consider this an error since
760 : * there are no pages in it that need data checksums, and thus return
761 : * true. The worker operates off a list of relations generated at the
762 : * start of processing, so relations being dropped in the meantime is
763 : * to be expected.
764 : */
765 0 : CommitTransactionCommand();
766 0 : pgstat_report_activity(STATE_IDLE, NULL);
767 0 : return true;
768 : }
769 5866 : RelationGetSmgr(rel);
770 :
771 29326 : for (ForkNumber fnum = 0; fnum <= MAX_FORKNUM; fnum++)
772 : {
773 23461 : if (smgrexists(rel->rd_smgr, fnum))
774 : {
775 7598 : if (!ProcessSingleRelationFork(rel, fnum, strategy))
776 : {
777 0 : aborted = true;
778 0 : break;
779 : }
780 : }
781 : }
782 5865 : relation_close(rel, AccessShareLock);
783 :
784 5865 : CommitTransactionCommand();
785 5865 : pgstat_report_activity(STATE_IDLE, NULL);
786 :
787 5865 : return !aborted;
788 : }
789 :
790 : /*
791 : * ProcessDatabase
792 : * Enable data checksums in a single database.
793 : *
794 : * We do this by launching a dynamic background worker into this database, and
795 : * waiting for it to finish. We have to do this in a separate worker, since
796 : * each process can only be connected to one database during its lifetime.
797 : */
798 : static DataChecksumsWorkerResult
799 23 : ProcessDatabase(DataChecksumsWorkerDatabase *db)
800 : {
801 : BackgroundWorker bgw;
802 : BackgroundWorkerHandle *bgw_handle;
803 : BgwHandleStatus status;
804 : pid_t pid;
805 : char activity[NAMEDATALEN + 64];
806 :
807 23 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
808 23 : DataChecksumState->success = DATACHECKSUMSWORKER_FAILED;
809 23 : LWLockRelease(DataChecksumsWorkerLock);
810 :
811 23 : memset(&bgw, 0, sizeof(bgw));
812 23 : bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
813 23 : bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
814 23 : snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
815 23 : snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", "DataChecksumsWorkerMain");
816 23 : snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum worker");
817 23 : snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum worker");
818 23 : bgw.bgw_restart_time = BGW_NEVER_RESTART;
819 23 : bgw.bgw_notify_pid = MyProcPid;
820 23 : bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid);
821 :
822 : /*
823 : * If there are no worker slots available, there is little we can do. If
824 : * we retry in a bit it's still unlikely that the user has managed to
825 : * reconfigure in the meantime and we'd be run through retries fast.
826 : */
827 23 : if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
828 : {
829 0 : ereport(WARNING,
830 : errmsg("could not start background worker for enabling data checksums in database \"%s\"",
831 : db->dbname),
832 : errhint("The \"%s\" setting might be too low.", "max_worker_processes"));
833 0 : return DATACHECKSUMSWORKER_FAILED;
834 : }
835 :
836 23 : status = WaitForBackgroundWorkerStartup(bgw_handle, &pid);
837 23 : if (status == BGWH_STOPPED)
838 : {
839 : /*
840 : * If the worker managed to start, and stop, before we got to waiting
841 : * for it we can see a STOPPED status here without it being a failure.
842 : */
843 0 : LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
844 0 : if (DataChecksumState->success == DATACHECKSUMSWORKER_SUCCESSFUL)
845 : {
846 0 : LWLockRelease(DataChecksumsWorkerLock);
847 0 : pgstat_report_activity(STATE_IDLE, NULL);
848 0 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
849 0 : DataChecksumState->worker_pid = InvalidPid;
850 0 : LWLockRelease(DataChecksumsWorkerLock);
851 0 : return DataChecksumState->success;
852 : }
853 0 : LWLockRelease(DataChecksumsWorkerLock);
854 :
855 0 : ereport(WARNING,
856 : errmsg("could not start background worker for enabling data checksums in database \"%s\"",
857 : db->dbname),
858 : errhint("More details on the error might be found in the server log."));
859 :
860 : /*
861 : * Heuristic to see if the database was dropped, and if it was we can
862 : * treat it as not an error, else treat as fatal and error out.
863 : */
864 0 : if (DatabaseExists(db->dboid))
865 0 : return DATACHECKSUMSWORKER_FAILED;
866 : else
867 0 : return DATACHECKSUMSWORKER_DROPDB;
868 : }
869 :
870 : /*
871 : * If the postmaster crashed we cannot end up with a processed database so
872 : * we have no alternative other than exiting. When enabling checksums we
873 : * won't at this time have changed the data checksums state in pg_control
874 : * to enabled so when the cluster comes back up processing will have to be
875 : * restarted.
876 : */
877 23 : if (status == BGWH_POSTMASTER_DIED)
878 0 : ereport(FATAL,
879 : errcode(ERRCODE_ADMIN_SHUTDOWN),
880 : errmsg("cannot enable data checksums without the postmaster process"),
881 : errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
882 :
883 : Assert(status == BGWH_STARTED);
884 23 : ereport(LOG,
885 : errmsg("initiating data checksum processing in database \"%s\"",
886 : db->dbname));
887 :
888 : /* Save the pid of the worker so we can signal it later */
889 23 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
890 23 : DataChecksumState->worker_pid = pid;
891 23 : LWLockRelease(DataChecksumsWorkerLock);
892 :
893 23 : snprintf(activity, sizeof(activity) - 1,
894 : "Waiting for worker in database %s (pid %ld)", db->dbname, (long) pid);
895 23 : pgstat_report_activity(STATE_RUNNING, activity);
896 :
897 23 : status = WaitForBackgroundWorkerShutdown(bgw_handle);
898 22 : if (status == BGWH_POSTMASTER_DIED)
899 0 : ereport(FATAL,
900 : errcode(ERRCODE_ADMIN_SHUTDOWN),
901 : errmsg("postmaster exited during data checksum processing in \"%s\"",
902 : db->dbname),
903 : errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
904 :
905 22 : LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
906 22 : if (DataChecksumState->success == DATACHECKSUMSWORKER_ABORTED)
907 0 : ereport(LOG,
908 : errmsg("data checksums processing was aborted in database \"%s\"",
909 : db->dbname));
910 22 : LWLockRelease(DataChecksumsWorkerLock);
911 :
912 22 : pgstat_report_activity(STATE_IDLE, NULL);
913 22 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
914 22 : DataChecksumState->worker_pid = InvalidPid;
915 22 : LWLockRelease(DataChecksumsWorkerLock);
916 :
917 22 : return DataChecksumState->success;
918 : }
919 :
920 : /*
921 : * launcher_exit
922 : *
923 : * Internal routine for cleaning up state when a launcher process which has
924 : * performed checksum operations exits. A launcher process which is exiting due
925 : * to a duplicate started launcher does not need to perform any cleanup and
926 : * this function should not be called. Otherwise, we need to clean up the abort
927 : * flag to ensure that processing started again if it was previously aborted
928 : * (note: started again, *not* restarted from where it left off).
929 : */
930 : static void
931 13 : launcher_exit(int code, Datum arg)
932 : {
933 13 : abort_requested = false;
934 :
935 13 : if (launcher_running)
936 : {
937 2 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
938 2 : if (DataChecksumState->worker_pid != InvalidPid)
939 : {
940 1 : ereport(LOG,
941 : errmsg("data checksums launcher exiting while worker is still running, signalling worker"));
942 1 : kill(DataChecksumState->worker_pid, SIGTERM);
943 : }
944 2 : LWLockRelease(DataChecksumsWorkerLock);
945 : }
946 :
947 : /*
948 : * If the launcher is exiting before data checksums are enabled then set
949 : * the state to off since processing cannot be resumed.
950 : */
951 13 : if (DataChecksumsInProgressOn())
952 1 : SetDataChecksumsOff();
953 :
954 13 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
955 13 : launcher_running = false;
956 13 : DataChecksumState->launcher_running = false;
957 13 : LWLockRelease(DataChecksumsWorkerLock);
958 13 : }
959 :
960 : /*
961 : * launcher_cancel_handler
962 : *
963 : * Internal routine for reacting to SIGINT and flagging the worker to abort.
964 : * The worker won't be interrupted immediately but will check for abort flag
965 : * between each block in a relation.
966 : */
967 : static void
968 0 : launcher_cancel_handler(SIGNAL_ARGS)
969 : {
970 0 : int save_errno = errno;
971 :
972 0 : abort_requested = true;
973 :
974 : /*
975 : * There is no sleeping in the main loop, the flag will be checked
976 : * periodically in ProcessSingleRelationFork. The worker does however
977 : * sleep when waiting for concurrent transactions to end so we still need
978 : * to set the latch.
979 : */
980 0 : SetLatch(MyLatch);
981 :
982 0 : errno = save_errno;
983 0 : }
984 :
985 : /*
986 : * WaitForAllTransactionsToFinish
987 : * Blocks awaiting all current transactions to finish
988 : *
989 : * Returns when all transactions which are active at the call of the function
990 : * have ended, or if the postmaster dies while waiting. If the postmaster dies
991 : * the abort flag will be set to indicate that the caller of this shouldn't
992 : * proceed.
993 : *
994 : * NB: this will return early, if aborted by SIGINT or if the target state
995 : * is changed while we're running.
996 : */
997 : static void
998 9 : WaitForAllTransactionsToFinish(void)
999 : {
1000 : TransactionId waitforxid;
1001 :
1002 9 : LWLockAcquire(XidGenLock, LW_SHARED);
1003 9 : waitforxid = XidFromFullTransactionId(TransamVariables->nextXid);
1004 9 : LWLockRelease(XidGenLock);
1005 :
1006 9 : while (TransactionIdPrecedes(GetOldestActiveTransactionId(false, true), waitforxid))
1007 : {
1008 : char activity[64];
1009 : int rc;
1010 :
1011 : /* Oldest running xid is older than us, so wait */
1012 0 : snprintf(activity,
1013 : sizeof(activity),
1014 : "Waiting for current transactions to finish (waiting for %u)",
1015 : waitforxid);
1016 0 : pgstat_report_activity(STATE_RUNNING, activity);
1017 :
1018 : /* Retry every 3 seconds */
1019 0 : ResetLatch(MyLatch);
1020 0 : rc = WaitLatch(MyLatch,
1021 : WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
1022 : 3000,
1023 : WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION);
1024 :
1025 : /*
1026 : * If the postmaster died we won't be able to enable checksums
1027 : * cluster-wide so abort and hope to continue when restarted.
1028 : */
1029 0 : if (rc & WL_POSTMASTER_DEATH)
1030 0 : ereport(FATAL,
1031 : errcode(ERRCODE_ADMIN_SHUTDOWN),
1032 : errmsg("postmaster exited during data checksums processing"),
1033 : errhint("Data checksums processing must be restarted manually after cluster restart."));
1034 :
1035 0 : CHECK_FOR_INTERRUPTS();
1036 0 : CHECK_FOR_ABORT_REQUEST();
1037 :
1038 0 : if (abort_requested)
1039 0 : break;
1040 : }
1041 :
1042 9 : pgstat_report_activity(STATE_IDLE, NULL);
1043 9 : return;
1044 : }
1045 :
1046 : /*
1047 : * DataChecksumsWorkerLauncherMain
1048 : *
1049 : * Main function for launching dynamic background workers for processing data
1050 : * checksums in databases. This function has the bgworker management, with
1051 : * ProcessAllDatabases being responsible for looping over the databases and
1052 : * initiating processing.
1053 : */
1054 : void
1055 13 : DataChecksumsWorkerLauncherMain(Datum arg)
1056 : {
1057 :
1058 13 : ereport(DEBUG1,
1059 : errmsg("background worker \"datachecksums launcher\" started"));
1060 :
1061 13 : pqsignal(SIGTERM, die);
1062 13 : pqsignal(SIGINT, launcher_cancel_handler);
1063 13 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1064 13 : pqsignal(SIGUSR2, PG_SIG_IGN);
1065 :
1066 13 : BackgroundWorkerUnblockSignals();
1067 :
1068 13 : MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER;
1069 13 : init_ps_display(NULL);
1070 :
1071 13 : INJECTION_POINT("datachecksumsworker-launcher-delay", NULL);
1072 :
1073 13 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
1074 :
1075 13 : if (DataChecksumState->launcher_running)
1076 : {
1077 0 : ereport(LOG,
1078 : errmsg("background worker \"datachecksums launcher\" already running, exiting"));
1079 : /* Launcher was already running, let it finish */
1080 0 : LWLockRelease(DataChecksumsWorkerLock);
1081 0 : return;
1082 : }
1083 :
1084 13 : on_shmem_exit(launcher_exit, 0);
1085 13 : launcher_running = true;
1086 :
1087 : /* Initialize a connection to shared catalogs only */
1088 13 : BackgroundWorkerInitializeConnectionByOid(InvalidOid, InvalidOid, 0);
1089 :
1090 13 : operation = DataChecksumState->launch_operation;
1091 13 : DataChecksumState->launcher_running = true;
1092 13 : DataChecksumState->operation = operation;
1093 13 : DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
1094 13 : DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
1095 13 : LWLockRelease(DataChecksumsWorkerLock);
1096 :
1097 : /*
1098 : * The target state can change while we are busy enabling/disabling
1099 : * checksums, if the user calls pg_disable/enable_data_checksums() before
1100 : * we are finished with the previous request. In that case, we will loop
1101 : * back here, to process the new request.
1102 : */
1103 13 : again:
1104 :
1105 13 : pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
1106 : InvalidOid);
1107 :
1108 13 : if (operation == ENABLE_DATACHECKSUMS)
1109 : {
1110 : /*
1111 : * If we are asked to enable checksums in a cluster which already has
1112 : * checksums enabled, exit immediately as there is nothing more to do.
1113 : */
1114 9 : if (DataChecksumsNeedVerify())
1115 0 : goto done;
1116 :
1117 9 : ereport(LOG,
1118 : errmsg("enabling data checksums requested, starting data checksum calculation"));
1119 :
1120 : /*
1121 : * Set the state to inprogress-on and wait on the procsignal barrier.
1122 : */
1123 9 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
1124 : PROGRESS_DATACHECKSUMS_PHASE_ENABLING);
1125 9 : SetDataChecksumsOnInProgress();
1126 :
1127 : /*
1128 : * All backends are now in inprogress-on state and are writing data
1129 : * checksums. Start processing all data at rest.
1130 : */
1131 9 : if (!ProcessAllDatabases())
1132 : {
1133 : /*
1134 : * If the target state changed during processing then it's not a
1135 : * failure, so restart processing instead.
1136 : */
1137 0 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
1138 0 : if (DataChecksumState->launch_operation != operation)
1139 : {
1140 0 : LWLockRelease(DataChecksumsWorkerLock);
1141 0 : goto done;
1142 : }
1143 0 : LWLockRelease(DataChecksumsWorkerLock);
1144 0 : ereport(ERROR,
1145 : errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1146 : errmsg("unable to enable data checksums in cluster"));
1147 : }
1148 :
1149 : /*
1150 : * Data checksums have been set on all pages, set the state to on in
1151 : * order to instruct backends to validate checksums on reading.
1152 : */
1153 7 : SetDataChecksumsOn();
1154 :
1155 7 : ereport(LOG,
1156 : errmsg("data checksums are now enabled"));
1157 : }
1158 4 : else if (operation == DISABLE_DATACHECKSUMS)
1159 : {
1160 4 : ereport(LOG,
1161 : errmsg("disabling data checksums requested"));
1162 :
1163 4 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
1164 : PROGRESS_DATACHECKSUMS_PHASE_DISABLING);
1165 4 : SetDataChecksumsOff();
1166 4 : ereport(LOG,
1167 : errmsg("data checksums are now disabled"));
1168 : }
1169 : else
1170 : Assert(false);
1171 :
1172 0 : done:
1173 :
1174 : /*
1175 : * This state will only be displayed for a fleeting moment, but for the
1176 : * sake of correctness it is still added before ending the command.
1177 : */
1178 11 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
1179 : PROGRESS_DATACHECKSUMS_PHASE_DONE);
1180 :
1181 : /*
1182 : * All done. But before we exit, check if the target state was changed
1183 : * while we were running. In that case we will have to start all over
1184 : * again.
1185 : */
1186 11 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
1187 11 : if (DataChecksumState->launch_operation != operation)
1188 : {
1189 0 : DataChecksumState->operation = DataChecksumState->launch_operation;
1190 0 : operation = DataChecksumState->launch_operation;
1191 0 : DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
1192 0 : DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
1193 0 : LWLockRelease(DataChecksumsWorkerLock);
1194 0 : goto again;
1195 : }
1196 :
1197 : /* Shut down progress reporting as we are done */
1198 11 : pgstat_progress_end_command();
1199 :
1200 11 : launcher_running = false;
1201 11 : DataChecksumState->launcher_running = false;
1202 11 : LWLockRelease(DataChecksumsWorkerLock);
1203 : }
1204 :
1205 : /*
1206 : * ProcessAllDatabases
1207 : * Compute the list of all databases and process checksums in each
1208 : *
1209 : * This will generate a list of databases to process for enabling checksums.
1210 : * If a database encounters a failure then processing will end immediately and
1211 : * return an error.
1212 : */
1213 : static bool
1214 9 : ProcessAllDatabases(void)
1215 : {
1216 : List *DatabaseList;
1217 9 : int cumulative_total = 0;
1218 :
1219 : /* Set up so first run processes shared catalogs, not once in every db */
1220 9 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
1221 9 : DataChecksumState->process_shared_catalogs = true;
1222 9 : LWLockRelease(DataChecksumsWorkerLock);
1223 :
1224 : /* Get a list of all databases to process */
1225 9 : WaitForAllTransactionsToFinish();
1226 9 : DatabaseList = BuildDatabaseList();
1227 :
1228 : /*
1229 : * Update progress reporting with the total number of databases we need to
1230 : * process. This number should not be changed during processing, the
1231 : * columns for processed databases is instead increased such that it can
1232 : * be compared against the total.
1233 : */
1234 : {
1235 9 : const int index[] = {
1236 : PROGRESS_DATACHECKSUMS_DBS_TOTAL,
1237 : PROGRESS_DATACHECKSUMS_DBS_DONE,
1238 : PROGRESS_DATACHECKSUMS_RELS_TOTAL,
1239 : PROGRESS_DATACHECKSUMS_RELS_DONE,
1240 : PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL,
1241 : PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
1242 : };
1243 :
1244 : int64 vals[6];
1245 :
1246 9 : vals[0] = list_length(DatabaseList);
1247 9 : vals[1] = 0;
1248 : /* translated to NULL */
1249 9 : vals[2] = -1;
1250 9 : vals[3] = -1;
1251 9 : vals[4] = -1;
1252 9 : vals[5] = -1;
1253 :
1254 9 : pgstat_progress_update_multi_param(6, index, vals);
1255 : }
1256 :
1257 37 : foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList)
1258 : {
1259 : DataChecksumsWorkerResult result;
1260 :
1261 23 : result = ProcessDatabase(db);
1262 :
1263 : #ifdef USE_INJECTION_POINTS
1264 : /* Allow a test process to alter the result of the operation */
1265 22 : if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fail-db-result"))
1266 : {
1267 1 : result = DATACHECKSUMSWORKER_FAILED;
1268 1 : INJECTION_POINT_CACHED("datachecksumsworker-fail-db-result",
1269 : db->dbname);
1270 : }
1271 : #endif
1272 :
1273 22 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE,
1274 : ++cumulative_total);
1275 :
1276 22 : if (result == DATACHECKSUMSWORKER_FAILED)
1277 : {
1278 : /*
1279 : * Disable checksums on cluster, because we failed one of the
1280 : * databases and this is an all or nothing process.
1281 : */
1282 1 : SetDataChecksumsOff();
1283 1 : ereport(ERROR,
1284 : errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1285 : errmsg("data checksums failed to get enabled in all databases, aborting"),
1286 : errhint("The server log might have more information on the cause of the error."));
1287 : }
1288 21 : else if (result == DATACHECKSUMSWORKER_ABORTED || abort_requested)
1289 : {
1290 : /* Abort flag set, so exit the whole process */
1291 0 : return false;
1292 : }
1293 :
1294 : /*
1295 : * When one database has completed, it will have done shared catalogs
1296 : * so we don't have to process them again.
1297 : */
1298 21 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
1299 21 : DataChecksumState->process_shared_catalogs = false;
1300 21 : LWLockRelease(DataChecksumsWorkerLock);
1301 : }
1302 :
1303 7 : FreeDatabaseList(DatabaseList);
1304 :
1305 7 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
1306 : PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER);
1307 7 : return true;
1308 : }
1309 :
1310 : /*
1311 : * DataChecksumsShmemRequest
1312 : * Request datachecksumsworker-related shared memory
1313 : */
1314 : static void
1315 1251 : DataChecksumsShmemRequest(void *arg)
1316 : {
1317 1251 : ShmemRequestStruct(.name = "DataChecksumsWorker Data",
1318 : .size = sizeof(DataChecksumsStateStruct),
1319 : .ptr = (void **) &DataChecksumState,
1320 : );
1321 1251 : }
1322 :
1323 : /*
1324 : * DatabaseExists
1325 : *
1326 : * Scans the system catalog to check if a database with the given Oid exists
1327 : * and returns true if it is found and valid, else false. Note, we cannot use
1328 : * database_is_invalid_oid here as it will ERROR out, and we want to gracefully
1329 : * handle errors.
1330 : */
1331 : static bool
1332 0 : DatabaseExists(Oid dboid)
1333 : {
1334 : Relation rel;
1335 : ScanKeyData skey;
1336 : SysScanDesc scan;
1337 : bool found;
1338 : HeapTuple tuple;
1339 : Form_pg_database pg_database_tuple;
1340 :
1341 0 : StartTransactionCommand();
1342 :
1343 0 : rel = table_open(DatabaseRelationId, AccessShareLock);
1344 0 : ScanKeyInit(&skey,
1345 : Anum_pg_database_oid,
1346 : BTEqualStrategyNumber, F_OIDEQ,
1347 : ObjectIdGetDatum(dboid));
1348 0 : scan = systable_beginscan(rel, DatabaseOidIndexId, true, SnapshotSelf,
1349 : 1, &skey);
1350 0 : tuple = systable_getnext(scan);
1351 0 : found = HeapTupleIsValid(tuple);
1352 :
1353 : /* If the Oid exists, ensure that it's not partially dropped */
1354 0 : if (found)
1355 : {
1356 0 : pg_database_tuple = (Form_pg_database) GETSTRUCT(tuple);
1357 0 : if (database_is_invalid_form(pg_database_tuple))
1358 0 : found = false;
1359 : }
1360 :
1361 0 : systable_endscan(scan);
1362 0 : table_close(rel, AccessShareLock);
1363 :
1364 0 : CommitTransactionCommand();
1365 :
1366 0 : return found;
1367 : }
1368 :
1369 : /*
1370 : * BuildDatabaseList
1371 : * Compile a list of all currently available databases in the cluster
1372 : *
1373 : * This creates the list of databases for the datachecksumsworker workers to
1374 : * add checksums to. If the caller wants to ensure that no concurrently
1375 : * running CREATE DATABASE calls exist, this needs to be preceded by a call
1376 : * to WaitForAllTransactionsToFinish().
1377 : */
1378 : static List *
1379 9 : BuildDatabaseList(void)
1380 : {
1381 9 : List *DatabaseList = NIL;
1382 : Relation rel;
1383 : TableScanDesc scan;
1384 : HeapTuple tup;
1385 9 : MemoryContext ctx = CurrentMemoryContext;
1386 : MemoryContext oldctx;
1387 :
1388 9 : StartTransactionCommand();
1389 :
1390 9 : rel = table_open(DatabaseRelationId, AccessShareLock);
1391 9 : scan = table_beginscan_catalog(rel, 0, NULL);
1392 :
1393 36 : while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1394 : {
1395 27 : Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup);
1396 : DataChecksumsWorkerDatabase *db;
1397 :
1398 27 : oldctx = MemoryContextSwitchTo(ctx);
1399 :
1400 27 : db = (DataChecksumsWorkerDatabase *) palloc0(sizeof(DataChecksumsWorkerDatabase));
1401 :
1402 27 : db->dboid = pgdb->oid;
1403 27 : db->dbname = pstrdup(NameStr(pgdb->datname));
1404 :
1405 27 : DatabaseList = lappend(DatabaseList, db);
1406 :
1407 27 : MemoryContextSwitchTo(oldctx);
1408 : }
1409 :
1410 9 : table_endscan(scan);
1411 9 : table_close(rel, AccessShareLock);
1412 :
1413 9 : CommitTransactionCommand();
1414 :
1415 9 : return DatabaseList;
1416 : }
1417 :
1418 : static void
1419 7 : FreeDatabaseList(List *dblist)
1420 : {
1421 7 : if (!dblist)
1422 0 : return;
1423 :
1424 35 : foreach_ptr(DataChecksumsWorkerDatabase, db, dblist)
1425 : {
1426 21 : if (db->dbname != NULL)
1427 21 : pfree(db->dbname);
1428 : }
1429 :
1430 7 : list_free_deep(dblist);
1431 : }
1432 :
1433 : /*
1434 : * BuildRelationList
1435 : * Compile a list of relations in the database
1436 : *
1437 : * Returns a list of OIDs for the request relation types. If temp_relations
1438 : * is True then only temporary relations are returned. If temp_relations is
1439 : * False then non-temporary relations which have data checksums are returned.
1440 : * If include_shared is True then shared relations are included as well in a
1441 : * non-temporary list. include_shared has no relevance when building a list of
1442 : * temporary relations.
1443 : */
1444 : static List *
1445 68 : BuildRelationList(bool temp_relations, bool include_shared)
1446 : {
1447 68 : List *RelationList = NIL;
1448 : Relation rel;
1449 : TableScanDesc scan;
1450 : HeapTuple tup;
1451 68 : MemoryContext ctx = CurrentMemoryContext;
1452 : MemoryContext oldctx;
1453 :
1454 68 : StartTransactionCommand();
1455 :
1456 68 : rel = table_open(RelationRelationId, AccessShareLock);
1457 68 : scan = table_beginscan_catalog(rel, 0, NULL);
1458 :
1459 30847 : while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1460 : {
1461 30779 : Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup);
1462 :
1463 : /* Only include temporary relations when explicitly asked to */
1464 30779 : if (pgc->relpersistence == RELPERSISTENCE_TEMP)
1465 : {
1466 2 : if (!temp_relations)
1467 1 : continue;
1468 : }
1469 : else
1470 : {
1471 : /*
1472 : * If we are only interested in temp relations then continue
1473 : * immediately as the current relation isn't a temp relation.
1474 : */
1475 30777 : if (temp_relations)
1476 20367 : continue;
1477 :
1478 10410 : if (!RELKIND_HAS_STORAGE(pgc->relkind))
1479 3726 : continue;
1480 :
1481 6684 : if (pgc->relisshared && !include_shared)
1482 644 : continue;
1483 : }
1484 :
1485 6041 : oldctx = MemoryContextSwitchTo(ctx);
1486 6041 : RelationList = lappend_oid(RelationList, pgc->oid);
1487 6041 : MemoryContextSwitchTo(oldctx);
1488 : }
1489 :
1490 68 : table_endscan(scan);
1491 68 : table_close(rel, AccessShareLock);
1492 :
1493 68 : CommitTransactionCommand();
1494 :
1495 68 : return RelationList;
1496 : }
1497 :
1498 : /*
1499 : * DataChecksumsWorkerMain
1500 : *
1501 : * Main function for enabling checksums in a single database. This is the
1502 : * function set as the bgw_function_name in the dynamic background worker
1503 : * process initiated for each database by the worker launcher. After enabling
1504 : * data checksums in each applicable relation in the database, it will wait for
1505 : * all temporary relations that were present when the function started to
1506 : * disappear before returning. This is required since we cannot rewrite
1507 : * existing temporary relations with data checksums.
1508 : */
1509 : void
1510 23 : DataChecksumsWorkerMain(Datum arg)
1511 : {
1512 23 : Oid dboid = DatumGetObjectId(arg);
1513 23 : List *RelationList = NIL;
1514 23 : List *InitialTempTableList = NIL;
1515 : BufferAccessStrategy strategy;
1516 23 : bool aborted = false;
1517 : int64 rels_done;
1518 : #ifdef USE_INJECTION_POINTS
1519 23 : bool retried = false;
1520 : #endif
1521 :
1522 23 : operation = ENABLE_DATACHECKSUMS;
1523 :
1524 23 : pqsignal(SIGTERM, die);
1525 23 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1526 :
1527 23 : BackgroundWorkerUnblockSignals();
1528 :
1529 23 : MyBackendType = B_DATACHECKSUMSWORKER_WORKER;
1530 23 : init_ps_display(NULL);
1531 :
1532 23 : BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid,
1533 : BGWORKER_BYPASS_ALLOWCONN);
1534 :
1535 : /* worker will have a separate entry in pg_stat_progress_data_checksums */
1536 23 : pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
1537 : InvalidOid);
1538 :
1539 : /*
1540 : * Get a list of all temp tables present as we start in this database. We
1541 : * need to wait until they are all gone until we are done, since we cannot
1542 : * access these relations and modify them.
1543 : */
1544 23 : InitialTempTableList = BuildRelationList(true, false);
1545 :
1546 : /*
1547 : * Enable vacuum cost delay, if any. While this process isn't doing any
1548 : * vacuuming, we are re-using the infrastructure that vacuum cost delay
1549 : * provides rather than inventing something bespoke. This is an internal
1550 : * implementation detail and care should be taken to avoid it bleeding
1551 : * through to the user to avoid confusion.
1552 : *
1553 : * VacuumUpdateCosts() propagates the values to the variables actually
1554 : * read by vacuum_delay_point().
1555 : */
1556 23 : VacuumCostDelay = DataChecksumState->cost_delay;
1557 23 : VacuumCostLimit = DataChecksumState->cost_limit;
1558 23 : VacuumUpdateCosts();
1559 23 : VacuumCostBalance = 0;
1560 :
1561 : /*
1562 : * Create and set the vacuum strategy as our buffer strategy.
1563 : */
1564 23 : strategy = GetAccessStrategy(BAS_VACUUM);
1565 :
1566 23 : RelationList = BuildRelationList(false,
1567 23 : DataChecksumState->process_shared_catalogs);
1568 :
1569 : /* Update the total number of relations to be processed in this DB. */
1570 : {
1571 23 : const int index[] = {
1572 : PROGRESS_DATACHECKSUMS_RELS_TOTAL,
1573 : PROGRESS_DATACHECKSUMS_RELS_DONE
1574 : };
1575 :
1576 : int64 vals[2];
1577 :
1578 23 : vals[0] = list_length(RelationList);
1579 23 : vals[1] = 0;
1580 :
1581 23 : pgstat_progress_update_multi_param(2, index, vals);
1582 : }
1583 :
1584 : /* Process the relations */
1585 23 : rels_done = 0;
1586 5910 : foreach_oid(reloid, RelationList)
1587 : {
1588 5866 : bool costs_updated = false;
1589 :
1590 5866 : if (!ProcessSingleRelationByOid(reloid, strategy))
1591 : {
1592 0 : aborted = true;
1593 0 : break;
1594 : }
1595 :
1596 5865 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_RELS_DONE,
1597 : ++rels_done);
1598 5865 : CHECK_FOR_INTERRUPTS();
1599 5865 : CHECK_FOR_ABORT_REQUEST();
1600 :
1601 5865 : if (abort_requested)
1602 0 : break;
1603 :
1604 : /*
1605 : * Check if the cost settings changed during runtime and if so, update
1606 : * to reflect the new values and signal that the access strategy needs
1607 : * to be refreshed.
1608 : */
1609 5865 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
1610 5865 : if ((DataChecksumState->launch_cost_delay != DataChecksumState->cost_delay)
1611 5865 : || (DataChecksumState->launch_cost_limit != DataChecksumState->cost_limit))
1612 : {
1613 0 : costs_updated = true;
1614 0 : VacuumCostDelay = DataChecksumState->launch_cost_delay;
1615 0 : VacuumCostLimit = DataChecksumState->launch_cost_limit;
1616 0 : VacuumUpdateCosts();
1617 :
1618 0 : DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
1619 0 : DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
1620 : }
1621 : else
1622 5865 : costs_updated = false;
1623 5865 : LWLockRelease(DataChecksumsWorkerLock);
1624 :
1625 5865 : if (costs_updated)
1626 : {
1627 0 : FreeAccessStrategy(strategy);
1628 0 : strategy = GetAccessStrategy(BAS_VACUUM);
1629 : }
1630 : }
1631 :
1632 22 : list_free(RelationList);
1633 22 : FreeAccessStrategy(strategy);
1634 :
1635 22 : if (aborted || abort_requested)
1636 : {
1637 0 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
1638 0 : DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED;
1639 0 : LWLockRelease(DataChecksumsWorkerLock);
1640 0 : ereport(DEBUG1,
1641 : errmsg("data checksum processing aborted in database OID %u",
1642 : dboid));
1643 0 : return;
1644 : }
1645 :
1646 : /* The worker is about to wait for temporary tables to go away. */
1647 22 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
1648 : PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL);
1649 :
1650 : /*
1651 : * Wait for all temp tables that existed when we started to go away. This
1652 : * is necessary since we cannot "reach" them to enable checksums. Any temp
1653 : * tables created after we started will already have checksums in them
1654 : * (due to the "inprogress-on" state), so no need to wait for those.
1655 : */
1656 : for (;;)
1657 0 : {
1658 : List *CurrentTempTables;
1659 : int numleft;
1660 : char activity[64];
1661 :
1662 22 : CurrentTempTables = BuildRelationList(true, false);
1663 22 : numleft = 0;
1664 44 : foreach_oid(tmptbloid, InitialTempTableList)
1665 : {
1666 0 : if (list_member_oid(CurrentTempTables, tmptbloid))
1667 0 : numleft++;
1668 : }
1669 22 : list_free(CurrentTempTables);
1670 :
1671 : #ifdef USE_INJECTION_POINTS
1672 22 : if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fake-temptable-wait"))
1673 : {
1674 : /* Make sure to just cause one retry */
1675 0 : if (!retried && numleft == 0)
1676 : {
1677 0 : numleft = 1;
1678 0 : retried = true;
1679 :
1680 0 : INJECTION_POINT_CACHED("datachecksumsworker-fake-temptable-wait", NULL);
1681 : }
1682 : }
1683 : #endif
1684 :
1685 22 : if (numleft == 0)
1686 22 : break;
1687 :
1688 : /*
1689 : * At least one temp table is left to wait for, indicate in pgstat
1690 : * activity and progress reporting.
1691 : */
1692 0 : snprintf(activity,
1693 : sizeof(activity),
1694 : "Waiting for %d temp tables to be removed", numleft);
1695 0 : pgstat_report_activity(STATE_RUNNING, activity);
1696 :
1697 : /* Retry every 3 seconds */
1698 0 : ResetLatch(MyLatch);
1699 0 : (void) WaitLatch(MyLatch,
1700 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1701 : 3000,
1702 : WAIT_EVENT_CHECKSUM_ENABLE_TEMPTABLE_WAIT);
1703 :
1704 0 : CHECK_FOR_INTERRUPTS();
1705 0 : CHECK_FOR_ABORT_REQUEST();
1706 :
1707 0 : if (aborted || abort_requested)
1708 : {
1709 0 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
1710 0 : DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED;
1711 0 : LWLockRelease(DataChecksumsWorkerLock);
1712 0 : ereport(LOG,
1713 : errmsg("data checksum processing aborted in database OID %u",
1714 : dboid));
1715 0 : return;
1716 : }
1717 : }
1718 :
1719 22 : list_free(InitialTempTableList);
1720 :
1721 : /* worker done */
1722 22 : pgstat_progress_end_command();
1723 :
1724 22 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
1725 22 : DataChecksumState->success = DATACHECKSUMSWORKER_SUCCESSFUL;
1726 22 : LWLockRelease(DataChecksumsWorkerLock);
1727 : }
|