Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * datachecksum_state.c
4 : * Background worker for enabling or disabling data checksums online as
5 : * well as functionality for manipulating data checksum state
6 : *
7 : * When enabling data checksums on a cluster at initdb time or when shut down
8 : * with pg_checksums, no extra process is required as each page is checksummed,
9 : * and verified, when accessed. When enabling checksums on an already running
10 : * cluster, this worker will ensure that all pages are checksummed before
11 : * verification of the checksums is turned on. In the case of disabling
12 : * checksums, the state transition is performed only in the control file, no
13 : * changes are performed on the data pages.
14 : *
15 : * Checksums can be either enabled or disabled cluster-wide, with on/off being
16 : * the end state for data_checksums.
17 : *
18 : * 1. Enabling checksums
19 : * ---------------------
20 : * When enabling checksums in an online cluster, data_checksums will be set to
21 : * "inprogress-on" which signals that write operations MUST compute and write
22 : * the checksum on the data page, but during reading the checksum SHALL NOT be
 * verified. This ensures that all objects created while checksums are
 * being enabled will have checksums set, but reads won't fail due to missing or
25 : * invalid checksums. Invalid checksums can be present in case the cluster had
26 : * checksums enabled, then disabled them and updated the page while they were
27 : * disabled.
28 : *
29 : * The DataChecksumsWorker will compile a list of all databases at the start,
30 : * any databases created concurrently will see the in-progress state and will
31 : * be checksummed automatically. All databases from the original list MUST BE
32 : * successfully processed in order for data checksums to be enabled, the only
33 : * exception are databases which are dropped before having been processed.
34 : *
35 : * For each database, all relations which have storage are read and every data
36 : * page is marked dirty to force a write with the checksum. This will generate
37 : * a lot of WAL as the entire database is read and written.
38 : *
39 : * If the processing is interrupted by a cluster crash or restart, it needs to
40 : * be restarted from the beginning again as state isn't persisted.
41 : *
42 : * 2. Disabling checksums
43 : * ----------------------
44 : * When disabling checksums, data_checksums will be set to "inprogress-off"
45 : * which signals that checksums are written but no longer need to be verified.
46 : * This ensures that backends which have not yet transitioned to the
47 : * "inprogress-off" state will still see valid checksums on pages.
48 : *
49 : * 3. Synchronization and Correctness
50 : * ----------------------------------
51 : * The processes involved in enabling or disabling data checksums in an
52 : * online cluster must be properly synchronized with the normal backends
53 : * serving concurrent queries to ensure correctness. Correctness is defined
54 : * as the following:
55 : *
56 : * - Backends SHALL NOT violate the data_checksums state they have agreed to
57 : * by acknowledging the procsignalbarrier: This means that all backends
58 : * MUST calculate and write data checksums during all states except off;
59 : * MUST validate checksums only in the 'on' state.
60 : * - Data checksums SHALL NOT be considered enabled cluster-wide until all
61 : * currently connected backends have state "on": This means that all
62 : * backends must wait on the procsignalbarrier to be acknowledged by all
63 : * before proceeding to validate data checksums.
64 : *
65 : * There are two steps of synchronization required for changing data_checksums
66 : * in an online cluster: (i) changing state in the active backends ("on",
67 : * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no
68 : * incompatible objects and processes are left in a database when workers end.
69 : * The former deals with cluster-wide agreement on data checksum state and the
70 : * latter with ensuring that any concurrent activity cannot break the data
71 : * checksum contract during processing.
72 : *
73 : * Synchronizing the state change is done with procsignal barriers. Before
74 : * updating the data_checksums state in the control file, all other backends must absorb the
75 : * barrier. Barrier absorption will happen during interrupt processing, which
76 : * means that connected backends will change state at different times. If
77 : * waiting for a barrier is done during startup, for example during replay, it
78 : * is important to realize that any locks held by the startup process might
79 : * cause deadlocks if backends end up waiting for those locks while startup
80 : * is waiting for a procsignalbarrier.
81 : *
82 : * 3.1 When Enabling Data Checksums
83 : * --------------------------------
84 : * A process which fails to observe data checksums being enabled can induce two
85 : * types of errors: failing to write the checksum when modifying the page and
86 : * failing to validate the data checksum on the page when reading it.
87 : *
88 : * When processing starts all backends belong to one of the below sets, with
89 : * one of Bd and Bi being empty:
90 : *
91 : * Bg: Backend updating the global state and emitting the procsignalbarrier
92 : * Bd: Backends in "off" state
93 : * Bi: Backends in "inprogress-on" state
94 : *
95 : * If processing is started in an online cluster then all backends are in Bd.
96 : * If processing was halted by the cluster shutting down (due to a crash or
97 : * intentional restart), the controlfile state "inprogress-on" will be observed
98 : * on system startup and all backends will be placed in Bd. The controlfile
99 : * state will also be set to "off".
100 : *
101 : * Backends transition Bd -> Bi via a procsignalbarrier which is emitted by the
102 : * DataChecksumsWorkerLauncherMain. When all backends have acknowledged the
103 : * barrier then Bd will be empty and the next phase can begin: calculating and
104 : * writing data checksums with DataChecksumsWorkers. When the
105 : * DataChecksumsWorker processes have finished writing checksums on all pages,
106 : * data checksums are enabled cluster-wide via another procsignalbarrier.
 * At this point there are four sets of backends, of which Bd shall be empty:
108 : *
109 : * Bg: Backend updating the global state and emitting the procsignalbarrier
110 : * Bd: Backends in "off" state
111 : * Be: Backends in "on" state
112 : * Bi: Backends in "inprogress-on" state
113 : *
114 : * Backends in Bi and Be will write checksums when modifying a page, but only
115 : * backends in Be will verify the checksum during reading. The Bg backend is
116 : * blocked waiting for all backends in Bi to process interrupts and move to
117 : * Be. Any backend starting while Bg is waiting on the procsignalbarrier will
118 : * observe the global state being "on" and will thus automatically belong to
119 : * Be. Checksums are enabled cluster-wide when Bi is an empty set. Bi and Be
120 : * are compatible sets while still operating based on their local state as
121 : * both write data checksums.
122 : *
123 : * 3.2 When Disabling Data Checksums
124 : * ---------------------------------
125 : * A process which fails to observe that data checksums have been disabled
126 : * can induce two types of errors: writing the checksum when modifying the
127 : * page and validating a data checksum which is no longer correct due to
128 : * modifications to the page. The former is not an error per se as data
129 : * integrity is maintained, but it is wasteful. The latter will cause errors
130 : * in user operations. Assuming the following sets of backends:
131 : *
132 : * Bg: Backend updating the global state and emitting the procsignalbarrier
133 : * Bd: Backends in "off" state
134 : * Be: Backends in "on" state
135 : * Bo: Backends in "inprogress-off" state
136 : * Bi: Backends in "inprogress-on" state
137 : *
138 : * Backends transition from the Be state to Bd like so: Be -> Bo -> Bd. From
139 : * all other states, the transition can be straight to Bd.
140 : *
141 : * The goal is to transition all backends to Bd making the others empty sets.
142 : * Backends in Bo write data checksums, but don't validate them, such that
143 : * backends still in Be can continue to validate pages until the barrier has
144 : * been absorbed such that they are in Bo. Once all backends are in Bo, the
145 : * barrier to transition to "off" can be raised and all backends can safely
146 : * stop writing data checksums as no backend is enforcing data checksum
147 : * validation any longer.
148 : *
149 : * 4. Future opportunities for optimizations
150 : * -----------------------------------------
151 : * Below are some potential optimizations and improvements which were brought
152 : * up during reviews of this feature, but which weren't implemented in the
153 : * initial version. These are ideas listed without any validation on their
154 : * feasibility or potential payoff. More discussion on (most of) these can be
155 : * found on the -hackers threads linked to in the commit message of this
156 : * feature.
157 : *
158 : * * Launching datachecksumsworker for resuming operation from the startup
159 : * process: Currently users have to restart processing manually after a
160 : * restart since dynamic background worker cannot be started from the
161 : * postmaster. Changing the startup process could make restarting the
162 : * processing automatic on cluster restart.
163 : * * Avoid dirtying the page when checksums already match: Iff the checksum
164 : * on the page happens to already match we still dirty the page. It should
165 : * be enough to only do the log_newpage_buffer() call in that case.
166 : * * Teach pg_checksums to avoid checksummed pages when pg_checksums is used
167 : * to enable checksums on a cluster which is in inprogress-on state and
168 : * may have checksummed pages (make pg_checksums be able to resume an
169 : * online operation). This should only be attempted for wal_level minimal.
170 : * * Restartability (not necessarily with page granularity).
171 : * * Avoid processing databases which were created during inprogress-on.
172 : * Right now all databases are processed regardless to be safe.
173 : * * Teach CREATE DATABASE to calculate checksums for databases created
174 : * during inprogress-on with a template database which has yet to be
175 : * processed.
176 : *
177 : *
178 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
179 : * Portions Copyright (c) 1994, Regents of the University of California
180 : *
181 : *
182 : * IDENTIFICATION
183 : * src/backend/postmaster/datachecksum_state.c
184 : *
185 : *-------------------------------------------------------------------------
186 : */
187 : #include "postgres.h"
188 :
189 : #include "access/genam.h"
190 : #include "access/heapam.h"
191 : #include "access/htup_details.h"
192 : #include "access/xact.h"
193 : #include "access/xlog.h"
194 : #include "access/xloginsert.h"
195 : #include "catalog/indexing.h"
196 : #include "catalog/pg_class.h"
197 : #include "catalog/pg_database.h"
198 : #include "commands/progress.h"
199 : #include "commands/vacuum.h"
200 : #include "common/relpath.h"
201 : #include "miscadmin.h"
202 : #include "pgstat.h"
203 : #include "postmaster/bgworker.h"
204 : #include "postmaster/bgwriter.h"
205 : #include "postmaster/datachecksum_state.h"
206 : #include "storage/bufmgr.h"
207 : #include "storage/checksum.h"
208 : #include "storage/ipc.h"
209 : #include "storage/latch.h"
210 : #include "storage/lmgr.h"
211 : #include "storage/lwlock.h"
212 : #include "storage/procarray.h"
213 : #include "storage/smgr.h"
214 : #include "storage/subsystems.h"
215 : #include "tcop/tcopprot.h"
216 : #include "utils/builtins.h"
217 : #include "utils/fmgroids.h"
218 : #include "utils/injection_point.h"
219 : #include "utils/lsyscache.h"
220 : #include "utils/ps_status.h"
221 : #include "utils/syscache.h"
222 : #include "utils/wait_event.h"
223 :
224 : /*
225 : * Configuration of conditions which must match when absorbing a procsignal
226 : * barrier during data checksum enable/disable operations. A single function
227 : * is used for absorbing all barriers, and the current and target states must
228 : * be defined as a from/to tuple in the checksum_barriers struct.
229 : */
/*
 * One permitted transition in the data checksum state machine; the full set
 * of legal transitions is enumerated in the checksum_barriers array below.
 */
typedef struct ChecksumBarrierCondition
{
	/* Current state of data checksums (a PG_DATA_CHECKSUM_* value) */
	int			from;
	/* Target state for data checksums (a PG_DATA_CHECKSUM_* value) */
	int			to;
} ChecksumBarrierCondition;
237 :
238 : static const ChecksumBarrierCondition checksum_barriers[9] =
239 : {
240 : /*
241 : * Disabling checksums: If checksums are currently enabled, disabling must
242 : * go through the 'inprogress-off' state.
243 : */
244 : {PG_DATA_CHECKSUM_VERSION, PG_DATA_CHECKSUM_INPROGRESS_OFF},
245 : {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_OFF},
246 :
247 : /*
248 : * If checksums are in the process of being enabled, but are not yet being
249 : * verified, we can abort by going back to 'off' state.
250 : */
251 : {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_OFF},
252 :
253 : /*
254 : * Enabling checksums must normally go through the 'inprogress-on' state.
255 : */
256 : {PG_DATA_CHECKSUM_OFF, PG_DATA_CHECKSUM_INPROGRESS_ON},
257 : {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_VERSION},
258 :
259 : /*
260 : * If checksums are being disabled but all backends are still computing
261 : * checksums, we can go straight back to 'on'
262 : */
263 : {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_VERSION},
264 :
265 : /*
266 : * If checksums are being enabled when launcher_exit is executed, state is
267 : * set to off since we cannot reach on at that point.
268 : */
269 : {PG_DATA_CHECKSUM_INPROGRESS_ON, PG_DATA_CHECKSUM_INPROGRESS_OFF},
270 :
271 : /*
272 : * Transitions that can happen when a new request is made while another is
273 : * currently being processed.
274 : */
275 : {PG_DATA_CHECKSUM_INPROGRESS_OFF, PG_DATA_CHECKSUM_INPROGRESS_ON},
276 : {PG_DATA_CHECKSUM_OFF, PG_DATA_CHECKSUM_INPROGRESS_OFF},
277 : };
278 :
/*
 * Signaling between backends calling pg_enable/disable_data_checksums, the
 * checksums launcher process, and the checksums worker process.
 *
 * This struct is protected by DataChecksumsWorkerLock, except for the fields
 * explicitly documented below as safe to access without a lock.
 */
typedef struct DataChecksumsStateStruct
{
	/*
	 * These are set by pg_{enable|disable}_data_checksums, to tell the
	 * launcher what the target state is.
	 */
	DataChecksumsWorkerOperation launch_operation;
	int			launch_cost_delay;
	int			launch_cost_limit;

	/*
	 * Is a launcher process currently running? This is set by the main
	 * launcher process, after it has read the above launch_* parameters.
	 */
	bool		launcher_running;

	/*
	 * PID of the worker process, set by the launcher when it starts waiting
	 * for a worker process to finish.
	 */
	int			worker_pid;

	/*
	 * These fields indicate the target state that the launcher is currently
	 * working towards. They can be different from the corresponding launch_*
	 * fields, if a new pg_enable/disable_data_checksums() call was made while
	 * the launcher/worker was already running.
	 *
	 * The below members are set when the launcher starts, and are only
	 * accessed read-only by the single worker. Thus, we can access these
	 * without a lock. If multiple workers, or dynamic cost parameters, are
	 * supported at some point then this would need to be revisited.
	 */
	DataChecksumsWorkerOperation operation;
	int			cost_delay;
	int			cost_limit;

	/*
	 * Signaling between the launcher and the worker process.
	 *
	 * As there is only a single worker, and the launcher won't read these
	 * until the worker exits, they can be accessed without the need for a
	 * lock. If multiple workers are supported then this will have to be
	 * revisited.
	 */

	/* result, set by worker before exiting */
	DataChecksumsWorkerResult success;

	/*
	 * Tells the worker process whether it should also process the shared
	 * catalogs
	 */
	bool		process_shared_catalogs;
} DataChecksumsStateStruct;
340 :
/* Pointer to the shared memory segment holding the datachecksumsworker state */
static DataChecksumsStateStruct *DataChecksumState;
343 :
/* One entry in the list of databases to process (see BuildDatabaseList) */
typedef struct DataChecksumsWorkerDatabase
{
	/* OID of the database to connect a worker to */
	Oid			dboid;
	/* Database name, used in log and error messages */
	char	   *dbname;
} DataChecksumsWorkerDatabase;
349 :
/*
 * Flag set by the interrupt handler, and by CHECK_FOR_ABORT_REQUEST() when a
 * conflicting request is observed in shared memory.
 */
static volatile sig_atomic_t abort_requested = false;

/*
 * Have we set the DataChecksumsStateStruct->launcher_running flag?
 * If we have, we need to clear it before exiting!
 */
static volatile sig_atomic_t launcher_running = false;

/* Are we enabling data checksums, or disabling them? */
static DataChecksumsWorkerOperation operation;
361 :
/* Prototypes for local routines */
static void DataChecksumsShmemRequest(void *arg);
static bool DatabaseExists(Oid dboid);
static List *BuildDatabaseList(void);
static List *BuildRelationList(bool temp_relations, bool include_shared);
static void FreeDatabaseList(List *dblist);
static DataChecksumsWorkerResult ProcessDatabase(DataChecksumsWorkerDatabase *db);
static bool ProcessAllDatabases(void);
static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy);
static void launcher_cancel_handler(SIGNAL_ARGS);
static void WaitForAllTransactionsToFinish(void);

/* Shared-memory hooks for this subsystem; only a request callback is needed */
const ShmemCallbacks DataChecksumsShmemCallbacks = {
	.request_fn = DataChecksumsShmemRequest,
};
377 :
/*
 * Check whether a conflicting request has arrived while we are processing:
 * if the launch_operation in shared memory no longer matches the operation
 * this process is carrying out, flag an abort.  Note that this only ever
 * sets abort_requested; it never clears it.
 */
#define CHECK_FOR_ABORT_REQUEST() \
	do { \
		LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED); \
		if (DataChecksumState->launch_operation != operation) \
			abort_requested = true; \
		LWLockRelease(DataChecksumsWorkerLock); \
	} while (0)
385 :
386 :
387 : /*****************************************************************************
388 : * Functionality for manipulating the data checksum state in the cluster
389 : */
390 :
391 : void
392 4 : EmitAndWaitDataChecksumsBarrier(uint32 state)
393 : {
394 : uint64 barrier;
395 :
396 4 : switch (state)
397 : {
398 1 : case PG_DATA_CHECKSUM_INPROGRESS_ON:
399 1 : barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
400 1 : WaitForProcSignalBarrier(barrier);
401 1 : break;
402 :
403 1 : case PG_DATA_CHECKSUM_INPROGRESS_OFF:
404 1 : barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
405 1 : WaitForProcSignalBarrier(barrier);
406 1 : break;
407 :
408 1 : case PG_DATA_CHECKSUM_VERSION:
409 1 : barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
410 1 : WaitForProcSignalBarrier(barrier);
411 1 : break;
412 :
413 1 : case PG_DATA_CHECKSUM_OFF:
414 1 : barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
415 1 : WaitForProcSignalBarrier(barrier);
416 1 : break;
417 :
418 4 : default:
419 : Assert(false);
420 : }
421 4 : }
422 :
/*
 * AbsorbDataChecksumsBarrier
 *		Generic function for absorbing data checksum state changes
 *
 * All procsignalbarriers regarding data checksum state changes are absorbed
 * with this function. The set of conditions required for the state change to
 * be accepted are listed in the checksum_barriers struct, target_state is
 * used to look up the relevant entry.
 *
 * Returns true once the local state has been updated (or already matched the
 * target).  An illegal transition raises an error, which the procsignal
 * machinery catches so the barrier can be retried later.
 */
bool
AbsorbDataChecksumsBarrier(ProcSignalBarrierType barrier)
{
	uint32		target_state;
	int			current = data_checksums;
	bool		found = false;

	/*
	 * Translate the barrier condition to the target state, doing it here
	 * instead of in the procsignal code saves the latter from knowing about
	 * checksum states.
	 */
	switch (barrier)
	{
		case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON:
			target_state = PG_DATA_CHECKSUM_INPROGRESS_ON;
			break;
		case PROCSIGNAL_BARRIER_CHECKSUM_ON:
			target_state = PG_DATA_CHECKSUM_VERSION;
			break;
		case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF:
			target_state = PG_DATA_CHECKSUM_INPROGRESS_OFF;
			break;
		case PROCSIGNAL_BARRIER_CHECKSUM_OFF:
			target_state = PG_DATA_CHECKSUM_OFF;
			break;
		default:
			elog(ERROR, "incorrect barrier \"%i\" received", barrier);
	}

	/*
	 * If the target state matches the current state then the barrier has been
	 * repeated.
	 */
	if (current == target_state)
		return true;

	/*
	 * If the cluster is in recovery we skip the validation of current state
	 * since the replay is trusted.
	 */
	if (RecoveryInProgress())
	{
		SetLocalDataChecksumState(target_state);
		return true;
	}

	/*
	 * Find the barrier condition definition for the target state. Not finding
	 * a condition would be a grave programmer error as the states are a
	 * discrete set.  The loop exits early once a match is found.
	 */
	for (int i = 0; i < lengthof(checksum_barriers) && !found; i++)
	{
		if (checksum_barriers[i].from == current && checksum_barriers[i].to == target_state)
			found = true;
	}

	/*
	 * If the relevant state criteria aren't satisfied, throw an error which
	 * will be caught by the procsignal machinery for a later retry.
	 */
	if (!found)
		ereport(ERROR,
				errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				errmsg("incorrect data checksum state %i for target state %i",
					   current, target_state));

	SetLocalDataChecksumState(target_state);
	return true;
}
503 :
504 :
/*
 * Disables data checksums for the cluster, if applicable. Starts a background
 * worker which turns off the data checksums.
 *
 * SQL-callable; superuser only, and not allowed during recovery.  The order
 * of the checks below is user-visible (which error is reported first), so it
 * should not be changed casually.
 */
Datum
disable_data_checksums(PG_FUNCTION_ARGS)
{
	PreventCommandDuringRecovery("pg_disable_data_checksums()");

	if (!superuser())
		ereport(ERROR,
				errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				errmsg("must be superuser to change data checksum state"));

	/* Cost-based throttling has no effect when disabling, so pass zeroes */
	StartDataChecksumsWorkerLauncher(DISABLE_DATACHECKSUMS, 0, 0);
	PG_RETURN_VOID();
}
522 :
/*
 * Enables data checksums for the cluster, if applicable. Supports vacuum-
 * like cost based throttling to limit system load. Starts a background worker
 * which updates data checksums on existing data.
 *
 * SQL arguments: cost_delay (must be >= 0) and cost_limit (must be > 0),
 * which are handed to the launcher for throttling.  Superuser only, and not
 * allowed during recovery.
 */
Datum
enable_data_checksums(PG_FUNCTION_ARGS)
{
	int			cost_delay = PG_GETARG_INT32(0);
	int			cost_limit = PG_GETARG_INT32(1);

	PreventCommandDuringRecovery("pg_enable_data_checksums()");

	if (!superuser())
		ereport(ERROR,
				errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				errmsg("must be superuser to change data checksum state"));

	/* Validate throttling parameters before launching anything */
	if (cost_delay < 0)
		ereport(ERROR,
				errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				errmsg("cost delay cannot be a negative value"));

	if (cost_limit <= 0)
		ereport(ERROR,
				errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				errmsg("cost limit must be greater than zero"));

	StartDataChecksumsWorkerLauncher(ENABLE_DATACHECKSUMS, cost_delay, cost_limit);

	PG_RETURN_VOID();
}
555 :
556 :
557 : /*****************************************************************************
558 : * Functionality for running the datachecksumsworker and associated launcher
559 : */
560 :
/*
 * StartDataChecksumsWorkerLauncher
 *		Main entry point for datachecksumsworker launcher process
 *
 * The main entrypoint for starting data checksums processing for enabling as
 * well as disabling.  Stores the requested operation and cost parameters in
 * shared memory and, when no launcher is running yet, registers a dynamic
 * background worker running DataChecksumsWorkerLauncherMain.
 */
void
StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
								 int cost_delay,
								 int cost_limit)
{
	BackgroundWorker bgw;
	BackgroundWorkerHandle *bgw_handle;
	bool		running;

#ifdef USE_ASSERT_CHECKING
	/* The cost delay settings have no effect when disabling */
	if (op == DISABLE_DATACHECKSUMS)
		Assert(cost_delay == 0 && cost_limit == 0);
#endif

	INJECTION_POINT("datachecksumsworker-startup-delay", NULL);

	/* Store the desired state in shared memory */
	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);

	DataChecksumState->launch_operation = op;
	DataChecksumState->launch_cost_delay = cost_delay;
	DataChecksumState->launch_cost_limit = cost_limit;

	/* Is the launcher already running? If so, what is it doing? */
	running = DataChecksumState->launcher_running;

	LWLockRelease(DataChecksumsWorkerLock);

	/*
	 * Launch a new launcher process, if it's not running already.
	 *
	 * If the launcher is currently busy enabling the checksums, and we want
	 * them disabled (or vice versa), the launcher will notice that at latest
	 * when it's about to exit, and will loop back to process the new request.
	 * So if the launcher is already running, we don't need to do anything
	 * more here to abort it.
	 *
	 * If you call pg_enable/disable_data_checksums() twice in a row, before
	 * the launcher has had a chance to start up, we still end up launching it
	 * twice. That's OK, the second invocation will see that a launcher is
	 * already running and exit quickly.
	 */
	if (!running)
	{
		/* Nothing to do if the cluster is already in the desired state */
		if ((op == ENABLE_DATACHECKSUMS && DataChecksumsOn()) ||
			(op == DISABLE_DATACHECKSUMS && DataChecksumsOff()))
		{
			ereport(LOG,
					errmsg("data checksums already in desired state, exiting"));
			return;
		}

		/*
		 * Prepare the BackgroundWorker and launch it.
		 */
		memset(&bgw, 0, sizeof(bgw));
		bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
		bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
		snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
		snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DataChecksumsWorkerLauncherMain");
		snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum launcher");
		snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum launcher");
		bgw.bgw_restart_time = BGW_NEVER_RESTART;
		bgw.bgw_notify_pid = MyProcPid;
		bgw.bgw_main_arg = (Datum) 0;

		if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
			ereport(ERROR,
					errcode(ERRCODE_INSUFFICIENT_RESOURCES),
					errmsg("failed to start background worker to process data checksums"));
	}
	else
	{
		ereport(LOG,
				errmsg("data checksum processing already running"));
	}
}
646 :
647 : /*
648 : * ProcessSingleRelationFork
649 : * Enable data checksums in a single relation/fork.
650 : *
651 : * Returns true if successful, and false if *aborted*. On error, an actual
652 : * error is raised in the lower levels.
653 : */
654 : static bool
655 6611 : ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy)
656 : {
657 6611 : BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum);
658 : char activity[NAMEDATALEN * 2 + 128];
659 : char *relns;
660 :
661 6611 : relns = get_namespace_name(RelationGetNamespace(reln));
662 :
663 : /* Report the current relation to pg_stat_activity */
664 6611 : snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s, %u blocks)",
665 6611 : (relns ? relns : ""), RelationGetRelationName(reln), forkNames[forkNum], numblocks);
666 6611 : pgstat_report_activity(STATE_RUNNING, activity);
667 6611 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL, numblocks);
668 6611 : if (relns)
669 6611 : pfree(relns);
670 :
671 : /*
672 : * We are looping over the blocks which existed at the time of process
673 : * start, which is safe since new blocks are created with checksums set
674 : * already due to the state being "inprogress-on".
675 : */
676 43943 : for (BlockNumber blknum = 0; blknum < numblocks; blknum++)
677 : {
678 37332 : Buffer buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy);
679 :
680 : /* Need to get an exclusive lock to mark the buffer as dirty */
681 37332 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
682 :
683 : /*
684 : * Mark the buffer as dirty and force a full page write. We have to
685 : * re-write the page to WAL even if the checksum hasn't changed,
686 : * because if there is a replica it might have a slightly different
687 : * version of the page with an invalid checksum, caused by unlogged
688 : * changes (e.g. hint bits) on the primary happening while checksums
689 : * were off. This can happen if there was a valid checksum on the page
690 : * at one point in the past, so only when checksums are first on, then
691 : * off, and then turned on again. TODO: investigate if this could be
692 : * avoided if the checksum is calculated to be correct and wal_level
693 : * is set to "minimal",
694 : */
695 37332 : START_CRIT_SECTION();
696 37332 : MarkBufferDirty(buf);
697 37332 : log_newpage_buffer(buf, false);
698 37332 : END_CRIT_SECTION();
699 :
700 37332 : UnlockReleaseBuffer(buf);
701 :
702 : /*
703 : * This is the only place where we check if we are asked to abort, the
704 : * abortion will bubble up from here.
705 : */
706 : Assert(operation == ENABLE_DATACHECKSUMS);
707 37332 : LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
708 37332 : if (DataChecksumState->launch_operation == DISABLE_DATACHECKSUMS)
709 0 : abort_requested = true;
710 37332 : LWLockRelease(DataChecksumsWorkerLock);
711 :
712 37332 : if (abort_requested)
713 0 : return false;
714 :
715 : /* update the block counter */
716 37332 : pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
717 37332 : (blknum + 1));
718 :
719 : /*
720 : * Processing is re-using the vacuum cost delay for process
721 : * throttling, hence why we call vacuum APIs here.
722 : */
723 37332 : vacuum_delay_point(false);
724 : }
725 :
726 6611 : return true;
727 : }
728 :
729 : /*
730 : * ProcessSingleRelationByOid
731 : * Process a single relation based on oid.
732 : *
733 : * Returns true if successful, and false if *aborted*. On error, an actual
734 : * error is raised in the lower levels.
735 : */
736 : static bool
737 5120 : ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy)
738 : {
739 : Relation rel;
740 5120 : bool aborted = false;
741 :
742 5120 : StartTransactionCommand();
743 :
744 5120 : rel = try_relation_open(relationId, AccessShareLock);
745 5120 : if (rel == NULL)
746 : {
747 : /*
748 : * Relation no longer exists. We don't consider this an error since
749 : * there are no pages in it that need data checksums, and thus return
750 : * true. The worker operates off a list of relations generated at the
751 : * start of processing, so relations being dropped in the meantime is
752 : * to be expected.
753 : */
754 0 : CommitTransactionCommand();
755 0 : pgstat_report_activity(STATE_IDLE, NULL);
756 0 : return true;
757 : }
758 5120 : RelationGetSmgr(rel);
759 :
760 25600 : for (ForkNumber fnum = 0; fnum <= MAX_FORKNUM; fnum++)
761 : {
762 20480 : if (smgrexists(rel->rd_smgr, fnum))
763 : {
764 6611 : if (!ProcessSingleRelationFork(rel, fnum, strategy))
765 : {
766 0 : aborted = true;
767 0 : break;
768 : }
769 : }
770 : }
771 5120 : relation_close(rel, AccessShareLock);
772 :
773 5120 : CommitTransactionCommand();
774 5120 : pgstat_report_activity(STATE_IDLE, NULL);
775 :
776 5120 : return !aborted;
777 : }
778 :
779 : /*
780 : * ProcessDatabase
781 : * Enable data checksums in a single database.
782 : *
783 : * We do this by launching a dynamic background worker into this database, and
784 : * waiting for it to finish. We have to do this in a separate worker, since
785 : * each process can only be connected to one database during its lifetime.
786 : */
787 : static DataChecksumsWorkerResult
788 20 : ProcessDatabase(DataChecksumsWorkerDatabase *db)
789 : {
790 : BackgroundWorker bgw;
791 : BackgroundWorkerHandle *bgw_handle;
792 : BgwHandleStatus status;
793 : pid_t pid;
794 : char activity[NAMEDATALEN + 64];
795 :
796 20 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
797 20 : DataChecksumState->success = DATACHECKSUMSWORKER_FAILED;
798 20 : LWLockRelease(DataChecksumsWorkerLock);
799 :
800 20 : memset(&bgw, 0, sizeof(bgw));
801 20 : bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
802 20 : bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
803 20 : snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
804 20 : snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", "DataChecksumsWorkerMain");
805 20 : snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksum worker");
806 20 : snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksum worker");
807 20 : bgw.bgw_restart_time = BGW_NEVER_RESTART;
808 20 : bgw.bgw_notify_pid = MyProcPid;
809 20 : bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid);
810 :
811 : /*
812 : * If there are no worker slots available, there is little we can do. If
813 : * we retry in a bit it's still unlikely that the user has managed to
814 : * reconfigure in the meantime and we'd be run through retries fast.
815 : */
816 20 : if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
817 : {
818 0 : ereport(WARNING,
819 : errmsg("could not start background worker for enabling data checksums in database \"%s\"",
820 : db->dbname),
821 : errhint("The \"%s\" setting might be too low.", "max_worker_processes"));
822 0 : return DATACHECKSUMSWORKER_FAILED;
823 : }
824 :
825 20 : status = WaitForBackgroundWorkerStartup(bgw_handle, &pid);
826 20 : if (status == BGWH_STOPPED)
827 : {
828 : /*
829 : * If the worker managed to start, and stop, before we got to waiting
830 : * for it we can see a STOPPED status here without it being a failure.
831 : */
832 0 : LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
833 0 : if (DataChecksumState->success == DATACHECKSUMSWORKER_SUCCESSFUL)
834 : {
835 0 : LWLockRelease(DataChecksumsWorkerLock);
836 0 : pgstat_report_activity(STATE_IDLE, NULL);
837 0 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
838 0 : DataChecksumState->worker_pid = InvalidPid;
839 0 : LWLockRelease(DataChecksumsWorkerLock);
840 0 : return DataChecksumState->success;
841 : }
842 0 : LWLockRelease(DataChecksumsWorkerLock);
843 :
844 0 : ereport(WARNING,
845 : errmsg("could not start background worker for enabling data checksums in database \"%s\"",
846 : db->dbname),
847 : errhint("More details on the error might be found in the server log."));
848 :
849 : /*
850 : * Heuristic to see if the database was dropped, and if it was we can
851 : * treat it as not an error, else treat as fatal and error out.
852 : */
853 0 : if (DatabaseExists(db->dboid))
854 0 : return DATACHECKSUMSWORKER_FAILED;
855 : else
856 0 : return DATACHECKSUMSWORKER_DROPDB;
857 : }
858 :
859 : /*
860 : * If the postmaster crashed we cannot end up with a processed database so
861 : * we have no alternative other than exiting. When enabling checksums we
862 : * won't at this time have changed the data checksums state in pg_control
863 : * to enabled so when the cluster comes back up processing will have to be
864 : * restarted.
865 : */
866 20 : if (status == BGWH_POSTMASTER_DIED)
867 0 : ereport(FATAL,
868 : errcode(ERRCODE_ADMIN_SHUTDOWN),
869 : errmsg("cannot enable data checksums without the postmaster process"),
870 : errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
871 :
872 : Assert(status == BGWH_STARTED);
873 20 : ereport(LOG,
874 : errmsg("initiating data checksum processing in database \"%s\"",
875 : db->dbname));
876 :
877 : /* Save the pid of the worker so we can signal it later */
878 20 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
879 20 : DataChecksumState->worker_pid = pid;
880 20 : LWLockRelease(DataChecksumsWorkerLock);
881 :
882 20 : snprintf(activity, sizeof(activity) - 1,
883 : "Waiting for worker in database %s (pid %ld)", db->dbname, (long) pid);
884 20 : pgstat_report_activity(STATE_RUNNING, activity);
885 :
886 20 : status = WaitForBackgroundWorkerShutdown(bgw_handle);
887 19 : if (status == BGWH_POSTMASTER_DIED)
888 0 : ereport(FATAL,
889 : errcode(ERRCODE_ADMIN_SHUTDOWN),
890 : errmsg("postmaster exited during data checksum processing in \"%s\"",
891 : db->dbname),
892 : errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
893 :
894 19 : LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
895 19 : if (DataChecksumState->success == DATACHECKSUMSWORKER_ABORTED)
896 0 : ereport(LOG,
897 : errmsg("data checksums processing was aborted in database \"%s\"",
898 : db->dbname));
899 19 : LWLockRelease(DataChecksumsWorkerLock);
900 :
901 19 : pgstat_report_activity(STATE_IDLE, NULL);
902 19 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
903 19 : DataChecksumState->worker_pid = InvalidPid;
904 19 : LWLockRelease(DataChecksumsWorkerLock);
905 :
906 19 : return DataChecksumState->success;
907 : }
908 :
909 : /*
910 : * launcher_exit
911 : *
912 : * Internal routine for cleaning up state when a launcher process which has
913 : * performed checksum operations exits. A launcher process which is exiting due
914 : * to a duplicate started launcher does not need to perform any cleanup and
915 : * this function should not be called. Otherwise, we need to clean up the abort
916 : * flag to ensure that processing started again if it was previously aborted
917 : * (note: started again, *not* restarted from where it left off).
918 : */
919 : static void
920 12 : launcher_exit(int code, Datum arg)
921 : {
922 12 : abort_requested = false;
923 :
924 12 : if (launcher_running)
925 : {
926 2 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
927 2 : if (DataChecksumState->worker_pid != InvalidPid)
928 : {
929 1 : ereport(LOG,
930 : errmsg("data checksums launcher exiting while worker is still running, signalling worker"));
931 1 : kill(DataChecksumState->worker_pid, SIGTERM);
932 : }
933 2 : LWLockRelease(DataChecksumsWorkerLock);
934 : }
935 :
936 : /*
937 : * If the launcher is exiting before data checksums are enabled then set
938 : * the state to off since processing cannot be resumed.
939 : */
940 12 : if (DataChecksumsInProgressOn())
941 1 : SetDataChecksumsOff();
942 :
943 12 : LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
944 12 : launcher_running = false;
945 12 : DataChecksumState->launcher_running = false;
946 12 : LWLockRelease(DataChecksumsWorkerLock);
947 12 : }
948 :
949 : /*
950 : * launcher_cancel_handler
951 : *
952 : * Internal routine for reacting to SIGINT and flagging the worker to abort.
953 : * The worker won't be interrupted immediately but will check for abort flag
954 : * between each block in a relation.
955 : */
956 : static void
957 0 : launcher_cancel_handler(SIGNAL_ARGS)
958 : {
959 0 : int save_errno = errno;
960 :
961 0 : abort_requested = true;
962 :
963 : /*
964 : * There is no sleeping in the main loop, the flag will be checked
965 : * periodically in ProcessSingleRelationFork. The worker does however
966 : * sleep when waiting for concurrent transactions to end so we still need
967 : * to set the latch.
968 : */
969 0 : SetLatch(MyLatch);
970 :
971 0 : errno = save_errno;
972 0 : }
973 :
974 : /*
975 : * WaitForAllTransactionsToFinish
976 : * Blocks awaiting all current transactions to finish
977 : *
978 : * Returns when all transactions which are active at the call of the function
979 : * have ended, or if the postmaster dies while waiting. If the postmaster dies
980 : * the abort flag will be set to indicate that the caller of this shouldn't
981 : * proceed.
982 : *
983 : * NB: this will return early, if aborted by SIGINT or if the target state
984 : * is changed while we're running.
985 : */
static void
WaitForAllTransactionsToFinish(void)
{
	TransactionId waitforxid;

	/*
	 * Snapshot the next xid to be assigned; every transaction with an xid
	 * preceding this must have finished before we may proceed.
	 */
	LWLockAcquire(XidGenLock, LW_SHARED);
	waitforxid = XidFromFullTransactionId(TransamVariables->nextXid);
	LWLockRelease(XidGenLock);

	while (TransactionIdPrecedes(GetOldestActiveTransactionId(false, true), waitforxid))
	{
		char		activity[64];
		int			rc;

		/* Oldest running xid is older than us, so wait */
		snprintf(activity,
				 sizeof(activity),
				 "Waiting for current transactions to finish (waiting for %u)",
				 waitforxid);
		pgstat_report_activity(STATE_RUNNING, activity);

		/* Retry every 3 seconds */
		ResetLatch(MyLatch);
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   3000,
					   WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION);

		/*
		 * If the postmaster died we won't be able to enable checksums
		 * cluster-wide so abort and hope to continue when restarted.
		 */
		if (rc & WL_POSTMASTER_DEATH)
			ereport(FATAL,
					errcode(ERRCODE_ADMIN_SHUTDOWN),
					errmsg("postmaster exited during data checksums processing"),
					errhint("Data checksums processing must be restarted manually after cluster restart."));

		/* SIGINT (via launcher_cancel_handler) sets abort_requested */
		CHECK_FOR_INTERRUPTS();
		CHECK_FOR_ABORT_REQUEST();

		if (abort_requested)
			break;
	}

	pgstat_report_activity(STATE_IDLE, NULL);
	return;
}
1034 :
1035 : /*
1036 : * DataChecksumsWorkerLauncherMain
1037 : *
1038 : * Main function for launching dynamic background workers for processing data
1039 : * checksums in databases. This function has the bgworker management, with
1040 : * ProcessAllDatabases being responsible for looping over the databases and
1041 : * initiating processing.
1042 : */
void
DataChecksumsWorkerLauncherMain(Datum arg)
{

	ereport(DEBUG1,
			errmsg("background worker \"datachecksums launcher\" started"));

	/* SIGINT requests an abort of processing; SIGTERM terminates outright */
	pqsignal(SIGTERM, die);
	pqsignal(SIGINT, launcher_cancel_handler);
	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
	pqsignal(SIGUSR2, PG_SIG_IGN);

	BackgroundWorkerUnblockSignals();

	MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER;
	init_ps_display(NULL);

	INJECTION_POINT("datachecksumsworker-launcher-delay", NULL);

	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);

	/* Only one launcher may run at a time; a duplicate exits immediately */
	if (DataChecksumState->launcher_running)
	{
		ereport(LOG,
				errmsg("background worker \"datachecksums launcher\" already running, exiting"));
		/* Launcher was already running, let it finish */
		LWLockRelease(DataChecksumsWorkerLock);
		return;
	}

	/* launcher_exit cleans up shared state and signals any running worker */
	on_shmem_exit(launcher_exit, 0);
	launcher_running = true;

	/* Initialize a connection to shared catalogs only */
	/*
	 * NOTE(review): this connection initialization happens while still
	 * holding DataChecksumsWorkerLock (acquired above, released below) --
	 * confirm that is safe/intended.
	 */
	BackgroundWorkerInitializeConnectionByOid(InvalidOid, InvalidOid, 0);

	/* Copy the requested operation and cost settings into the active state */
	operation = DataChecksumState->launch_operation;
	DataChecksumState->launcher_running = true;
	DataChecksumState->operation = operation;
	DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
	DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
	LWLockRelease(DataChecksumsWorkerLock);

	/*
	 * The target state can change while we are busy enabling/disabling
	 * checksums, if the user calls pg_disable/enable_data_checksums() before
	 * we are finished with the previous request. In that case, we will loop
	 * back here, to process the new request.
	 */
again:

	pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
								  InvalidOid);

	if (operation == ENABLE_DATACHECKSUMS)
	{
		/*
		 * If we are asked to enable checksums in a cluster which already has
		 * checksums enabled, exit immediately as there is nothing more to do.
		 */
		if (DataChecksumsNeedVerify())
			goto done;

		ereport(LOG,
				errmsg("enabling data checksums requested, starting data checksum calculation"));

		/*
		 * Set the state to inprogress-on and wait on the procsignal barrier.
		 */
		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
									 PROGRESS_DATACHECKSUMS_PHASE_ENABLING);
		SetDataChecksumsOnInProgress();

		/*
		 * All backends are now in inprogress-on state and are writing data
		 * checksums. Start processing all data at rest.
		 */
		if (!ProcessAllDatabases())
		{
			/*
			 * If the target state changed during processing then it's not a
			 * failure, so restart processing instead.
			 */
			LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
			if (DataChecksumState->launch_operation != operation)
			{
				LWLockRelease(DataChecksumsWorkerLock);
				goto done;
			}
			LWLockRelease(DataChecksumsWorkerLock);
			ereport(ERROR,
					errcode(ERRCODE_INSUFFICIENT_RESOURCES),
					errmsg("unable to enable data checksums in cluster"));
		}

		/*
		 * Data checksums have been set on all pages, set the state to on in
		 * order to instruct backends to validate checksums on reading.
		 */
		SetDataChecksumsOn();

		ereport(LOG,
				errmsg("data checksums are now enabled"));
	}
	else if (operation == DISABLE_DATACHECKSUMS)
	{
		/* Disabling only flips the state; data pages are left untouched */
		ereport(LOG,
				errmsg("disabling data checksums requested"));

		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
									 PROGRESS_DATACHECKSUMS_PHASE_DISABLING);
		SetDataChecksumsOff();
		ereport(LOG,
				errmsg("data checksums are now disabled"));
	}
	else
		Assert(false);

done:

	/*
	 * This state will only be displayed for a fleeting moment, but for the
	 * sake of correctness it is still added before ending the command.
	 */
	pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
								 PROGRESS_DATACHECKSUMS_PHASE_DONE);

	/*
	 * All done. But before we exit, check if the target state was changed
	 * while we were running. In that case we will have to start all over
	 * again.
	 */
	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
	if (DataChecksumState->launch_operation != operation)
	{
		/* Refresh operation and cost settings, then restart from the top */
		DataChecksumState->operation = DataChecksumState->launch_operation;
		operation = DataChecksumState->launch_operation;
		DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
		DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
		LWLockRelease(DataChecksumsWorkerLock);
		goto again;
	}

	/* Shut down progress reporting as we are done */
	pgstat_progress_end_command();

	launcher_running = false;
	DataChecksumState->launcher_running = false;
	LWLockRelease(DataChecksumsWorkerLock);
}
1193 :
1194 : /*
1195 : * ProcessAllDatabases
1196 : * Compute the list of all databases and process checksums in each
1197 : *
1198 : * This will generate a list of databases to process for enabling checksums.
1199 : * If a database encounters a failure then processing will end immediately and
1200 : * return an error.
1201 : */
static bool
ProcessAllDatabases(void)
{
	List	   *DatabaseList;
	int			cumulative_total = 0;

	/* Set up so first run processes shared catalogs, not once in every db */
	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
	DataChecksumState->process_shared_catalogs = true;
	LWLockRelease(DataChecksumsWorkerLock);

	/* Get a list of all databases to process */
	WaitForAllTransactionsToFinish();
	DatabaseList = BuildDatabaseList();

	/*
	 * Update progress reporting with the total number of databases we need to
	 * process. This number should not be changed during processing, the
	 * columns for processed databases is instead increased such that it can
	 * be compared against the total.
	 */
	{
		const int	index[] = {
			PROGRESS_DATACHECKSUMS_DBS_TOTAL,
			PROGRESS_DATACHECKSUMS_DBS_DONE,
			PROGRESS_DATACHECKSUMS_RELS_TOTAL,
			PROGRESS_DATACHECKSUMS_RELS_DONE,
			PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL,
			PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
		};

		int64		vals[6];

		vals[0] = list_length(DatabaseList);
		vals[1] = 0;
		/* translated to NULL */
		vals[2] = -1;
		vals[3] = -1;
		vals[4] = -1;
		vals[5] = -1;

		pgstat_progress_update_multi_param(6, index, vals);
	}

	foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList)
	{
		DataChecksumsWorkerResult result;

		result = ProcessDatabase(db);

#ifdef USE_INJECTION_POINTS
		/* Allow a test process to alter the result of the operation */
		if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fail-db-result"))
		{
			result = DATACHECKSUMSWORKER_FAILED;
			INJECTION_POINT_CACHED("datachecksumsworker-fail-db-result",
								   db->dbname);
		}
#endif

		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE,
									 ++cumulative_total);

		if (result == DATACHECKSUMSWORKER_FAILED)
		{
			/*
			 * Disable checksums on cluster, because we failed one of the
			 * databases and this is an all or nothing process.
			 */
			SetDataChecksumsOff();
			ereport(ERROR,
					errcode(ERRCODE_INSUFFICIENT_RESOURCES),
					errmsg("data checksums failed to get enabled in all databases, aborting"),
					errhint("The server log might have more information on the cause of the error."));
		}
		else if (result == DATACHECKSUMSWORKER_ABORTED || abort_requested)
		{
			/* Abort flag set, so exit the whole process */
			/*
			 * NOTE(review): DatabaseList is not freed on this early exit;
			 * confirm leaking it into the caller's context is acceptable.
			 */
			return false;
		}

		/*
		 * When one database has completed, it will have done shared catalogs
		 * so we don't have to process them again.
		 */
		LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
		DataChecksumState->process_shared_catalogs = false;
		LWLockRelease(DataChecksumsWorkerLock);
	}

	FreeDatabaseList(DatabaseList);

	pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
								 PROGRESS_DATACHECKSUMS_PHASE_WAITING_BARRIER);
	return true;
}
1298 :
1299 : /*
1300 : * DataChecksumsShmemRequest
1301 : * Request datachecksumsworker-related shared memory
1302 : */
1303 : static void
1304 1239 : DataChecksumsShmemRequest(void *arg)
1305 : {
1306 1239 : ShmemRequestStruct(.name = "DataChecksumsWorker Data",
1307 : .size = sizeof(DataChecksumsStateStruct),
1308 : .ptr = (void **) &DataChecksumState,
1309 : );
1310 1239 : }
1311 :
1312 : /*
1313 : * DatabaseExists
1314 : *
1315 : * Scans the system catalog to check if a database with the given Oid exists
1316 : * and returns true if it is found and valid, else false. Note, we cannot use
1317 : * database_is_invalid_oid here as it will ERROR out, and we want to gracefully
1318 : * handle errors.
1319 : */
1320 : static bool
1321 0 : DatabaseExists(Oid dboid)
1322 : {
1323 : Relation rel;
1324 : ScanKeyData skey;
1325 : SysScanDesc scan;
1326 : bool found;
1327 : HeapTuple tuple;
1328 : Form_pg_database pg_database_tuple;
1329 :
1330 0 : StartTransactionCommand();
1331 :
1332 0 : rel = table_open(DatabaseRelationId, AccessShareLock);
1333 0 : ScanKeyInit(&skey,
1334 : Anum_pg_database_oid,
1335 : BTEqualStrategyNumber, F_OIDEQ,
1336 : ObjectIdGetDatum(dboid));
1337 0 : scan = systable_beginscan(rel, DatabaseOidIndexId, true, SnapshotSelf,
1338 : 1, &skey);
1339 0 : tuple = systable_getnext(scan);
1340 0 : found = HeapTupleIsValid(tuple);
1341 :
1342 : /* If the Oid exists, ensure that it's not partially dropped */
1343 0 : if (found)
1344 : {
1345 0 : pg_database_tuple = (Form_pg_database) GETSTRUCT(tuple);
1346 0 : if (database_is_invalid_form(pg_database_tuple))
1347 0 : found = false;
1348 : }
1349 :
1350 0 : systable_endscan(scan);
1351 0 : table_close(rel, AccessShareLock);
1352 :
1353 0 : CommitTransactionCommand();
1354 :
1355 0 : return found;
1356 : }
1357 :
1358 : /*
1359 : * BuildDatabaseList
1360 : * Compile a list of all currently available databases in the cluster
1361 : *
1362 : * This creates the list of databases for the datachecksumsworker workers to
1363 : * add checksums to. If the caller wants to ensure that no concurrently
1364 : * running CREATE DATABASE calls exist, this needs to be preceded by a call
1365 : * to WaitForAllTransactionsToFinish().
1366 : */
1367 : static List *
1368 8 : BuildDatabaseList(void)
1369 : {
1370 8 : List *DatabaseList = NIL;
1371 : Relation rel;
1372 : TableScanDesc scan;
1373 : HeapTuple tup;
1374 8 : MemoryContext ctx = CurrentMemoryContext;
1375 : MemoryContext oldctx;
1376 :
1377 8 : StartTransactionCommand();
1378 :
1379 8 : rel = table_open(DatabaseRelationId, AccessShareLock);
1380 8 : scan = table_beginscan_catalog(rel, 0, NULL);
1381 :
1382 32 : while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1383 : {
1384 24 : Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup);
1385 : DataChecksumsWorkerDatabase *db;
1386 :
1387 24 : oldctx = MemoryContextSwitchTo(ctx);
1388 :
1389 24 : db = (DataChecksumsWorkerDatabase *) palloc0(sizeof(DataChecksumsWorkerDatabase));
1390 :
1391 24 : db->dboid = pgdb->oid;
1392 24 : db->dbname = pstrdup(NameStr(pgdb->datname));
1393 :
1394 24 : DatabaseList = lappend(DatabaseList, db);
1395 :
1396 24 : MemoryContextSwitchTo(oldctx);
1397 : }
1398 :
1399 8 : table_endscan(scan);
1400 8 : table_close(rel, AccessShareLock);
1401 :
1402 8 : CommitTransactionCommand();
1403 :
1404 8 : return DatabaseList;
1405 : }
1406 :
1407 : static void
1408 6 : FreeDatabaseList(List *dblist)
1409 : {
1410 6 : if (!dblist)
1411 0 : return;
1412 :
1413 30 : foreach_ptr(DataChecksumsWorkerDatabase, db, dblist)
1414 : {
1415 18 : if (db->dbname != NULL)
1416 18 : pfree(db->dbname);
1417 : }
1418 :
1419 6 : list_free_deep(dblist);
1420 : }
1421 :
1422 : /*
1423 : * BuildRelationList
1424 : * Compile a list of relations in the database
1425 : *
1426 : * Returns a list of OIDs for the request relation types. If temp_relations
1427 : * is True then only temporary relations are returned. If temp_relations is
1428 : * False then non-temporary relations which have data checksums are returned.
1429 : * If include_shared is True then shared relations are included as well in a
1430 : * non-temporary list. include_shared has no relevance when building a list of
1431 : * temporary relations.
1432 : */
1433 : static List *
1434 59 : BuildRelationList(bool temp_relations, bool include_shared)
1435 : {
1436 59 : List *RelationList = NIL;
1437 : Relation rel;
1438 : TableScanDesc scan;
1439 : HeapTuple tup;
1440 59 : MemoryContext ctx = CurrentMemoryContext;
1441 : MemoryContext oldctx;
1442 :
1443 59 : StartTransactionCommand();
1444 :
1445 59 : rel = table_open(RelationRelationId, AccessShareLock);
1446 59 : scan = table_beginscan_catalog(rel, 0, NULL);
1447 :
1448 26749 : while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1449 : {
1450 26690 : Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup);
1451 :
1452 : /* Only include temporary relations when explicitly asked to */
1453 26690 : if (pgc->relpersistence == RELPERSISTENCE_TEMP)
1454 : {
1455 2 : if (!temp_relations)
1456 1 : continue;
1457 : }
1458 : else
1459 : {
1460 : /*
1461 : * If we are only interested in temp relations then continue
1462 : * immediately as the current relation isn't a temp relation.
1463 : */
1464 26688 : if (temp_relations)
1465 17641 : continue;
1466 :
1467 9047 : if (!RELKIND_HAS_STORAGE(pgc->relkind))
1468 3240 : continue;
1469 :
1470 5807 : if (pgc->relisshared && !include_shared)
1471 552 : continue;
1472 : }
1473 :
1474 5256 : oldctx = MemoryContextSwitchTo(ctx);
1475 5256 : RelationList = lappend_oid(RelationList, pgc->oid);
1476 5256 : MemoryContextSwitchTo(oldctx);
1477 : }
1478 :
1479 59 : table_endscan(scan);
1480 59 : table_close(rel, AccessShareLock);
1481 :
1482 59 : CommitTransactionCommand();
1483 :
1484 59 : return RelationList;
1485 : }
1486 :
1487 : /*
1488 : * DataChecksumsWorkerMain
1489 : *
1490 : * Main function for enabling checksums in a single database. This is the
1491 : * function set as the bgw_function_name in the dynamic background worker
1492 : * process initiated for each database by the worker launcher. After enabling
1493 : * data checksums in each applicable relation in the database, it will wait for
1494 : * all temporary relations that were present when the function started to
1495 : * disappear before returning. This is required since we cannot rewrite
1496 : * existing temporary relations with data checksums.
1497 : */
void
DataChecksumsWorkerMain(Datum arg)
{
	Oid			dboid = DatumGetObjectId(arg);
	List	   *RelationList = NIL;
	List	   *InitialTempTableList = NIL;
	BufferAccessStrategy strategy;
	bool		aborted = false;
	int64		rels_done;
#ifdef USE_INJECTION_POINTS
	bool		retried = false;
#endif

	/* This worker only runs when enabling; disabling needs no per-page work */
	operation = ENABLE_DATACHECKSUMS;

	pqsignal(SIGTERM, die);
	pqsignal(SIGUSR1, procsignal_sigusr1_handler);

	BackgroundWorkerUnblockSignals();

	MyBackendType = B_DATACHECKSUMSWORKER_WORKER;
	init_ps_display(NULL);

	BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid,
											  BGWORKER_BYPASS_ALLOWCONN);

	/* worker will have a separate entry in pg_stat_progress_data_checksums */
	pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
								  InvalidOid);

	/*
	 * Get a list of all temp tables present as we start in this database. We
	 * need to wait until they are all gone until we are done, since we cannot
	 * access these relations and modify them.
	 */
	InitialTempTableList = BuildRelationList(true, false);

	/*
	 * Enable vacuum cost delay, if any. While this process isn't doing any
	 * vacuuming, we are re-using the infrastructure that vacuum cost delay
	 * provides rather than inventing something bespoke. This is an internal
	 * implementation detail and care should be taken to avoid it bleeding
	 * through to the user to avoid confusion.
	 */
	/*
	 * NOTE(review): cost_delay/cost_limit are read here without holding
	 * DataChecksumsWorkerLock -- confirm the launcher always sets them
	 * before this worker can start.
	 */
	VacuumCostDelay = DataChecksumState->cost_delay;
	VacuumCostLimit = DataChecksumState->cost_limit;
	VacuumCostActive = (VacuumCostDelay > 0);
	VacuumCostBalance = 0;
	VacuumCostPageHit = 0;
	VacuumCostPageMiss = 0;
	VacuumCostPageDirty = 0;

	/*
	 * Create and set the vacuum strategy as our buffer strategy.
	 */
	strategy = GetAccessStrategy(BAS_VACUUM);

	RelationList = BuildRelationList(false,
									 DataChecksumState->process_shared_catalogs);

	/* Update the total number of relations to be processed in this DB. */
	{
		const int	index[] = {
			PROGRESS_DATACHECKSUMS_RELS_TOTAL,
			PROGRESS_DATACHECKSUMS_RELS_DONE
		};

		int64		vals[2];

		vals[0] = list_length(RelationList);
		vals[1] = 0;

		pgstat_progress_update_multi_param(2, index, vals);
	}

	/* Process the relations */
	rels_done = 0;
	foreach_oid(reloid, RelationList)
	{
		bool		costs_updated = false;

		if (!ProcessSingleRelationByOid(reloid, strategy))
		{
			aborted = true;
			break;
		}

		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_RELS_DONE,
									 ++rels_done);
		CHECK_FOR_INTERRUPTS();
		CHECK_FOR_ABORT_REQUEST();

		if (abort_requested)
			break;

		/*
		 * Check if the cost settings changed during runtime and if so, update
		 * to reflect the new values and signal that the access strategy needs
		 * to be refreshed.
		 */
		LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
		if ((DataChecksumState->launch_cost_delay != DataChecksumState->cost_delay)
			|| (DataChecksumState->launch_cost_limit != DataChecksumState->cost_limit))
		{
			costs_updated = true;
			VacuumCostDelay = DataChecksumState->launch_cost_delay;
			VacuumCostLimit = DataChecksumState->launch_cost_limit;
			VacuumCostActive = (VacuumCostDelay > 0);

			DataChecksumState->cost_delay = DataChecksumState->launch_cost_delay;
			DataChecksumState->cost_limit = DataChecksumState->launch_cost_limit;
		}
		else
			costs_updated = false;
		LWLockRelease(DataChecksumsWorkerLock);

		/* Rebuild the buffer access strategy after a cost settings change */
		if (costs_updated)
		{
			FreeAccessStrategy(strategy);
			strategy = GetAccessStrategy(BAS_VACUUM);
		}
	}

	list_free(RelationList);
	FreeAccessStrategy(strategy);

	/* Report an abort, either detected locally or requested via signal */
	if (aborted || abort_requested)
	{
		LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
		DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED;
		LWLockRelease(DataChecksumsWorkerLock);
		ereport(DEBUG1,
				errmsg("data checksum processing aborted in database OID %u",
					   dboid));
		return;
	}

	/* The worker is about to wait for temporary tables to go away. */
	pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
								 PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL);

	/*
	 * Wait for all temp tables that existed when we started to go away. This
	 * is necessary since we cannot "reach" them to enable checksums. Any temp
	 * tables created after we started will already have checksums in them
	 * (due to the "inprogress-on" state), so no need to wait for those.
	 */
	for (;;)
	{
		List	   *CurrentTempTables;
		int			numleft;
		char		activity[64];

		/* Count how many of the initial temp tables still exist */
		CurrentTempTables = BuildRelationList(true, false);
		numleft = 0;
		foreach_oid(tmptbloid, InitialTempTableList)
		{
			if (list_member_oid(CurrentTempTables, tmptbloid))
				numleft++;
		}
		list_free(CurrentTempTables);

#ifdef USE_INJECTION_POINTS
		if (IS_INJECTION_POINT_ATTACHED("datachecksumsworker-fake-temptable-wait"))
		{
			/* Make sure to just cause one retry */
			if (!retried && numleft == 0)
			{
				numleft = 1;
				retried = true;

				INJECTION_POINT_CACHED("datachecksumsworker-fake-temptable-wait", NULL);
			}
		}
#endif

		if (numleft == 0)
			break;

		/*
		 * At least one temp table is left to wait for, indicate in pgstat
		 * activity and progress reporting.
		 */
		snprintf(activity,
				 sizeof(activity),
				 "Waiting for %d temp tables to be removed", numleft);
		pgstat_report_activity(STATE_RUNNING, activity);

		/* Retry every 3 seconds */
		ResetLatch(MyLatch);
		(void) WaitLatch(MyLatch,
						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
						 3000,
						 WAIT_EVENT_CHECKSUM_ENABLE_TEMPTABLE_WAIT);

		CHECK_FOR_INTERRUPTS();
		CHECK_FOR_ABORT_REQUEST();

		/* An abort during the wait ends processing without success */
		if (aborted || abort_requested)
		{
			LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
			DataChecksumState->success = DATACHECKSUMSWORKER_ABORTED;
			LWLockRelease(DataChecksumsWorkerLock);
			ereport(LOG,
					errmsg("data checksum processing aborted in database OID %u",
						   dboid));
			return;
		}
	}

	list_free(InitialTempTableList);

	/* worker done */
	pgstat_progress_end_command();

	/* Publish success so the waiting launcher picks up the result */
	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
	DataChecksumState->success = DATACHECKSUMSWORKER_SUCCESSFUL;
	LWLockRelease(DataChecksumsWorkerLock);
}
|