Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * checkpointer.c
4 : *
5 : * The checkpointer is new as of Postgres 9.2. It handles all checkpoints.
6 : * Checkpoints are automatically dispatched after a certain amount of time has
7 : * elapsed since the last one, and it can be signaled to perform requested
8 : * checkpoints as well. (The GUC parameter that mandates a checkpoint every
9 : * so many WAL segments is implemented by having backends signal when they
10 : * fill WAL segments; the checkpointer itself doesn't watch for the
11 : * condition.)
12 : *
13 : * Normal termination is by SIGUSR2, which instructs the checkpointer to
14 : * execute a shutdown checkpoint and then exit(0). (All backends must be
15 : * stopped before SIGUSR2 is issued!) Emergency termination is by SIGQUIT;
16 : * like any backend, the checkpointer will simply abort and exit on SIGQUIT.
17 : *
18 : * If the checkpointer exits unexpectedly, the postmaster treats that the same
19 : * as a backend crash: shared memory may be corrupted, so remaining backends
20 : * should be killed by SIGQUIT and then a recovery cycle started. (Even if
21 : * shared memory isn't corrupted, we have lost information about which
22 : * files need to be fsync'd for the next checkpoint, and so a system
23 : * restart needs to be forced.)
24 : *
25 : *
26 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
27 : *
28 : *
29 : * IDENTIFICATION
30 : * src/backend/postmaster/checkpointer.c
31 : *
32 : *-------------------------------------------------------------------------
33 : */
34 : #include "postgres.h"
35 :
36 : #include <sys/time.h>
37 : #include <time.h>
38 :
39 : #include "access/xlog.h"
40 : #include "access/xlog_internal.h"
41 : #include "access/xlogrecovery.h"
42 : #include "libpq/pqsignal.h"
43 : #include "miscadmin.h"
44 : #include "pgstat.h"
45 : #include "postmaster/auxprocess.h"
46 : #include "postmaster/bgwriter.h"
47 : #include "postmaster/interrupt.h"
48 : #include "replication/syncrep.h"
49 : #include "storage/bufmgr.h"
50 : #include "storage/condition_variable.h"
51 : #include "storage/fd.h"
52 : #include "storage/ipc.h"
53 : #include "storage/lwlock.h"
54 : #include "storage/proc.h"
55 : #include "storage/procsignal.h"
56 : #include "storage/shmem.h"
57 : #include "storage/smgr.h"
58 : #include "storage/spin.h"
59 : #include "utils/guc.h"
60 : #include "utils/memutils.h"
61 : #include "utils/resowner.h"
62 :
63 :
64 : /*----------
65 : * Shared memory area for communication between checkpointer and backends
66 : *
67 : * The ckpt counters allow backends to watch for completion of a checkpoint
68 : * request they send. Here's how it works:
69 : * * At start of a checkpoint, checkpointer reads (and clears) the request
70 : * flags and increments ckpt_started, while holding ckpt_lck.
71 : * * On completion of a checkpoint, checkpointer sets ckpt_done to
72 : * equal ckpt_started.
73 : * * On failure of a checkpoint, checkpointer increments ckpt_failed
74 : * and sets ckpt_done to equal ckpt_started.
75 : *
76 : * The algorithm for backends is:
77 : * 1. Record current values of ckpt_failed and ckpt_started, and
78 : * set request flags, while holding ckpt_lck.
79 : * 2. Send signal to request checkpoint.
80 : * 3. Sleep until ckpt_started changes. Now you know a checkpoint has
81 : * begun since you started this algorithm (although *not* that it was
82 : * specifically initiated by your signal), and that it is using your flags.
83 : * 4. Record new value of ckpt_started.
84 : * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo
85 : * arithmetic here in case counters wrap around.) Now you know a
86 : * checkpoint has started and completed, but not whether it was
87 : * successful.
88 : * 6. If ckpt_failed is different from the originally saved value,
89 : * assume request failed; otherwise it was definitely successful.
90 : *
91 : * ckpt_flags holds the OR of the checkpoint request flags sent by all
92 : * requesting backends since the last checkpoint start. The flags are
93 : * chosen so that OR'ing is the correct way to combine multiple requests.
94 : *
95 : * The requests array holds fsync requests sent by backends and not yet
96 : * absorbed by the checkpointer.
97 : *
 * Unlike the ckpt_* fields (which are protected by the ckpt_lck spinlock),
 * the request-related fields are protected by CheckpointerCommLock.
100 : *----------
101 : */
102 : typedef struct
103 : {
104 : SyncRequestType type; /* request type */
105 : FileTag ftag; /* file identifier */
106 : } CheckpointerRequest;
107 :
108 : typedef struct
109 : {
110 : pid_t checkpointer_pid; /* PID (0 if not started) */
111 :
112 : slock_t ckpt_lck; /* protects all the ckpt_* fields */
113 :
114 : int ckpt_started; /* advances when checkpoint starts */
115 : int ckpt_done; /* advances when checkpoint done */
116 : int ckpt_failed; /* advances when checkpoint fails */
117 :
118 : int ckpt_flags; /* checkpoint flags, as defined in xlog.h */
119 :
120 : ConditionVariable start_cv; /* signaled when ckpt_started advances */
121 : ConditionVariable done_cv; /* signaled when ckpt_done advances */
122 :
123 : int num_requests; /* current # of requests */
124 : int max_requests; /* allocated array size */
125 : CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
126 : } CheckpointerShmemStruct;
127 :
128 : static CheckpointerShmemStruct *CheckpointerShmem;
129 :
130 : /* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */
131 : #define WRITES_PER_ABSORB 1000
132 :
133 : /*
134 : * GUC parameters
135 : */
136 : int CheckPointTimeout = 300;
137 : int CheckPointWarning = 30;
138 : double CheckPointCompletionTarget = 0.9;
139 :
140 : /*
141 : * Private state
142 : */
143 : static bool ckpt_active = false;
144 :
145 : /* these values are valid when ckpt_active is true: */
146 : static pg_time_t ckpt_start_time;
147 : static XLogRecPtr ckpt_start_recptr;
148 : static double ckpt_cached_elapsed;
149 :
150 : static pg_time_t last_checkpoint_time;
151 : static pg_time_t last_xlog_switch_time;
152 :
153 : /* Prototypes for private functions */
154 :
155 : static void HandleCheckpointerInterrupts(void);
156 : static void CheckArchiveTimeout(void);
157 : static bool IsCheckpointOnSchedule(double progress);
158 : static bool ImmediateCheckpointRequested(void);
159 : static bool CompactCheckpointerRequestQueue(void);
160 : static void UpdateSharedMemoryConfig(void);
161 :
162 : /* Signal handlers */
163 : static void ReqCheckpointHandler(SIGNAL_ARGS);
164 :
165 :
166 : /*
167 : * Main entry point for checkpointer process
168 : *
169 : * This is invoked from AuxiliaryProcessMain, which has already created the
170 : * basic execution environment, but not enabled signals yet.
171 : */
172 : void
173 900 : CheckpointerMain(char *startup_data, size_t startup_data_len)
174 : {
175 : sigjmp_buf local_sigjmp_buf;
176 : MemoryContext checkpointer_context;
177 :
178 : Assert(startup_data_len == 0);
179 :
180 900 : MyBackendType = B_CHECKPOINTER;
181 900 : AuxiliaryProcessMainCommon();
182 :
183 900 : CheckpointerShmem->checkpointer_pid = MyProcPid;
184 :
185 : /*
186 : * Properly accept or ignore signals the postmaster might send us
187 : *
188 : * Note: we deliberately ignore SIGTERM, because during a standard Unix
189 : * system shutdown cycle, init will SIGTERM all processes at once. We
190 : * want to wait for the backends to exit, whereupon the postmaster will
191 : * tell us it's okay to shut down (via SIGUSR2).
192 : */
193 900 : pqsignal(SIGHUP, SignalHandlerForConfigReload);
194 900 : pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */
195 900 : pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
196 : /* SIGQUIT handler was already set up by InitPostmasterChild */
197 900 : pqsignal(SIGALRM, SIG_IGN);
198 900 : pqsignal(SIGPIPE, SIG_IGN);
199 900 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
200 900 : pqsignal(SIGUSR2, SignalHandlerForShutdownRequest);
201 :
202 : /*
203 : * Reset some signals that are accepted by postmaster but not here
204 : */
205 900 : pqsignal(SIGCHLD, SIG_DFL);
206 :
207 : /*
208 : * Initialize so that first time-driven event happens at the correct time.
209 : */
210 900 : last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
211 :
212 : /*
213 : * Write out stats after shutdown. This needs to be called by exactly one
214 : * process during a normal shutdown, and since checkpointer is shut down
215 : * very late...
216 : *
217 : * Walsenders are shut down after the checkpointer, but currently don't
218 : * report stats. If that changes, we need a more complicated solution.
219 : */
220 900 : before_shmem_exit(pgstat_before_server_shutdown, 0);
221 :
222 : /*
223 : * Create a memory context that we will do all our work in. We do this so
224 : * that we can reset the context during error recovery and thereby avoid
225 : * possible memory leaks. Formerly this code just ran in
226 : * TopMemoryContext, but resetting that would be a really bad idea.
227 : */
228 900 : checkpointer_context = AllocSetContextCreate(TopMemoryContext,
229 : "Checkpointer",
230 : ALLOCSET_DEFAULT_SIZES);
231 900 : MemoryContextSwitchTo(checkpointer_context);
232 :
233 : /*
234 : * If an exception is encountered, processing resumes here.
235 : *
236 : * You might wonder why this isn't coded as an infinite loop around a
237 : * PG_TRY construct. The reason is that this is the bottom of the
238 : * exception stack, and so with PG_TRY there would be no exception handler
239 : * in force at all during the CATCH part. By leaving the outermost setjmp
240 : * always active, we have at least some chance of recovering from an error
241 : * during error recovery. (If we get into an infinite loop thereby, it
242 : * will soon be stopped by overflow of elog.c's internal state stack.)
243 : *
244 : * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask
245 : * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus,
246 : * signals other than SIGQUIT will be blocked until we complete error
247 : * recovery. It might seem that this policy makes the HOLD_INTERRUPTS()
248 : * call redundant, but it is not since InterruptPending might be set
249 : * already.
250 : */
251 900 : if (sigsetjmp(local_sigjmp_buf, 1) != 0)
252 : {
253 : /* Since not using PG_TRY, must reset error stack by hand */
254 0 : error_context_stack = NULL;
255 :
256 : /* Prevent interrupts while cleaning up */
257 0 : HOLD_INTERRUPTS();
258 :
259 : /* Report the error to the server log */
260 0 : EmitErrorReport();
261 :
262 : /*
263 : * These operations are really just a minimal subset of
264 : * AbortTransaction(). We don't have very many resources to worry
265 : * about in checkpointer, but we do have LWLocks, buffers, and temp
266 : * files.
267 : */
268 0 : LWLockReleaseAll();
269 0 : ConditionVariableCancelSleep();
270 0 : pgstat_report_wait_end();
271 0 : UnlockBuffers();
272 0 : ReleaseAuxProcessResources(false);
273 0 : AtEOXact_Buffers(false);
274 0 : AtEOXact_SMgr();
275 0 : AtEOXact_Files(false);
276 0 : AtEOXact_HashTables(false);
277 :
278 : /* Warn any waiting backends that the checkpoint failed. */
279 0 : if (ckpt_active)
280 : {
281 0 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
282 0 : CheckpointerShmem->ckpt_failed++;
283 0 : CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
284 0 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
285 :
286 0 : ConditionVariableBroadcast(&CheckpointerShmem->done_cv);
287 :
288 0 : ckpt_active = false;
289 : }
290 :
291 : /*
292 : * Now return to normal top-level context and clear ErrorContext for
293 : * next time.
294 : */
295 0 : MemoryContextSwitchTo(checkpointer_context);
296 0 : FlushErrorState();
297 :
298 : /* Flush any leaked data in the top-level context */
299 0 : MemoryContextReset(checkpointer_context);
300 :
301 : /* Now we can allow interrupts again */
302 0 : RESUME_INTERRUPTS();
303 :
304 : /*
305 : * Sleep at least 1 second after any error. A write error is likely
306 : * to be repeated, and we don't want to be filling the error logs as
307 : * fast as we can.
308 : */
309 0 : pg_usleep(1000000L);
310 : }
311 :
312 : /* We can now handle ereport(ERROR) */
313 900 : PG_exception_stack = &local_sigjmp_buf;
314 :
315 : /*
316 : * Unblock signals (they were blocked when the postmaster forked us)
317 : */
318 900 : sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
319 :
320 : /*
321 : * Ensure all shared memory values are set correctly for the config. Doing
322 : * this here ensures no race conditions from other concurrent updaters.
323 : */
324 900 : UpdateSharedMemoryConfig();
325 :
326 : /*
327 : * Advertise our proc number that backends can use to wake us up while
328 : * we're sleeping.
329 : */
330 900 : ProcGlobal->checkpointerProc = MyProcNumber;
331 :
332 : /*
333 : * Loop forever
334 : */
335 : for (;;)
336 5752 : {
337 6652 : bool do_checkpoint = false;
338 6652 : int flags = 0;
339 : pg_time_t now;
340 : int elapsed_secs;
341 : int cur_timeout;
342 6652 : bool chkpt_or_rstpt_requested = false;
343 6652 : bool chkpt_or_rstpt_timed = false;
344 :
345 : /* Clear any already-pending wakeups */
346 6652 : ResetLatch(MyLatch);
347 :
348 : /*
349 : * Process any requests or signals received recently.
350 : */
351 6652 : AbsorbSyncRequests();
352 6652 : HandleCheckpointerInterrupts();
353 :
354 : /*
355 : * Detect a pending checkpoint request by checking whether the flags
356 : * word in shared memory is nonzero. We shouldn't need to acquire the
357 : * ckpt_lck for this.
358 : */
359 5776 : if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
360 : {
361 1158 : do_checkpoint = true;
362 1158 : chkpt_or_rstpt_requested = true;
363 : }
364 :
365 : /*
366 : * Force a checkpoint if too much time has elapsed since the last one.
367 : * Note that we count a timed checkpoint in stats only when this
368 : * occurs without an external request, but we set the CAUSE_TIME flag
369 : * bit even if there is also an external request.
370 : */
371 5776 : now = (pg_time_t) time(NULL);
372 5776 : elapsed_secs = now - last_checkpoint_time;
373 5776 : if (elapsed_secs >= CheckPointTimeout)
374 : {
375 2 : if (!do_checkpoint)
376 2 : chkpt_or_rstpt_timed = true;
377 2 : do_checkpoint = true;
378 2 : flags |= CHECKPOINT_CAUSE_TIME;
379 : }
380 :
381 : /*
382 : * Do a checkpoint if requested.
383 : */
384 5776 : if (do_checkpoint)
385 : {
386 1160 : bool ckpt_performed = false;
387 : bool do_restartpoint;
388 :
389 : /* Check if we should perform a checkpoint or a restartpoint. */
390 1160 : do_restartpoint = RecoveryInProgress();
391 :
392 : /*
393 : * Atomically fetch the request flags to figure out what kind of a
394 : * checkpoint we should perform, and increase the started-counter
395 : * to acknowledge that we've started a new checkpoint.
396 : */
397 1160 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
398 1160 : flags |= CheckpointerShmem->ckpt_flags;
399 1160 : CheckpointerShmem->ckpt_flags = 0;
400 1160 : CheckpointerShmem->ckpt_started++;
401 1160 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
402 :
403 1160 : ConditionVariableBroadcast(&CheckpointerShmem->start_cv);
404 :
405 : /*
406 : * The end-of-recovery checkpoint is a real checkpoint that's
407 : * performed while we're still in recovery.
408 : */
409 1160 : if (flags & CHECKPOINT_END_OF_RECOVERY)
410 40 : do_restartpoint = false;
411 :
412 1160 : if (chkpt_or_rstpt_timed)
413 : {
414 2 : chkpt_or_rstpt_timed = false;
415 2 : if (do_restartpoint)
416 0 : PendingCheckpointerStats.restartpoints_timed++;
417 : else
418 2 : PendingCheckpointerStats.num_timed++;
419 : }
420 :
421 1160 : if (chkpt_or_rstpt_requested)
422 : {
423 1158 : chkpt_or_rstpt_requested = false;
424 1158 : if (do_restartpoint)
425 402 : PendingCheckpointerStats.restartpoints_requested++;
426 : else
427 756 : PendingCheckpointerStats.num_requested++;
428 : }
429 :
430 : /*
431 : * We will warn if (a) too soon since last checkpoint (whatever
432 : * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
433 : * since the last checkpoint start. Note in particular that this
434 : * implementation will not generate warnings caused by
435 : * CheckPointTimeout < CheckPointWarning.
436 : */
437 1160 : if (!do_restartpoint &&
438 758 : (flags & CHECKPOINT_CAUSE_XLOG) &&
439 362 : elapsed_secs < CheckPointWarning)
440 362 : ereport(LOG,
441 : (errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
442 : "checkpoints are occurring too frequently (%d seconds apart)",
443 : elapsed_secs,
444 : elapsed_secs),
445 : errhint("Consider increasing the configuration parameter \"%s\".", "max_wal_size")));
446 :
447 : /*
448 : * Initialize checkpointer-private variables used during
449 : * checkpoint.
450 : */
451 1160 : ckpt_active = true;
452 1160 : if (do_restartpoint)
453 402 : ckpt_start_recptr = GetXLogReplayRecPtr(NULL);
454 : else
455 758 : ckpt_start_recptr = GetInsertRecPtr();
456 1160 : ckpt_start_time = now;
457 1160 : ckpt_cached_elapsed = 0;
458 :
459 : /*
460 : * Do the checkpoint.
461 : */
462 1160 : if (!do_restartpoint)
463 758 : ckpt_performed = CreateCheckPoint(flags);
464 : else
465 402 : ckpt_performed = CreateRestartPoint(flags);
466 :
467 : /*
468 : * After any checkpoint, free all smgr objects. Otherwise we
469 : * would never do so for dropped relations, as the checkpointer
470 : * does not process shared invalidation messages or call
471 : * AtEOXact_SMgr().
472 : */
473 1160 : smgrdestroyall();
474 :
475 : /*
476 : * Indicate checkpoint completion to any waiting backends.
477 : */
478 1160 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
479 1160 : CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
480 1160 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
481 :
482 1160 : ConditionVariableBroadcast(&CheckpointerShmem->done_cv);
483 :
484 1160 : if (!do_restartpoint)
485 : {
486 : /*
487 : * Note we record the checkpoint start time not end time as
488 : * last_checkpoint_time. This is so that time-driven
489 : * checkpoints happen at a predictable spacing.
490 : */
491 758 : last_checkpoint_time = now;
492 :
493 758 : if (ckpt_performed)
494 758 : PendingCheckpointerStats.num_performed++;
495 : }
496 : else
497 : {
498 402 : if (ckpt_performed)
499 : {
500 : /*
501 : * The same as for checkpoint. Please see the
502 : * corresponding comment.
503 : */
504 330 : last_checkpoint_time = now;
505 :
506 330 : PendingCheckpointerStats.restartpoints_performed++;
507 : }
508 : else
509 : {
510 : /*
511 : * We were not able to perform the restartpoint
512 : * (checkpoints throw an ERROR in case of error). Most
513 : * likely because we have not received any new checkpoint
514 : * WAL records since the last restartpoint. Try again in
515 : * 15 s.
516 : */
517 72 : last_checkpoint_time = now - CheckPointTimeout + 15;
518 : }
519 : }
520 :
521 1160 : ckpt_active = false;
522 :
523 : /* We may have received an interrupt during the checkpoint. */
524 1160 : HandleCheckpointerInterrupts();
525 : }
526 :
527 : /* Check for archive_timeout and switch xlog files if necessary. */
528 5758 : CheckArchiveTimeout();
529 :
530 : /* Report pending statistics to the cumulative stats system */
531 5758 : pgstat_report_checkpointer();
532 5758 : pgstat_report_wal(true);
533 :
534 : /*
535 : * If any checkpoint flags have been set, redo the loop to handle the
536 : * checkpoint without sleeping.
537 : */
538 5758 : if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
539 450 : continue;
540 :
541 : /*
542 : * Sleep until we are signaled or it's time for another checkpoint or
543 : * xlog file switch.
544 : */
545 5308 : now = (pg_time_t) time(NULL);
546 5308 : elapsed_secs = now - last_checkpoint_time;
547 5308 : if (elapsed_secs >= CheckPointTimeout)
548 0 : continue; /* no sleep for us ... */
549 5308 : cur_timeout = CheckPointTimeout - elapsed_secs;
550 5308 : if (XLogArchiveTimeout > 0 && !RecoveryInProgress())
551 : {
552 0 : elapsed_secs = now - last_xlog_switch_time;
553 0 : if (elapsed_secs >= XLogArchiveTimeout)
554 0 : continue; /* no sleep for us ... */
555 0 : cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
556 : }
557 :
558 5308 : (void) WaitLatch(MyLatch,
559 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
560 : cur_timeout * 1000L /* convert to ms */ ,
561 : WAIT_EVENT_CHECKPOINTER_MAIN);
562 : }
563 : }
564 :
565 : /*
566 : * Process any new interrupts.
567 : */
568 : static void
569 7812 : HandleCheckpointerInterrupts(void)
570 : {
571 7812 : if (ProcSignalBarrierPending)
572 98 : ProcessProcSignalBarrier();
573 :
574 7812 : if (ConfigReloadPending)
575 : {
576 102 : ConfigReloadPending = false;
577 102 : ProcessConfigFile(PGC_SIGHUP);
578 :
579 : /*
580 : * Checkpointer is the last process to shut down, so we ask it to hold
581 : * the keys for a range of other tasks required most of which have
582 : * nothing to do with checkpointing at all.
583 : *
584 : * For various reasons, some config values can change dynamically so
585 : * the primary copy of them is held in shared memory to make sure all
586 : * backends see the same value. We make Checkpointer responsible for
587 : * updating the shared memory copy if the parameter setting changes
588 : * because of SIGHUP.
589 : */
590 102 : UpdateSharedMemoryConfig();
591 : }
592 7812 : if (ShutdownRequestPending)
593 : {
594 : /*
595 : * From here on, elog(ERROR) should end with exit(1), not send control
596 : * back to the sigsetjmp block above
597 : */
598 894 : ExitOnAnyError = true;
599 :
600 : /*
601 : * Close down the database.
602 : *
603 : * Since ShutdownXLOG() creates restartpoint or checkpoint, and
604 : * updates the statistics, increment the checkpoint request and flush
605 : * out pending statistic.
606 : */
607 894 : PendingCheckpointerStats.num_requested++;
608 894 : ShutdownXLOG(0, 0);
609 894 : pgstat_report_checkpointer();
610 894 : pgstat_report_wal(true);
611 :
612 : /* Normal exit from the checkpointer is here */
613 894 : proc_exit(0); /* done */
614 : }
615 :
616 : /* Perform logging of memory contexts of this process */
617 6918 : if (LogMemoryContextPending)
618 2 : ProcessLogMemoryContextInterrupt();
619 6918 : }
620 :
621 : /*
622 : * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
623 : *
624 : * This will switch to a new WAL file and force an archive file write if
625 : * meaningful activity is recorded in the current WAL file. This includes most
626 : * writes, including just a single checkpoint record, but excludes WAL records
627 : * that were inserted with the XLOG_MARK_UNIMPORTANT flag being set (like
628 : * snapshots of running transactions). Such records, depending on
629 : * configuration, occur on regular intervals and don't contain important
630 : * information. This avoids generating archives with a few unimportant
631 : * records.
632 : */
633 : static void
634 19242 : CheckArchiveTimeout(void)
635 : {
636 : pg_time_t now;
637 : pg_time_t last_time;
638 : XLogRecPtr last_switch_lsn;
639 :
640 19242 : if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
641 19242 : return;
642 :
643 0 : now = (pg_time_t) time(NULL);
644 :
645 : /* First we do a quick check using possibly-stale local state. */
646 0 : if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
647 0 : return;
648 :
649 : /*
650 : * Update local state ... note that last_xlog_switch_time is the last time
651 : * a switch was performed *or requested*.
652 : */
653 0 : last_time = GetLastSegSwitchData(&last_switch_lsn);
654 :
655 0 : last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
656 :
657 : /* Now we can do the real checks */
658 0 : if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
659 : {
660 : /*
661 : * Switch segment only when "important" WAL has been logged since the
662 : * last segment switch (last_switch_lsn points to end of segment
663 : * switch occurred in).
664 : */
665 0 : if (GetLastImportantRecPtr() > last_switch_lsn)
666 : {
667 : XLogRecPtr switchpoint;
668 :
669 : /* mark switch as unimportant, avoids triggering checkpoints */
670 0 : switchpoint = RequestXLogSwitch(true);
671 :
672 : /*
673 : * If the returned pointer points exactly to a segment boundary,
674 : * assume nothing happened.
675 : */
676 0 : if (XLogSegmentOffset(switchpoint, wal_segment_size) != 0)
677 0 : elog(DEBUG1, "write-ahead log switch forced (\"archive_timeout\"=%d)",
678 : XLogArchiveTimeout);
679 : }
680 :
681 : /*
682 : * Update state in any case, so we don't retry constantly when the
683 : * system is idle.
684 : */
685 0 : last_xlog_switch_time = now;
686 : }
687 : }
688 :
689 : /*
690 : * Returns true if an immediate checkpoint request is pending. (Note that
691 : * this does not check the *current* checkpoint's IMMEDIATE flag, but whether
692 : * there is one pending behind it.)
693 : */
694 : static bool
695 87148 : ImmediateCheckpointRequested(void)
696 : {
697 87148 : volatile CheckpointerShmemStruct *cps = CheckpointerShmem;
698 :
699 : /*
700 : * We don't need to acquire the ckpt_lck in this case because we're only
701 : * looking at a single flag bit.
702 : */
703 87148 : if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE)
704 6308 : return true;
705 80840 : return false;
706 : }
707 :
708 : /*
709 : * CheckpointWriteDelay -- control rate of checkpoint
710 : *
711 : * This function is called after each page write performed by BufferSync().
712 : * It is responsible for throttling BufferSync()'s write rate to hit
713 : * checkpoint_completion_target.
714 : *
715 : * The checkpoint request flags should be passed in; currently the only one
716 : * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
717 : *
718 : * 'progress' is an estimate of how much of the work has been done, as a
719 : * fraction between 0.0 meaning none, and 1.0 meaning all done.
720 : */
721 : void
722 511298 : CheckpointWriteDelay(int flags, double progress)
723 : {
724 : static int absorb_counter = WRITES_PER_ABSORB;
725 :
726 : /* Do nothing if checkpoint is being executed by non-checkpointer process */
727 511298 : if (!AmCheckpointerProcess())
728 89942 : return;
729 :
730 : /*
731 : * Perform the usual duties and take a nap, unless we're behind schedule,
732 : * in which case we just try to catch up as quickly as possible.
733 : */
734 421356 : if (!(flags & CHECKPOINT_IMMEDIATE) &&
735 87614 : !ShutdownRequestPending &&
736 167988 : !ImmediateCheckpointRequested() &&
737 80840 : IsCheckpointOnSchedule(progress))
738 : {
739 13484 : if (ConfigReloadPending)
740 : {
741 0 : ConfigReloadPending = false;
742 0 : ProcessConfigFile(PGC_SIGHUP);
743 : /* update shmem copies of config variables */
744 0 : UpdateSharedMemoryConfig();
745 : }
746 :
747 13484 : AbsorbSyncRequests();
748 13484 : absorb_counter = WRITES_PER_ABSORB;
749 :
750 13484 : CheckArchiveTimeout();
751 :
752 : /* Report interim statistics to the cumulative stats system */
753 13484 : pgstat_report_checkpointer();
754 :
755 : /*
756 : * This sleep used to be connected to bgwriter_delay, typically 200ms.
757 : * That resulted in more frequent wakeups if not much work to do.
758 : * Checkpointer and bgwriter are no longer related so take the Big
759 : * Sleep.
760 : */
761 13484 : WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
762 : 100,
763 : WAIT_EVENT_CHECKPOINT_WRITE_DELAY);
764 13484 : ResetLatch(MyLatch);
765 : }
766 407872 : else if (--absorb_counter <= 0)
767 : {
768 : /*
769 : * Absorb pending fsync requests after each WRITES_PER_ABSORB write
770 : * operations even when we don't sleep, to prevent overflow of the
771 : * fsync request queue.
772 : */
773 172 : AbsorbSyncRequests();
774 172 : absorb_counter = WRITES_PER_ABSORB;
775 : }
776 :
777 : /* Check for barrier events. */
778 421356 : if (ProcSignalBarrierPending)
779 10 : ProcessProcSignalBarrier();
780 : }
781 :
782 : /*
783 : * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint
784 : * (or restartpoint) in time?
785 : *
786 : * Compares the current progress against the time/segments elapsed since last
787 : * checkpoint, and returns true if the progress we've made this far is greater
788 : * than the elapsed time/segments.
789 : */
790 : static bool
791 80840 : IsCheckpointOnSchedule(double progress)
792 : {
793 : XLogRecPtr recptr;
794 : struct timeval now;
795 : double elapsed_xlogs,
796 : elapsed_time;
797 :
798 : Assert(ckpt_active);
799 :
800 : /* Scale progress according to checkpoint_completion_target. */
801 80840 : progress *= CheckPointCompletionTarget;
802 :
803 : /*
804 : * Check against the cached value first. Only do the more expensive
805 : * calculations once we reach the target previously calculated. Since
806 : * neither time or WAL insert pointer moves backwards, a freshly
807 : * calculated value can only be greater than or equal to the cached value.
808 : */
809 80840 : if (progress < ckpt_cached_elapsed)
810 60922 : return false;
811 :
812 : /*
813 : * Check progress against WAL segments written and CheckPointSegments.
814 : *
815 : * We compare the current WAL insert location against the location
816 : * computed before calling CreateCheckPoint. The code in XLogInsert that
817 : * actually triggers a checkpoint when CheckPointSegments is exceeded
818 : * compares against RedoRecPtr, so this is not completely accurate.
819 : * However, it's good enough for our purposes, we're only calculating an
820 : * estimate anyway.
821 : *
822 : * During recovery, we compare last replayed WAL record's location with
823 : * the location computed before calling CreateRestartPoint. That maintains
824 : * the same pacing as we have during checkpoints in normal operation, but
825 : * we might exceed max_wal_size by a fair amount. That's because there can
826 : * be a large gap between a checkpoint's redo-pointer and the checkpoint
827 : * record itself, and we only start the restartpoint after we've seen the
828 : * checkpoint record. (The gap is typically up to CheckPointSegments *
829 : * checkpoint_completion_target where checkpoint_completion_target is the
830 : * value that was in effect when the WAL was generated).
831 : */
832 19918 : if (RecoveryInProgress())
833 9340 : recptr = GetXLogReplayRecPtr(NULL);
834 : else
835 10578 : recptr = GetInsertRecPtr();
836 19918 : elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) /
837 19918 : wal_segment_size) / CheckPointSegments;
838 :
839 19918 : if (progress < elapsed_xlogs)
840 : {
841 6428 : ckpt_cached_elapsed = elapsed_xlogs;
842 6428 : return false;
843 : }
844 :
845 : /*
846 : * Check progress against time elapsed and checkpoint_timeout.
847 : */
848 13490 : gettimeofday(&now, NULL);
849 13490 : elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +
850 13490 : now.tv_usec / 1000000.0) / CheckPointTimeout;
851 :
852 13490 : if (progress < elapsed_time)
853 : {
854 6 : ckpt_cached_elapsed = elapsed_time;
855 6 : return false;
856 : }
857 :
858 : /* It looks like we're on schedule. */
859 13484 : return true;
860 : }
861 :
862 :
863 : /* --------------------------------
864 : * signal handler routines
865 : * --------------------------------
866 : */
867 :
/*
 * ReqCheckpointHandler -- SIGINT handler: a checkpoint has been requested
 *
 * CheckpointerMain installs this for SIGINT.  It records nothing itself; it
 * only sets the process latch so that a sleeping main loop wakes up and
 * re-examines the request flags in shared memory.
 */
static void
ReqCheckpointHandler(SIGNAL_ARGS)
{
	/*
	 * The signaling process should have set ckpt_flags nonzero, so all we
	 * need do is ensure that our main loop gets kicked out of any wait.
	 */
	SetLatch(MyLatch);
}
878 :
879 :
880 : /* --------------------------------
881 : * communication with backends
882 : * --------------------------------
883 : */
884 :
885 : /*
886 : * CheckpointerShmemSize
887 : * Compute space needed for checkpointer-related shared memory
888 : */
889 : Size
890 5484 : CheckpointerShmemSize(void)
891 : {
892 : Size size;
893 :
894 : /*
895 : * Currently, the size of the requests[] array is arbitrarily set equal to
896 : * NBuffers. This may prove too large or small ...
897 : */
898 5484 : size = offsetof(CheckpointerShmemStruct, requests);
899 5484 : size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest)));
900 :
901 5484 : return size;
902 : }
903 :
904 : /*
905 : * CheckpointerShmemInit
906 : * Allocate and initialize checkpointer-related shared memory
907 : */
908 : void
909 1918 : CheckpointerShmemInit(void)
910 : {
911 1918 : Size size = CheckpointerShmemSize();
912 : bool found;
913 :
914 1918 : CheckpointerShmem = (CheckpointerShmemStruct *)
915 1918 : ShmemInitStruct("Checkpointer Data",
916 : size,
917 : &found);
918 :
919 1918 : if (!found)
920 : {
921 : /*
922 : * First time through, so initialize. Note that we zero the whole
923 : * requests array; this is so that CompactCheckpointerRequestQueue can
924 : * assume that any pad bytes in the request structs are zeroes.
925 : */
926 2202 : MemSet(CheckpointerShmem, 0, size);
927 1918 : SpinLockInit(&CheckpointerShmem->ckpt_lck);
928 1918 : CheckpointerShmem->max_requests = NBuffers;
929 1918 : ConditionVariableInit(&CheckpointerShmem->start_cv);
930 1918 : ConditionVariableInit(&CheckpointerShmem->done_cv);
931 : }
932 1918 : }
933 :
934 : /*
935 : * RequestCheckpoint
936 : * Called in backend processes to request a checkpoint
937 : *
938 : * flags is a bitwise OR of the following:
939 : * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
940 : * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
941 : * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
942 : * ignoring checkpoint_completion_target parameter.
943 : * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
944 : * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
945 : * CHECKPOINT_END_OF_RECOVERY).
946 : * CHECKPOINT_WAIT: wait for completion before returning (otherwise,
947 : * just signal checkpointer to do it, and return).
948 : * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
949 : * (This affects logging, and in particular enables CheckPointWarning.)
950 : */
951 : void
952 3984 : RequestCheckpoint(int flags)
953 : {
954 : int ntries;
955 : int old_failed,
956 : old_started;
957 :
958 : /*
959 : * If in a standalone backend, just do it ourselves.
960 : */
961 3984 : if (!IsPostmasterEnvironment)
962 : {
963 : /*
964 : * There's no point in doing slow checkpoints in a standalone backend,
965 : * because there's no other backends the checkpoint could disrupt.
966 : */
967 362 : CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE);
968 :
969 : /* Free all smgr objects, as CheckpointerMain() normally would. */
970 362 : smgrdestroyall();
971 :
972 362 : return;
973 : }
974 :
975 : /*
976 : * Atomically set the request flags, and take a snapshot of the counters.
977 : * When we see ckpt_started > old_started, we know the flags we set here
978 : * have been seen by checkpointer.
979 : *
980 : * Note that we OR the flags with any existing flags, to avoid overriding
981 : * a "stronger" request by another backend. The flag senses must be
982 : * chosen to make this work!
983 : */
984 3622 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
985 :
986 3622 : old_failed = CheckpointerShmem->ckpt_failed;
987 3622 : old_started = CheckpointerShmem->ckpt_started;
988 3622 : CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED);
989 :
990 3622 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
991 :
992 : /*
993 : * Send signal to request checkpoint. It's possible that the checkpointer
994 : * hasn't started yet, or is in process of restarting, so we will retry a
995 : * few times if needed. (Actually, more than a few times, since on slow
996 : * or overloaded buildfarm machines, it's been observed that the
997 : * checkpointer can take several seconds to start.) However, if not told
998 : * to wait for the checkpoint to occur, we consider failure to send the
999 : * signal to be nonfatal and merely LOG it. The checkpointer should see
1000 : * the request when it does start, with or without getting a signal.
1001 : */
1002 : #define MAX_SIGNAL_TRIES 600 /* max wait 60.0 sec */
1003 3632 : for (ntries = 0;; ntries++)
1004 : {
1005 3632 : if (CheckpointerShmem->checkpointer_pid == 0)
1006 : {
1007 10 : if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
1008 : {
1009 0 : elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
1010 : "could not signal for checkpoint: checkpointer is not running");
1011 0 : break;
1012 : }
1013 : }
1014 3622 : else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0)
1015 : {
1016 0 : if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
1017 : {
1018 0 : elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
1019 : "could not signal for checkpoint: %m");
1020 0 : break;
1021 : }
1022 : }
1023 : else
1024 3622 : break; /* signal sent successfully */
1025 :
1026 10 : CHECK_FOR_INTERRUPTS();
1027 10 : pg_usleep(100000L); /* wait 0.1 sec, then retry */
1028 : }
1029 :
1030 : /*
1031 : * If requested, wait for completion. We detect completion according to
1032 : * the algorithm given above.
1033 : */
1034 3622 : if (flags & CHECKPOINT_WAIT)
1035 : {
1036 : int new_started,
1037 : new_failed;
1038 :
1039 : /* Wait for a new checkpoint to start. */
1040 902 : ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv);
1041 : for (;;)
1042 : {
1043 1684 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1044 1684 : new_started = CheckpointerShmem->ckpt_started;
1045 1684 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1046 :
1047 1684 : if (new_started != old_started)
1048 902 : break;
1049 :
1050 782 : ConditionVariableSleep(&CheckpointerShmem->start_cv,
1051 : WAIT_EVENT_CHECKPOINT_START);
1052 : }
1053 902 : ConditionVariableCancelSleep();
1054 :
1055 : /*
1056 : * We are waiting for ckpt_done >= new_started, in a modulo sense.
1057 : */
1058 902 : ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv);
1059 : for (;;)
1060 786 : {
1061 : int new_done;
1062 :
1063 1688 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1064 1688 : new_done = CheckpointerShmem->ckpt_done;
1065 1688 : new_failed = CheckpointerShmem->ckpt_failed;
1066 1688 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1067 :
1068 1688 : if (new_done - new_started >= 0)
1069 902 : break;
1070 :
1071 786 : ConditionVariableSleep(&CheckpointerShmem->done_cv,
1072 : WAIT_EVENT_CHECKPOINT_DONE);
1073 : }
1074 902 : ConditionVariableCancelSleep();
1075 :
1076 902 : if (new_failed != old_failed)
1077 0 : ereport(ERROR,
1078 : (errmsg("checkpoint request failed"),
1079 : errhint("Consult recent messages in the server log for details.")));
1080 : }
1081 : }
1082 :
1083 : /*
1084 : * ForwardSyncRequest
1085 : * Forward a file-fsync request from a backend to the checkpointer
1086 : *
1087 : * Whenever a backend is compelled to write directly to a relation
1088 : * (which should be seldom, if the background writer is getting its job done),
1089 : * the backend calls this routine to pass over knowledge that the relation
1090 : * is dirty and must be fsync'd before next checkpoint. We also use this
1091 : * opportunity to count such writes for statistical purposes.
1092 : *
1093 : * To avoid holding the lock for longer than necessary, we normally write
1094 : * to the requests[] queue without checking for duplicates. The checkpointer
1095 : * will have to eliminate dups internally anyway. However, if we discover
1096 : * that the queue is full, we make a pass over the entire queue to compact
1097 : * it. This is somewhat expensive, but the alternative is for the backend
1098 : * to perform its own fsync, which is far more expensive in practice. It
1099 : * is theoretically possible a backend fsync might still be necessary, if
1100 : * the queue is full and contains no duplicate entries. In that case, we
1101 : * let the backend know by returning false.
1102 : */
1103 : bool
1104 2301562 : ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
1105 : {
1106 : CheckpointerRequest *request;
1107 : bool too_full;
1108 :
1109 2301562 : if (!IsUnderPostmaster)
1110 0 : return false; /* probably shouldn't even get here */
1111 :
1112 2301562 : if (AmCheckpointerProcess())
1113 0 : elog(ERROR, "ForwardSyncRequest must not be called in checkpointer");
1114 :
1115 2301562 : LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
1116 :
1117 : /*
1118 : * If the checkpointer isn't running or the request queue is full, the
1119 : * backend will have to perform its own fsync request. But before forcing
1120 : * that to happen, we can try to compact the request queue.
1121 : */
1122 2301562 : if (CheckpointerShmem->checkpointer_pid == 0 ||
1123 2301474 : (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests &&
1124 4334 : !CompactCheckpointerRequestQueue()))
1125 : {
1126 4052 : LWLockRelease(CheckpointerCommLock);
1127 4052 : return false;
1128 : }
1129 :
1130 : /* OK, insert request */
1131 2297510 : request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
1132 2297510 : request->ftag = *ftag;
1133 2297510 : request->type = type;
1134 :
1135 : /* If queue is more than half full, nudge the checkpointer to empty it */
1136 2297510 : too_full = (CheckpointerShmem->num_requests >=
1137 2297510 : CheckpointerShmem->max_requests / 2);
1138 :
1139 2297510 : LWLockRelease(CheckpointerCommLock);
1140 :
1141 : /* ... but not till after we release the lock */
1142 2297510 : if (too_full)
1143 : {
1144 46872 : volatile PROC_HDR *procglobal = ProcGlobal;
1145 46872 : ProcNumber checkpointerProc = procglobal->checkpointerProc;
1146 :
1147 46872 : if (checkpointerProc != INVALID_PROC_NUMBER)
1148 46872 : SetLatch(&GetPGProcByNumber(checkpointerProc)->procLatch);
1149 : }
1150 :
1151 2297510 : return true;
1152 : }
1153 :
1154 : /*
1155 : * CompactCheckpointerRequestQueue
1156 : * Remove duplicates from the request queue to avoid backend fsyncs.
1157 : * Returns "true" if any entries were removed.
1158 : *
1159 : * Although a full fsync request queue is not common, it can lead to severe
1160 : * performance problems when it does happen. So far, this situation has
1161 : * only been observed to occur when the system is under heavy write load,
1162 : * and especially during the "sync" phase of a checkpoint. Without this
1163 : * logic, each backend begins doing an fsync for every block written, which
1164 : * gets very expensive and can slow down the whole system.
1165 : *
1166 : * Trying to do this every time the queue is full could lose if there
1167 : * aren't any removable entries. But that should be vanishingly rare in
1168 : * practice: there's one queue entry per shared buffer.
1169 : */
1170 : static bool
1171 4334 : CompactCheckpointerRequestQueue(void)
1172 : {
1173 : struct CheckpointerSlotMapping
1174 : {
1175 : CheckpointerRequest request;
1176 : int slot;
1177 : };
1178 :
1179 : int n,
1180 : preserve_count;
1181 4334 : int num_skipped = 0;
1182 : HASHCTL ctl;
1183 : HTAB *htab;
1184 : bool *skip_slot;
1185 :
1186 : /* must hold CheckpointerCommLock in exclusive mode */
1187 : Assert(LWLockHeldByMe(CheckpointerCommLock));
1188 :
1189 : /* Avoid memory allocations in a critical section. */
1190 4334 : if (CritSectionCount > 0)
1191 0 : return false;
1192 :
1193 : /* Initialize skip_slot array */
1194 4334 : skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);
1195 :
1196 : /* Initialize temporary hash table */
1197 4334 : ctl.keysize = sizeof(CheckpointerRequest);
1198 4334 : ctl.entrysize = sizeof(struct CheckpointerSlotMapping);
1199 4334 : ctl.hcxt = CurrentMemoryContext;
1200 :
1201 4334 : htab = hash_create("CompactCheckpointerRequestQueue",
1202 4334 : CheckpointerShmem->num_requests,
1203 : &ctl,
1204 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1205 :
1206 : /*
1207 : * The basic idea here is that a request can be skipped if it's followed
1208 : * by a later, identical request. It might seem more sensible to work
1209 : * backwards from the end of the queue and check whether a request is
1210 : * *preceded* by an earlier, identical request, in the hopes of doing less
1211 : * copying. But that might change the semantics, if there's an
1212 : * intervening SYNC_FORGET_REQUEST or SYNC_FILTER_REQUEST, so we do it
1213 : * this way. It would be possible to be even smarter if we made the code
1214 : * below understand the specific semantics of such requests (it could blow
1215 : * away preceding entries that would end up being canceled anyhow), but
1216 : * it's not clear that the extra complexity would buy us anything.
1217 : */
1218 557518 : for (n = 0; n < CheckpointerShmem->num_requests; n++)
1219 : {
1220 : CheckpointerRequest *request;
1221 : struct CheckpointerSlotMapping *slotmap;
1222 : bool found;
1223 :
1224 : /*
1225 : * We use the request struct directly as a hashtable key. This
1226 : * assumes that any padding bytes in the structs are consistently the
1227 : * same, which should be okay because we zeroed them in
1228 : * CheckpointerShmemInit. Note also that RelFileLocator had better
1229 : * contain no pad bytes.
1230 : */
1231 553184 : request = &CheckpointerShmem->requests[n];
1232 553184 : slotmap = hash_search(htab, request, HASH_ENTER, &found);
1233 553184 : if (found)
1234 : {
1235 : /* Duplicate, so mark the previous occurrence as skippable */
1236 17068 : skip_slot[slotmap->slot] = true;
1237 17068 : num_skipped++;
1238 : }
1239 : /* Remember slot containing latest occurrence of this request value */
1240 553184 : slotmap->slot = n;
1241 : }
1242 :
1243 : /* Done with the hash table. */
1244 4334 : hash_destroy(htab);
1245 :
1246 : /* If no duplicates, we're out of luck. */
1247 4334 : if (!num_skipped)
1248 : {
1249 3964 : pfree(skip_slot);
1250 3964 : return false;
1251 : }
1252 :
1253 : /* We found some duplicates; remove them. */
1254 370 : preserve_count = 0;
1255 46162 : for (n = 0; n < CheckpointerShmem->num_requests; n++)
1256 : {
1257 45792 : if (skip_slot[n])
1258 17068 : continue;
1259 28724 : CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n];
1260 : }
1261 370 : ereport(DEBUG1,
1262 : (errmsg_internal("compacted fsync request queue from %d entries to %d entries",
1263 : CheckpointerShmem->num_requests, preserve_count)));
1264 370 : CheckpointerShmem->num_requests = preserve_count;
1265 :
1266 : /* Cleanup. */
1267 370 : pfree(skip_slot);
1268 370 : return true;
1269 : }
1270 :
1271 : /*
1272 : * AbsorbSyncRequests
1273 : * Retrieve queued sync requests and pass them to sync mechanism.
1274 : *
1275 : * This is exported because it must be called during CreateCheckPoint;
1276 : * we have to be sure we have accepted all pending requests just before
1277 : * we start fsync'ing. Since CreateCheckPoint sometimes runs in
1278 : * non-checkpointer processes, do nothing if not checkpointer.
1279 : */
1280 : void
1281 31084 : AbsorbSyncRequests(void)
1282 : {
1283 31084 : CheckpointerRequest *requests = NULL;
1284 : CheckpointerRequest *request;
1285 : int n;
1286 :
1287 31084 : if (!AmCheckpointerProcess())
1288 1120 : return;
1289 :
1290 29964 : LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
1291 :
1292 : /*
1293 : * We try to avoid holding the lock for a long time by copying the request
1294 : * array, and processing the requests after releasing the lock.
1295 : *
1296 : * Once we have cleared the requests from shared memory, we have to PANIC
1297 : * if we then fail to absorb them (eg, because our hashtable runs out of
1298 : * memory). This is because the system cannot run safely if we are unable
1299 : * to fsync what we have been told to fsync. Fortunately, the hashtable
1300 : * is so small that the problem is quite unlikely to arise in practice.
1301 : */
1302 29964 : n = CheckpointerShmem->num_requests;
1303 29964 : if (n > 0)
1304 : {
1305 16172 : requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest));
1306 16172 : memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest));
1307 : }
1308 :
1309 29964 : START_CRIT_SECTION();
1310 :
1311 29964 : CheckpointerShmem->num_requests = 0;
1312 :
1313 29964 : LWLockRelease(CheckpointerCommLock);
1314 :
1315 2138858 : for (request = requests; n > 0; request++, n--)
1316 2108894 : RememberSyncRequest(&request->ftag, request->type);
1317 :
1318 29964 : END_CRIT_SECTION();
1319 :
1320 29964 : if (requests)
1321 16172 : pfree(requests);
1322 : }
1323 :
1324 : /*
1325 : * Update any shared memory configurations based on config parameters
1326 : */
1327 : static void
1328 1002 : UpdateSharedMemoryConfig(void)
1329 : {
1330 : /* update global shmem state for sync rep */
1331 1002 : SyncRepUpdateSyncStandbysDefined();
1332 :
1333 : /*
1334 : * If full_page_writes has been changed by SIGHUP, we update it in shared
1335 : * memory and write an XLOG_FPW_CHANGE record.
1336 : */
1337 1002 : UpdateFullPageWrites();
1338 :
1339 1002 : elog(DEBUG2, "checkpointer updated shared memory configuration values");
1340 1002 : }
1341 :
1342 : /*
1343 : * FirstCallSinceLastCheckpoint allows a process to take an action once
1344 : * per checkpoint cycle by asynchronously checking for checkpoint completion.
1345 : */
1346 : bool
1347 17888 : FirstCallSinceLastCheckpoint(void)
1348 : {
1349 : static int ckpt_done = 0;
1350 : int new_done;
1351 17888 : bool FirstCall = false;
1352 :
1353 17888 : SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
1354 17888 : new_done = CheckpointerShmem->ckpt_done;
1355 17888 : SpinLockRelease(&CheckpointerShmem->ckpt_lck);
1356 :
1357 17888 : if (new_done != ckpt_done)
1358 964 : FirstCall = true;
1359 :
1360 17888 : ckpt_done = new_done;
1361 :
1362 17888 : return FirstCall;
1363 : }
|