Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * sync.c
4 : * File synchronization management code.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/sync/sync.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include <unistd.h>
18 : #include <fcntl.h>
19 : #include <sys/file.h>
20 :
21 : #include "access/clog.h"
22 : #include "access/commit_ts.h"
23 : #include "access/multixact.h"
24 : #include "access/xlog.h"
25 : #include "miscadmin.h"
26 : #include "pgstat.h"
27 : #include "portability/instr_time.h"
28 : #include "postmaster/bgwriter.h"
29 : #include "storage/fd.h"
30 : #include "storage/latch.h"
31 : #include "storage/md.h"
32 : #include "utils/hsearch.h"
33 : #include "utils/memutils.h"
34 :
35 : /*
36 : * In some contexts (currently, standalone backends and the checkpointer)
37 : * we keep track of pending fsync operations: we need to remember all relation
38 : * segments that have been written since the last checkpoint, so that we can
39 : * fsync them down to disk before completing the next checkpoint. This hash
40 : * table remembers the pending operations. We use a hash table mostly as
41 : * a convenient way of merging duplicate requests.
42 : *
43 : * We use a similar mechanism to remember no-longer-needed files that can
44 : * be deleted after the next checkpoint, but we use a linked list instead of
45 : * a hash table, because we don't expect there to be any duplicate requests.
46 : *
47 : * These mechanisms are only used for non-temp relations; we never fsync
48 : * temp rels, nor do we need to postpone their deletion (see comments in
49 : * mdunlink).
50 : *
51 : * (Regular backends do not track pending operations locally, but forward
52 : * them to the checkpointer.)
53 : */
54 : typedef uint16 CycleCtr; /* can be any convenient integer size */
55 :
56 : typedef struct
57 : {
58 : FileTag tag; /* identifies handler and file */
59 : CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */
60 : bool canceled; /* canceled is true if we canceled "recently" */
61 : } PendingFsyncEntry;
62 :
63 : typedef struct
64 : {
65 : FileTag tag; /* identifies handler and file */
66 : CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */
67 : bool canceled; /* true if request has been canceled */
68 : } PendingUnlinkEntry;
69 :
70 : static HTAB *pendingOps = NULL;
71 : static List *pendingUnlinks = NIL;
72 : static MemoryContext pendingOpsCxt; /* context for the above */
73 :
74 : static CycleCtr sync_cycle_ctr = 0;
75 : static CycleCtr checkpoint_cycle_ctr = 0;
76 :
77 : /* Intervals for calling AbsorbSyncRequests */
78 : #define FSYNCS_PER_ABSORB 10
79 : #define UNLINKS_PER_ABSORB 10
80 :
81 : /*
82 : * Function pointers for handling sync and unlink requests.
83 : */
84 : typedef struct SyncOps
85 : {
86 : int (*sync_syncfiletag) (const FileTag *ftag, char *path);
87 : int (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
88 : bool (*sync_filetagmatches) (const FileTag *ftag,
89 : const FileTag *candidate);
90 : } SyncOps;
91 :
92 : /*
93 : * These indexes must correspond to the values of the SyncRequestHandler enum.
94 : */
95 : static const SyncOps syncsw[] = {
96 : /* magnetic disk */
97 : [SYNC_HANDLER_MD] = {
98 : .sync_syncfiletag = mdsyncfiletag,
99 : .sync_unlinkfiletag = mdunlinkfiletag,
100 : .sync_filetagmatches = mdfiletagmatches
101 : },
102 : /* pg_xact */
103 : [SYNC_HANDLER_CLOG] = {
104 : .sync_syncfiletag = clogsyncfiletag
105 : },
106 : /* pg_commit_ts */
107 : [SYNC_HANDLER_COMMIT_TS] = {
108 : .sync_syncfiletag = committssyncfiletag
109 : },
110 : /* pg_multixact/offsets */
111 : [SYNC_HANDLER_MULTIXACT_OFFSET] = {
112 : .sync_syncfiletag = multixactoffsetssyncfiletag
113 : },
114 : /* pg_multixact/members */
115 : [SYNC_HANDLER_MULTIXACT_MEMBER] = {
116 : .sync_syncfiletag = multixactmemberssyncfiletag
117 : }
118 : };
119 :
120 : /*
121 : * Initialize data structures for the file sync tracking.
122 : */
123 : void
124 34702 : InitSync(void)
125 : {
126 : /*
127 : * Create pending-operations hashtable if we need it. Currently, we need
128 : * it if we are standalone (not under a postmaster) or if we are a
129 : * checkpointer auxiliary process.
130 : */
131 34702 : if (!IsUnderPostmaster || AmCheckpointerProcess())
132 : {
133 : HASHCTL hash_ctl;
134 :
135 : /*
136 : * XXX: The checkpointer needs to add entries to the pending ops table
137 : * when absorbing fsync requests. That is done within a critical
138 : * section, which isn't usually allowed, but we make an exception. It
139 : * means that there's a theoretical possibility that you run out of
140 : * memory while absorbing fsync requests, which leads to a PANIC.
141 : * Fortunately the hash table is small so that's unlikely to happen in
142 : * practice.
143 : */
144 1098 : pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
145 : "Pending ops context",
146 : ALLOCSET_DEFAULT_SIZES);
147 1098 : MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
148 :
149 1098 : hash_ctl.keysize = sizeof(FileTag);
150 1098 : hash_ctl.entrysize = sizeof(PendingFsyncEntry);
151 1098 : hash_ctl.hcxt = pendingOpsCxt;
152 1098 : pendingOps = hash_create("Pending Ops Table",
153 : 100L,
154 : &hash_ctl,
155 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
156 1098 : pendingUnlinks = NIL;
157 : }
158 34702 : }
159 :
160 : /*
161 : * SyncPreCheckpoint() -- Do pre-checkpoint work
162 : *
163 : * To distinguish unlink requests that arrived before this checkpoint
164 : * started from those that arrived during the checkpoint, we use a cycle
165 : * counter similar to the one we use for fsync requests. That cycle
166 : * counter is incremented here.
167 : *
168 : * This must be called *before* the checkpoint REDO point is determined.
169 : * That ensures that we won't delete files too soon. Since this calls
170 : * AbsorbSyncRequests(), which performs memory allocations, it cannot be
171 : * called within a critical section.
172 : *
173 : * Note that we can't do anything here that depends on the assumption
174 : * that the checkpoint will be completed.
175 : */
176 : void
177 2112 : SyncPreCheckpoint(void)
178 : {
179 : /*
180 : * Operations such as DROP TABLESPACE assume that the next checkpoint will
181 : * process all recently forwarded unlink requests, but if they aren't
182 : * absorbed prior to advancing the cycle counter, they won't be processed
183 : * until a future checkpoint. The following absorb ensures that any
184 : * unlink requests forwarded before the checkpoint began will be processed
185 : * in the current checkpoint.
186 : */
187 2112 : AbsorbSyncRequests();
188 :
189 : /*
190 : * Any unlink requests arriving after this point will be assigned the next
191 : * cycle counter, and won't be unlinked until next checkpoint.
192 : */
193 2112 : checkpoint_cycle_ctr++;
194 2112 : }
195 :
196 : /*
197 : * SyncPostCheckpoint() -- Do post-checkpoint work
198 : *
199 : * Remove any lingering files that can now be safely removed.
200 : */
201 : void
202 2112 : SyncPostCheckpoint(void)
203 : {
204 : int absorb_counter;
205 : ListCell *lc;
206 :
207 2112 : absorb_counter = UNLINKS_PER_ABSORB;
208 65368 : foreach(lc, pendingUnlinks)
209 : {
210 63512 : PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc);
211 : char path[MAXPGPATH];
212 :
213 : /* Skip over any canceled entries */
214 63512 : if (entry->canceled)
215 2 : continue;
216 :
217 : /*
218 : * New entries are appended to the end, so if the entry is new we've
219 : * reached the end of old entries.
220 : *
221 : * Note: if just the right number of consecutive checkpoints fail, we
222 : * could be fooled here by cycle_ctr wraparound. However, the only
223 : * consequence is that we'd delay unlinking for one more checkpoint,
224 : * which is perfectly tolerable.
225 : */
226 63510 : if (entry->cycle_ctr == checkpoint_cycle_ctr)
227 256 : break;
228 :
229 : /* Unlink the file */
230 63254 : if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
231 : path) < 0)
232 : {
233 : /*
234 : * There's a race condition, when the database is dropped at the
235 : * same time that we process the pending unlink requests. If the
236 : * DROP DATABASE deletes the file before we do, we will get ENOENT
237 : * here. rmtree() also has to ignore ENOENT errors, to deal with
238 : * the possibility that we delete the file first.
239 : */
240 6 : if (errno != ENOENT)
241 0 : ereport(WARNING,
242 : (errcode_for_file_access(),
243 : errmsg("could not remove file \"%s\": %m", path)));
244 : }
245 :
246 : /* Mark the list entry as canceled, just in case */
247 63254 : entry->canceled = true;
248 :
249 : /*
250 : * As in ProcessSyncRequests, we don't want to stop absorbing fsync
251 : * requests for a long time when there are many deletions to be done.
252 : * We can safely call AbsorbSyncRequests() at this point in the loop.
253 : */
254 63254 : if (--absorb_counter <= 0)
255 : {
256 6108 : AbsorbSyncRequests();
257 6108 : absorb_counter = UNLINKS_PER_ABSORB;
258 : }
259 : }
260 :
261 : /*
262 : * If we reached the end of the list, we can just remove the whole list
263 : * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise,
264 : * we must keep the entries at or after "lc".
265 : */
266 2112 : if (lc == NULL)
267 : {
268 1856 : list_free_deep(pendingUnlinks);
269 1856 : pendingUnlinks = NIL;
270 : }
271 : else
272 : {
273 256 : int ntodelete = list_cell_number(pendingUnlinks, lc);
274 :
275 38304 : for (int i = 0; i < ntodelete; i++)
276 38048 : pfree(list_nth(pendingUnlinks, i));
277 :
278 256 : pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete);
279 : }
280 2112 : }
281 :
282 : /*
283 : * ProcessSyncRequests() -- Process queued fsync requests.
284 : */
285 : void
286 2476 : ProcessSyncRequests(void)
287 : {
288 : static bool sync_in_progress = false;
289 :
290 : HASH_SEQ_STATUS hstat;
291 : PendingFsyncEntry *entry;
292 : int absorb_counter;
293 :
294 : /* Statistics on sync times */
295 2476 : int processed = 0;
296 : instr_time sync_start,
297 : sync_end,
298 : sync_diff;
299 : uint64 elapsed;
300 2476 : uint64 longest = 0;
301 2476 : uint64 total_elapsed = 0;
302 :
303 : /*
304 : * This is only called during checkpoints, and checkpoints should only
305 : * occur in processes that have created a pendingOps.
306 : */
307 2476 : if (!pendingOps)
308 0 : elog(ERROR, "cannot sync without a pendingOps table");
309 :
310 : /*
311 : * If we are in the checkpointer, the sync had better include all fsync
312 : * requests that were queued by backends up to this point. The tightest
313 : * race condition that could occur is that a buffer that must be written
314 : * and fsync'd for the checkpoint could have been dumped by a backend just
315 : * before it was visited by BufferSync(). We know the backend will have
316 : * queued an fsync request before clearing the buffer's dirtybit, so we
317 : * are safe as long as we do an Absorb after completing BufferSync().
318 : */
319 2476 : AbsorbSyncRequests();
320 :
321 : /*
322 : * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
323 : * checkpoint), we want to ignore fsync requests that are entered into the
324 : * hashtable after this point --- they should be processed next time,
325 : * instead. We use sync_cycle_ctr to tell old entries apart from new
326 : * ones: new ones will have cycle_ctr equal to the incremented value of
327 : * sync_cycle_ctr.
328 : *
329 : * In normal circumstances, all entries present in the table at this point
330 : * will have cycle_ctr exactly equal to the current (about to be old)
331 : * value of sync_cycle_ctr. However, if we fail partway through the
332 : * fsync'ing loop, then older values of cycle_ctr might remain when we
333 : * come back here to try again. Repeated checkpoint failures would
334 : * eventually wrap the counter around to the point where an old entry
335 : * might appear new, causing us to skip it, possibly allowing a checkpoint
336 : * to succeed that should not have. To forestall wraparound, any time the
337 : * previous ProcessSyncRequests() failed to complete, run through the
338 : * table and forcibly set cycle_ctr = sync_cycle_ctr.
339 : *
340 : * Think not to merge this loop with the main loop, as the problem is
341 : * exactly that that loop may fail before having visited all the entries.
342 : * From a performance point of view it doesn't matter anyway, as this path
343 : * will never be taken in a system that's functioning normally.
344 : */
345 2476 : if (sync_in_progress)
346 : {
347 : /* prior try failed, so update any stale cycle_ctr values */
348 0 : hash_seq_init(&hstat, pendingOps);
349 0 : while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
350 : {
351 0 : entry->cycle_ctr = sync_cycle_ctr;
352 : }
353 : }
354 :
355 : /* Advance counter so that new hashtable entries are distinguishable */
356 2476 : sync_cycle_ctr++;
357 :
358 : /* Set flag to detect failure if we don't reach the end of the loop */
359 2476 : sync_in_progress = true;
360 :
361 : /* Now scan the hashtable for fsync requests to process */
362 2476 : absorb_counter = FSYNCS_PER_ABSORB;
363 2476 : hash_seq_init(&hstat, pendingOps);
364 314460 : while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
365 : {
366 : int failures;
367 :
368 : /*
369 : * If the entry is new then don't process it this time; it is new.
370 : * Note "continue" bypasses the hash-remove call at the bottom of the
371 : * loop.
372 : */
373 311984 : if (entry->cycle_ctr == sync_cycle_ctr)
374 0 : continue;
375 :
376 : /* Else assert we haven't missed it */
377 : Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
378 :
379 : /*
380 : * If fsync is off then we don't have to bother opening the file at
381 : * all. (We delay checking until this point so that changing fsync on
382 : * the fly behaves sensibly.)
383 : */
384 311984 : if (enableFsync)
385 : {
386 : /*
387 : * If in checkpointer, we want to absorb pending requests every so
388 : * often to prevent overflow of the fsync request queue. It is
389 : * unspecified whether newly-added entries will be visited by
390 : * hash_seq_search, but we don't care since we don't need to
391 : * process them anyway.
392 : */
393 0 : if (--absorb_counter <= 0)
394 : {
395 0 : AbsorbSyncRequests();
396 0 : absorb_counter = FSYNCS_PER_ABSORB;
397 : }
398 :
399 : /*
400 : * The fsync table could contain requests to fsync segments that
401 : * have been deleted (unlinked) by the time we get to them. Rather
402 : * than just hoping an ENOENT (or EACCES on Windows) error can be
403 : * ignored, what we do on error is absorb pending requests and
404 : * then retry. Since mdunlink() queues a "cancel" message before
405 : * actually unlinking, the fsync request is guaranteed to be
406 : * marked canceled after the absorb if it really was this case.
407 : * DROP DATABASE likewise has to tell us to forget fsync requests
408 : * before it starts deletions.
409 : */
410 0 : for (failures = 0; !entry->canceled; failures++)
411 : {
412 : char path[MAXPGPATH];
413 :
414 0 : INSTR_TIME_SET_CURRENT(sync_start);
415 0 : if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
416 : path) == 0)
417 : {
418 : /* Success; update statistics about sync timing */
419 0 : INSTR_TIME_SET_CURRENT(sync_end);
420 0 : sync_diff = sync_end;
421 0 : INSTR_TIME_SUBTRACT(sync_diff, sync_start);
422 0 : elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
423 0 : if (elapsed > longest)
424 0 : longest = elapsed;
425 0 : total_elapsed += elapsed;
426 0 : processed++;
427 :
428 0 : if (log_checkpoints)
429 0 : elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
430 : processed,
431 : path,
432 : (double) elapsed / 1000);
433 :
434 0 : break; /* out of retry loop */
435 : }
436 :
437 : /*
438 : * It is possible that the relation has been dropped or
439 : * truncated since the fsync request was entered. Therefore,
440 : * allow ENOENT, but only if we didn't fail already on this
441 : * file.
442 : */
443 0 : if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
444 0 : ereport(data_sync_elevel(ERROR),
445 : (errcode_for_file_access(),
446 : errmsg("could not fsync file \"%s\": %m",
447 : path)));
448 : else
449 0 : ereport(DEBUG1,
450 : (errcode_for_file_access(),
451 : errmsg_internal("could not fsync file \"%s\" but retrying: %m",
452 : path)));
453 :
454 : /*
455 : * Absorb incoming requests and check to see if a cancel
456 : * arrived for this relation fork.
457 : */
458 0 : AbsorbSyncRequests();
459 0 : absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
460 : } /* end retry loop */
461 : }
462 :
463 : /* We are done with this entry, remove it */
464 311984 : if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
465 0 : elog(ERROR, "pendingOps corrupted");
466 : } /* end loop over hashtable entries */
467 :
468 : /* Return sync performance metrics for report at checkpoint end */
469 2476 : CheckpointStats.ckpt_sync_rels = processed;
470 2476 : CheckpointStats.ckpt_longest_sync = longest;
471 2476 : CheckpointStats.ckpt_agg_sync_time = total_elapsed;
472 :
473 : /* Flag successful completion of ProcessSyncRequests */
474 2476 : sync_in_progress = false;
475 2476 : }
476 :
477 : /*
478 : * RememberSyncRequest() -- callback from checkpointer side of sync request
479 : *
480 : * We stuff fsync requests into the local hash table for execution
481 : * during the checkpointer's next checkpoint. UNLINK requests go into a
482 : * separate linked list, however, because they get processed separately.
483 : *
484 : * See sync.h for more information on the types of sync requests supported.
485 : */
486 : void
487 2696842 : RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
488 : {
489 : Assert(pendingOps);
490 :
491 2696842 : if (type == SYNC_FORGET_REQUEST)
492 : {
493 : PendingFsyncEntry *entry;
494 :
495 : /* Cancel previously entered request */
496 274226 : entry = (PendingFsyncEntry *) hash_search(pendingOps,
497 : ftag,
498 : HASH_FIND,
499 : NULL);
500 274226 : if (entry != NULL)
501 40936 : entry->canceled = true;
502 : }
503 2422616 : else if (type == SYNC_FILTER_REQUEST)
504 : {
505 : HASH_SEQ_STATUS hstat;
506 : PendingFsyncEntry *pfe;
507 : ListCell *cell;
508 :
509 : /* Cancel matching fsync requests */
510 50 : hash_seq_init(&hstat, pendingOps);
511 12366 : while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
512 : {
513 24624 : if (pfe->tag.handler == ftag->handler &&
514 12308 : syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag))
515 7586 : pfe->canceled = true;
516 : }
517 :
518 : /* Cancel matching unlink requests */
519 158 : foreach(cell, pendingUnlinks)
520 : {
521 108 : PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell);
522 :
523 216 : if (pue->tag.handler == ftag->handler &&
524 108 : syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag))
525 2 : pue->canceled = true;
526 : }
527 : }
528 2422566 : else if (type == SYNC_UNLINK_REQUEST)
529 : {
530 : /* Unlink request: put it in the linked list */
531 63256 : MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
532 : PendingUnlinkEntry *entry;
533 :
534 63256 : entry = palloc(sizeof(PendingUnlinkEntry));
535 63256 : entry->tag = *ftag;
536 63256 : entry->cycle_ctr = checkpoint_cycle_ctr;
537 63256 : entry->canceled = false;
538 :
539 63256 : pendingUnlinks = lappend(pendingUnlinks, entry);
540 :
541 63256 : MemoryContextSwitchTo(oldcxt);
542 : }
543 : else
544 : {
545 : /* Normal case: enter a request to fsync this segment */
546 2359310 : MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
547 : PendingFsyncEntry *entry;
548 : bool found;
549 :
550 : Assert(type == SYNC_REQUEST);
551 :
552 2359310 : entry = (PendingFsyncEntry *) hash_search(pendingOps,
553 : ftag,
554 : HASH_ENTER,
555 : &found);
556 : /* if new entry, or was previously canceled, initialize it */
557 2359310 : if (!found || entry->canceled)
558 : {
559 324890 : entry->cycle_ctr = sync_cycle_ctr;
560 324890 : entry->canceled = false;
561 : }
562 :
563 : /*
564 : * NB: it's intentional that we don't change cycle_ctr if the entry
565 : * already exists. The cycle_ctr must represent the oldest fsync
566 : * request that could be in the entry.
567 : */
568 :
569 2359310 : MemoryContextSwitchTo(oldcxt);
570 : }
571 2696842 : }
572 :
573 : /*
574 : * Register the sync request locally, or forward it to the checkpointer.
575 : *
576 : * If retryOnError is true, we'll keep trying if there is no space in the
577 : * queue. Return true if we succeeded, or false if there wasn't space.
578 : */
579 : bool
580 2889138 : RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
581 : bool retryOnError)
582 : {
583 : bool ret;
584 :
585 2889138 : if (pendingOps != NULL)
586 : {
587 : /* standalone backend or startup process: fsync state is local */
588 587948 : RememberSyncRequest(ftag, type);
589 587948 : return true;
590 : }
591 :
592 : for (;;)
593 : {
594 : /*
595 : * Notify the checkpointer about it. If we fail to queue a message in
596 : * retryOnError mode, we have to sleep and try again ... ugly, but
597 : * hopefully won't happen often.
598 : *
599 : * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
600 : * error in the case of SYNC_UNLINK_REQUEST would leave the
601 : * no-longer-used file still present on disk, which would be bad, so
602 : * I'm inclined to assume that the checkpointer will always empty the
603 : * queue soon.
604 : */
605 2301614 : ret = ForwardSyncRequest(ftag, type);
606 :
607 : /*
608 : * If we are successful in queueing the request, or we failed and were
609 : * instructed not to retry on error, break.
610 : */
611 2301614 : if (ret || (!ret && !retryOnError))
612 : break;
613 :
614 424 : WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
615 : WAIT_EVENT_REGISTER_SYNC_REQUEST);
616 : }
617 :
618 2301190 : return ret;
619 : }
|