Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * storage.c
4 : * code to create and destroy physical storage for relations
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/catalog/storage.c
12 : *
13 : * NOTES
14 : * Some of this code used to be in storage/smgr/smgr.c, and the
15 : * function names still reflect that.
16 : *
17 : *-------------------------------------------------------------------------
18 : */
19 :
20 : #include "postgres.h"
21 :
22 : #include "access/visibilitymap.h"
23 : #include "access/xact.h"
24 : #include "access/xlog.h"
25 : #include "access/xloginsert.h"
26 : #include "access/xlogutils.h"
27 : #include "catalog/storage.h"
28 : #include "catalog/storage_xlog.h"
29 : #include "miscadmin.h"
30 : #include "pgstat.h"
31 : #include "storage/bulk_write.h"
32 : #include "storage/freespace.h"
33 : #include "storage/proc.h"
34 : #include "storage/smgr.h"
35 : #include "utils/hsearch.h"
36 : #include "utils/memutils.h"
37 : #include "utils/rel.h"
38 :
39 : /* GUC variables */
40 : int wal_skip_threshold = 2048; /* in kilobytes; size cutoff consulted by smgrDoPendingSyncs() */
41 :
42 : /*
43 : * We keep a list of all relations (represented as RelFileLocator values)
44 : * that have been created or deleted in the current transaction. When
45 : * a relation is created, we create the physical file immediately, but
46 : * remember it so that we can delete the file again if the current
47 : * transaction is aborted. Conversely, a deletion request is NOT
48 : * executed immediately, but is just entered in the list. When and if
49 : * the transaction commits, we can delete the physical file.
50 : *
51 : * To handle subtransactions, every entry is marked with its transaction
52 : * nesting level. At subtransaction commit, we reassign the subtransaction's
53 : * entries to the parent nesting level. At subtransaction abort, we can
54 : * immediately execute the abort-time actions for all entries of the current
55 : * nesting level.
56 : *
57 : * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
58 : * unbetimes. It'd probably be OK to keep it in TopTransactionContext,
59 : * but I'm being paranoid.
60 : */
61 :
62 : typedef struct PendingRelDelete
63 : {
64 : RelFileLocator rlocator; /* relation that may need to be deleted */
65 : ProcNumber procNumber; /* INVALID_PROC_NUMBER if not a temp rel */
66 : bool atCommit; /* T=delete at commit; F=delete at abort */
67 : int nestLevel; /* xact nesting level of request */
68 : struct PendingRelDelete *next; /* linked-list link */
69 : } PendingRelDelete;
70 :
71 : typedef struct PendingRelSync
72 : {
73 : RelFileLocator rlocator;
74 : bool is_truncated; /* Has the file experienced truncation? */
75 : } PendingRelSync;
76 :
77 : static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
78 : static HTAB *pendingSyncHash = NULL; /* PendingRelSync entries keyed by RelFileLocator; NULL until AddPendingSync() first creates it */
79 :
80 :
81 : /*
82 : * AddPendingSync
83 : * Queue an at-commit fsync.
84 : */
85 : static void
86 76690 : AddPendingSync(const RelFileLocator *rlocator)
87 : {
88 : PendingRelSync *pending;
89 : bool found;
90 :
91 : /* create the hash if not yet */
92 76690 : if (!pendingSyncHash)
93 : {
94 : HASHCTL ctl;
95 :
96 12490 : ctl.keysize = sizeof(RelFileLocator);
97 12490 : ctl.entrysize = sizeof(PendingRelSync);
98 12490 : ctl.hcxt = TopTransactionContext;
99 12490 : pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
100 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
101 : }
102 :
103 76690 : pending = hash_search(pendingSyncHash, rlocator, HASH_ENTER, &found);
104 : Assert(!found);
105 76690 : pending->is_truncated = false;
106 76690 : }
107 :
108 : /*
109 : * RelationCreateStorage
110 : * Create physical storage for a relation.
111 : *
112 : * Create the underlying disk file storage for the relation. This only
113 : * creates the main fork; additional forks are created lazily by the
114 : * modules that need them.
115 : *
116 : * This function is transactional. The creation is WAL-logged, and if the
117 : * transaction aborts later on, the storage will be destroyed. A caller
118 : * that does not want the storage to be destroyed in case of an abort may
119 : * pass register_delete = false.
120 : */
121 : SMgrRelation
122 219504 : RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
123 : bool register_delete)
124 : {
125 : SMgrRelation srel;
126 : ProcNumber procNumber;
127 : bool needs_wal;
128 :
129 : Assert(!IsInParallelMode()); /* couldn't update pendingSyncHash */
130 :
131 219504 : switch (relpersistence)
132 : {
133 6128 : case RELPERSISTENCE_TEMP:
134 6128 : procNumber = ProcNumberForTempRelations();
135 6128 : needs_wal = false;
136 6128 : break;
137 536 : case RELPERSISTENCE_UNLOGGED:
138 536 : procNumber = INVALID_PROC_NUMBER;
139 536 : needs_wal = false;
140 536 : break;
141 212840 : case RELPERSISTENCE_PERMANENT:
142 212840 : procNumber = INVALID_PROC_NUMBER;
143 212840 : needs_wal = true;
144 212840 : break;
145 0 : default:
146 0 : elog(ERROR, "invalid relpersistence: %c", relpersistence);
147 : return NULL; /* placate compiler */
148 : }
149 :
150 219504 : srel = smgropen(rlocator, procNumber);
151 219504 : smgrcreate(srel, MAIN_FORKNUM, false);
152 :
153 219504 : if (needs_wal)
154 212840 : log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM);
155 :
156 : /*
157 : * Add the relation to the list of stuff to delete at abort, if we are
158 : * asked to do so.
159 : */
160 219504 : if (register_delete)
161 : {
162 : PendingRelDelete *pending;
163 :
164 : pending = (PendingRelDelete *)
165 115984 : MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
166 115984 : pending->rlocator = rlocator;
167 115984 : pending->procNumber = procNumber;
168 115984 : pending->atCommit = false; /* delete if abort */
169 115984 : pending->nestLevel = GetCurrentTransactionNestLevel();
170 115984 : pending->next = pendingDeletes;
171 115984 : pendingDeletes = pending;
172 : }
173 :
174 219504 : if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
175 : {
176 : Assert(procNumber == INVALID_PROC_NUMBER);
177 73094 : AddPendingSync(&rlocator);
178 : }
179 :
180 219504 : return srel;
181 : }
182 :
183 : /*
184 : * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
185 : */
186 : void
187 247580 : log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
188 : {
189 : xl_smgr_create xlrec;
190 :
191 : /*
192 : * Make an XLOG entry reporting the file creation.
193 : */
194 247580 : xlrec.rlocator = *rlocator;
195 247580 : xlrec.forkNum = forkNum;
196 :
197 247580 : XLogBeginInsert();
198 247580 : XLogRegisterData(&xlrec, sizeof(xlrec));
199 247580 : XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
200 247580 : }
201 :
202 : /*
203 : * RelationDropStorage
204 : * Schedule unlinking of physical storage at transaction commit.
205 : */
206 : void
207 71736 : RelationDropStorage(Relation rel)
208 : {
209 : PendingRelDelete *pending;
210 :
211 : /* Add the relation to the list of stuff to delete at commit */
212 : pending = (PendingRelDelete *)
213 71736 : MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
214 71736 : pending->rlocator = rel->rd_locator;
215 71736 : pending->procNumber = rel->rd_backend;
216 71736 : pending->atCommit = true; /* delete if commit */
217 71736 : pending->nestLevel = GetCurrentTransactionNestLevel();
218 71736 : pending->next = pendingDeletes;
219 71736 : pendingDeletes = pending;
220 :
221 : /*
222 : * NOTE: if the relation was created in this transaction, it will now be
223 : * present in the pending-delete list twice, once with atCommit true and
224 : * once with atCommit false. Hence, it will be physically deleted at end
225 : * of xact in either case (and the other entry will be ignored by
226 : * smgrDoPendingDeletes, so no error will occur). We could instead remove
227 : * the existing list entry and delete the physical file immediately, but
228 : * for now I'll keep the logic simple.
229 : */
230 :
231 71736 : RelationCloseSmgr(rel);
232 71736 : }
233 :
234 : /*
235 : * RelationPreserveStorage
236 : * Mark a relation as not to be deleted after all.
237 : *
238 : * We need this function because relation mapping changes are committed
239 : * separately from commit of the whole transaction, so it's still possible
240 : * for the transaction to abort after the mapping update is done.
241 : * When a new physical relation is installed in the map, it would be
242 : * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
243 : * The relation mapper fixes this by telling us to not delete such relations
244 : * after all as part of its commit.
245 : *
246 : * We also use this to reuse an old build of an index during ALTER TABLE, this
247 : * time removing the delete-at-commit entry.
248 : *
249 : * No-op if the relation is not among those scheduled for deletion.
250 : */
251 : void
252 12624 : RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
253 : {
254 : PendingRelDelete *pending;
255 : PendingRelDelete *prev;
256 : PendingRelDelete *next;
257 :
258 12624 : prev = NULL;
259 75064 : for (pending = pendingDeletes; pending != NULL; pending = next)
260 : {
261 62440 : next = pending->next;
262 62440 : if (RelFileLocatorEquals(rlocator, pending->rlocator)
263 1146 : && pending->atCommit == atCommit)
264 : {
265 : /* unlink and delete list entry */
266 1140 : if (prev)
267 840 : prev->next = next;
268 : else
269 300 : pendingDeletes = next;
270 1140 : pfree(pending);
271 : /* prev does not change */
272 : }
273 : else
274 : {
275 : /* unrelated entry, don't touch it */
276 61300 : prev = pending;
277 : }
278 : }
279 12624 : }
280 :
281 : /*
282 : * RelationTruncate
283 : * Physically truncate a relation to the specified number of blocks.
284 : *
285 : * This includes getting rid of any buffers for the blocks that are to be
286 : * dropped.
287 : */
288 : void
289 1082 : RelationTruncate(Relation rel, BlockNumber nblocks)
290 : {
291 : bool fsm;
292 : bool vm;
293 1082 : bool need_fsm_vacuum = false;
294 : ForkNumber forks[MAX_FORKNUM];
295 : BlockNumber old_blocks[MAX_FORKNUM];
296 : BlockNumber blocks[MAX_FORKNUM];
297 1082 : int nforks = 0;
298 : SMgrRelation reln;
299 :
300 : /*
301 : * Make sure smgr_targblock etc aren't pointing somewhere past new end.
302 : * (Note: don't rely on this reln pointer below this loop.)
303 : */
304 1082 : reln = RelationGetSmgr(rel);
305 1082 : reln->smgr_targblock = InvalidBlockNumber;
306 5410 : for (int i = 0; i <= MAX_FORKNUM; ++i)
307 4328 : reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
308 :
309 : /* Prepare for truncation of MAIN fork of the relation */
310 1082 : forks[nforks] = MAIN_FORKNUM;
311 1082 : old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
312 1082 : blocks[nforks] = nblocks;
313 1082 : nforks++;
314 :
315 : /* Prepare for truncation of the FSM if it exists */
316 1082 : fsm = smgrexists(RelationGetSmgr(rel), FSM_FORKNUM);
317 1082 : if (fsm)
318 : {
319 302 : blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
320 302 : if (BlockNumberIsValid(blocks[nforks]))
321 : {
322 302 : forks[nforks] = FSM_FORKNUM;
323 302 : old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
324 302 : nforks++;
325 302 : need_fsm_vacuum = true;
326 : }
327 : }
328 :
329 : /* Prepare for truncation of the visibility map too if it exists */
330 1082 : vm = smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM);
331 1082 : if (vm)
332 : {
333 302 : blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
334 302 : if (BlockNumberIsValid(blocks[nforks]))
335 : {
336 118 : forks[nforks] = VISIBILITYMAP_FORKNUM;
337 118 : old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
338 118 : nforks++;
339 : }
340 : }
341 :
342 1082 : RelationPreTruncate(rel);
343 :
344 : /*
345 : * The code which follows can interact with concurrent checkpoints in two
346 : * separate ways.
347 : *
348 : * First, the truncation operation might drop buffers that the checkpoint
349 : * otherwise would have flushed. If it does, then it's essential that the
350 : * files actually get truncated on disk before the checkpoint record is
351 : * written. Otherwise, if reply begins from that checkpoint, the
352 : * to-be-truncated blocks might still exist on disk but have older
353 : * contents than expected, which can cause replay to fail. It's OK for the
354 : * blocks to not exist on disk at all, but not for them to have the wrong
355 : * contents. For this reason, we need to set DELAY_CHKPT_COMPLETE while
356 : * this code executes.
357 : *
358 : * Second, the call to smgrtruncate() below will in turn call
359 : * RegisterSyncRequest(). We need the sync request created by that call to
360 : * be processed before the checkpoint completes. CheckPointGuts() will
361 : * call ProcessSyncRequests(), but if we register our sync request after
362 : * that happens, then the WAL record for the truncation could end up
363 : * preceding the checkpoint record, while the actual sync doesn't happen
364 : * until the next checkpoint. To prevent that, we need to set
365 : * DELAY_CHKPT_START here. That way, if the XLOG_SMGR_TRUNCATE precedes
366 : * the redo pointer of a concurrent checkpoint, we're guaranteed that the
367 : * corresponding sync request will be processed before the checkpoint
368 : * completes.
369 : */
370 : Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
371 1082 : MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
372 :
373 : /*
374 : * We WAL-log the truncation first and then truncate in a critical
375 : * section. Truncation drops buffers, even if dirty, and then truncates
376 : * disk files. All of that work needs to complete before the lock is
377 : * released, or else old versions of pages on disk that are missing recent
378 : * changes would become accessible again. We'll try the whole operation
379 : * again in crash recovery if we panic, but even then we can't give up
380 : * because we don't want standbys' relation sizes to diverge and break
381 : * replay or visibility invariants downstream. The critical section also
382 : * suppresses interrupts.
383 : *
384 : * (See also visibilitymap.c if changing this code.)
385 : */
386 1082 : START_CRIT_SECTION();
387 :
388 1082 : if (RelationNeedsWAL(rel))
389 : {
390 : /*
391 : * Make an XLOG entry reporting the file truncation.
392 : */
393 : XLogRecPtr lsn;
394 : xl_smgr_truncate xlrec;
395 :
396 412 : xlrec.blkno = nblocks;
397 412 : xlrec.rlocator = rel->rd_locator;
398 412 : xlrec.flags = SMGR_TRUNCATE_ALL;
399 :
400 412 : XLogBeginInsert();
401 412 : XLogRegisterData(&xlrec, sizeof(xlrec));
402 :
403 412 : lsn = XLogInsert(RM_SMGR_ID,
404 : XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
405 :
406 : /*
407 : * Flush, because otherwise the truncation of the main relation might
408 : * hit the disk before the WAL record, and the truncation of the FSM
409 : * or visibility map. If we crashed during that window, we'd be left
410 : * with a truncated heap, but the FSM or visibility map would still
411 : * contain entries for the non-existent heap pages, and standbys would
412 : * also never replay the truncation.
413 : */
414 412 : XLogFlush(lsn);
415 : }
416 :
417 : /*
418 : * This will first remove any buffers from the buffer pool that should no
419 : * longer exist after truncation is complete, and then truncate the
420 : * corresponding files on disk.
421 : */
422 1082 : smgrtruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks);
423 :
424 1082 : END_CRIT_SECTION();
425 :
426 : /* We've done all the critical work, so checkpoints are OK now. */
427 1082 : MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
428 :
429 : /*
430 : * Update upper-level FSM pages to account for the truncation. This is
431 : * important because the just-truncated pages were likely marked as
432 : * all-free, and would be preferentially selected.
433 : *
434 : * NB: There's no point in delaying checkpoints until this is done.
435 : * Because the FSM is not WAL-logged, we have to be prepared for the
436 : * possibility of corruption after a crash anyway.
437 : */
438 1082 : if (need_fsm_vacuum)
439 302 : FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
440 1082 : }
441 :
442 : /*
443 : * RelationPreTruncate
444 : * Perform AM-independent work before a physical truncation.
445 : *
446 : * If an access method's relation_nontransactional_truncate does not call
447 : * RelationTruncate(), it must call this before decreasing the table size.
448 : */
449 : void
450 1082 : RelationPreTruncate(Relation rel)
451 : {
452 : PendingRelSync *pending;
453 :
454 1082 : if (!pendingSyncHash)
455 1076 : return;
456 :
457 6 : pending = hash_search(pendingSyncHash,
458 6 : &(RelationGetSmgr(rel)->smgr_rlocator.locator),
459 : HASH_FIND, NULL);
460 6 : if (pending)
461 6 : pending->is_truncated = true;
462 : }
463 :
464 : /*
465 : * Copy a fork's data, block by block.
466 : *
467 : * Note that this requires that there is no dirty data in shared buffers. If
468 : * it's possible that there are, callers need to flush those using
469 : * e.g. FlushRelationBuffers(rel).
470 : *
471 : * Also note that this is frequently called via locutions such as
472 : * RelationCopyStorage(RelationGetSmgr(rel), ...);
473 : * That's safe only because we perform only smgr and WAL operations here.
474 : * If we invoked anything else, a relcache flush could cause our SMgrRelation
475 : * argument to become a dangling pointer.
476 : */
477 : void
478 178 : RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
479 : ForkNumber forkNum, char relpersistence)
480 : {
481 : bool use_wal;
482 : bool copying_initfork;
483 : BlockNumber nblocks;
484 : BlockNumber blkno;
485 : BulkWriteState *bulkstate;
486 :
487 : /*
488 : * The init fork for an unlogged relation in many respects has to be
489 : * treated the same as normal relation, changes need to be WAL logged and
490 : * it needs to be synced to disk.
491 : */
492 178 : copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
493 : forkNum == INIT_FORKNUM;
494 :
495 : /*
496 : * We need to log the copied data in WAL iff WAL archiving/streaming is
497 : * enabled AND it's a permanent relation. This gives the same answer as
498 : * "RelationNeedsWAL(rel) || copying_initfork", because we know the
499 : * current operation created new relation storage.
500 : */
501 194 : use_wal = XLogIsNeeded() &&
502 16 : (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
503 :
504 178 : bulkstate = smgr_bulk_start_smgr(dst, forkNum, use_wal);
505 :
506 178 : nblocks = smgrnblocks(src, forkNum);
507 :
508 1372 : for (blkno = 0; blkno < nblocks; blkno++)
509 : {
510 : BulkWriteBuffer buf;
511 : int piv_flags;
512 : bool checksum_failure;
513 : bool verified;
514 :
515 : /* If we got a cancel signal during the copy of the data, quit */
516 1194 : CHECK_FOR_INTERRUPTS();
517 :
518 1194 : buf = smgr_bulk_get_buf(bulkstate);
519 1194 : smgrread(src, forkNum, blkno, (Page) buf);
520 :
521 1194 : piv_flags = PIV_LOG_WARNING;
522 1194 : if (ignore_checksum_failure)
523 0 : piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
524 1194 : verified = PageIsVerified((Page) buf, blkno, piv_flags,
525 : &checksum_failure);
526 1194 : if (checksum_failure)
527 : {
528 0 : RelFileLocatorBackend rloc = src->smgr_rlocator;
529 :
530 0 : pgstat_prepare_report_checksum_failure(rloc.locator.dbOid);
531 0 : pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
532 : }
533 :
534 1194 : if (!verified)
535 : {
536 : /*
537 : * For paranoia's sake, capture the file path before invoking the
538 : * ereport machinery. This guards against the possibility of a
539 : * relcache flush caused by, e.g., an errcontext callback.
540 : * (errcontext callbacks shouldn't be risking any such thing, but
541 : * people have been known to forget that rule.)
542 : */
543 0 : RelPathStr relpath = relpathbackend(src->smgr_rlocator.locator,
544 : src->smgr_rlocator.backend,
545 : forkNum);
546 :
547 0 : ereport(ERROR,
548 : (errcode(ERRCODE_DATA_CORRUPTED),
549 : errmsg("invalid page in block %u of relation %s",
550 : blkno, relpath.str)));
551 : }
552 :
553 : /*
554 : * Queue the page for WAL-logging and writing out. Unfortunately we
555 : * don't know what kind of a page this is, so we have to log the full
556 : * page including any unused space.
557 : */
558 1194 : smgr_bulk_write(bulkstate, blkno, buf, false);
559 : }
560 178 : smgr_bulk_finish(bulkstate);
561 178 : }
562 :
563 : /*
564 : * RelFileLocatorSkippingWAL
565 : * Check if a BM_PERMANENT relfilelocator is using WAL.
566 : *
567 : * Changes to certain relations must not write WAL; see "Skipping WAL for
568 : * New RelFileLocator" in src/backend/access/transam/README. Though it is
569 : * known from Relation efficiently, this function is intended for the code
570 : * paths not having access to Relation.
571 : */
572 : bool
573 169066 : RelFileLocatorSkippingWAL(RelFileLocator rlocator)
574 : {
575 180644 : if (!pendingSyncHash ||
576 11578 : hash_search(pendingSyncHash, &rlocator, HASH_FIND, NULL) == NULL)
577 165428 : return false;
578 :
579 3638 : return true;
580 : }
581 :
582 : /*
583 : * EstimatePendingSyncsSpace
584 : * Estimate space needed to pass syncs to parallel workers.
585 : */
586 : Size
587 910 : EstimatePendingSyncsSpace(void)
588 : {
589 : long entries;
590 :
591 910 : entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;
592 910 : return mul_size(1 + entries, sizeof(RelFileLocator));
593 : }
594 :
595 : /*
596 : * SerializePendingSyncs
597 : * Serialize syncs for parallel workers.
598 : */
599 : void
600 910 : SerializePendingSyncs(Size maxSize, char *startAddress)
601 : {
602 : HTAB *tmphash;
603 : HASHCTL ctl;
604 : HASH_SEQ_STATUS scan;
605 : PendingRelSync *sync;
606 : PendingRelDelete *delete;
607 : RelFileLocator *src;
608 910 : RelFileLocator *dest = (RelFileLocator *) startAddress;
609 :
610 910 : if (!pendingSyncHash)
611 718 : goto terminate;
612 :
613 : /* Create temporary hash to collect active relfilelocators */
614 192 : ctl.keysize = sizeof(RelFileLocator);
615 192 : ctl.entrysize = sizeof(RelFileLocator);
616 192 : ctl.hcxt = CurrentMemoryContext;
617 192 : tmphash = hash_create("tmp relfilelocators",
618 : hash_get_num_entries(pendingSyncHash), &ctl,
619 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
620 :
621 : /* collect all rlocator from pending syncs */
622 192 : hash_seq_init(&scan, pendingSyncHash);
623 1654 : while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
624 1462 : (void) hash_search(tmphash, &sync->rlocator, HASH_ENTER, NULL);
625 :
626 : /* remove deleted rnodes */
627 1994 : for (delete = pendingDeletes; delete != NULL; delete = delete->next)
628 1802 : if (delete->atCommit)
629 330 : (void) hash_search(tmphash, &delete->rlocator,
630 : HASH_REMOVE, NULL);
631 :
632 192 : hash_seq_init(&scan, tmphash);
633 1332 : while ((src = (RelFileLocator *) hash_seq_search(&scan)))
634 1140 : *dest++ = *src;
635 :
636 192 : hash_destroy(tmphash);
637 :
638 910 : terminate:
639 910 : MemSet(dest, 0, sizeof(RelFileLocator));
640 910 : }
641 :
642 : /*
643 : * RestorePendingSyncs
644 : * Restore syncs within a parallel worker.
645 : *
646 : * RelationNeedsWAL() and RelFileLocatorSkippingWAL() must offer the correct
647 : * answer to parallel workers. Only smgrDoPendingSyncs() reads the
648 : * is_truncated field, at end of transaction. Hence, don't restore it.
649 : */
650 : void
651 2736 : RestorePendingSyncs(char *startAddress)
652 : {
653 : RelFileLocator *rlocator;
654 :
655 : Assert(pendingSyncHash == NULL);
656 6332 : for (rlocator = (RelFileLocator *) startAddress; rlocator->relNumber != 0;
657 3596 : rlocator++)
658 3596 : AddPendingSync(rlocator);
659 2736 : }
660 :
661 : /*
662 : * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
663 : *
664 : * This also runs when aborting a subxact; we want to clean up a failed
665 : * subxact immediately.
666 : *
667 : * Note: It's possible that we're being asked to remove a relation that has
668 : * no physical storage in any fork. In particular, it's possible that we're
669 : * cleaning up an old temporary relation for which RemovePgTempFiles has
670 : * already recovered the physical storage.
671 : */
672 : void
673 825274 : smgrDoPendingDeletes(bool isCommit)
674 : {
675 825274 : int nestLevel = GetCurrentTransactionNestLevel();
676 : PendingRelDelete *pending;
677 : PendingRelDelete *prev;
678 : PendingRelDelete *next;
679 825274 : int nrels = 0,
680 825274 : maxrels = 0;
681 825274 : SMgrRelation *srels = NULL;
682 :
683 825274 : prev = NULL;
684 1019962 : for (pending = pendingDeletes; pending != NULL; pending = next)
685 : {
686 194688 : next = pending->next;
687 194688 : if (pending->nestLevel < nestLevel)
688 : {
689 : /* outer-level entries should not be processed yet */
690 8228 : prev = pending;
691 : }
692 : else
693 : {
694 : /* unlink list entry first, so we don't retry on failure */
695 186460 : if (prev)
696 0 : prev->next = next;
697 : else
698 186460 : pendingDeletes = next;
699 : /* do deletion if called for */
700 186460 : if (pending->atCommit == isCommit)
701 : {
702 : SMgrRelation srel;
703 :
704 74546 : srel = smgropen(pending->rlocator, pending->procNumber);
705 :
706 : /* allocate the initial array, or extend it, if needed */
707 74546 : if (maxrels == 0)
708 : {
709 21676 : maxrels = 8;
710 21676 : srels = palloc(sizeof(SMgrRelation) * maxrels);
711 : }
712 52870 : else if (maxrels <= nrels)
713 : {
714 1718 : maxrels *= 2;
715 1718 : srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
716 : }
717 :
718 74546 : srels[nrels++] = srel;
719 : }
720 : /* must explicitly free the list entry */
721 186460 : pfree(pending);
722 : /* prev does not change */
723 : }
724 : }
725 :
726 825274 : if (nrels > 0)
727 : {
728 21676 : smgrdounlinkall(srels, nrels, false);
729 :
730 96222 : for (int i = 0; i < nrels; i++)
731 74546 : smgrclose(srels[i]);
732 :
733 21676 : pfree(srels);
734 : }
735 825274 : }
736 :
737 : /*
738 : * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
739 : */
740 : void
741 816950 : smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
742 : {
743 : PendingRelDelete *pending;
744 816950 : int nrels = 0,
745 816950 : maxrels = 0;
746 816950 : SMgrRelation *srels = NULL;
747 : HASH_SEQ_STATUS scan;
748 : PendingRelSync *pendingsync;
749 :
750 : Assert(GetCurrentTransactionNestLevel() == 1);
751 :
752 816950 : if (!pendingSyncHash)
753 805440 : return; /* no relation needs sync */
754 :
755 : /* Abort -- just throw away all pending syncs */
756 12490 : if (!isCommit)
757 : {
758 582 : pendingSyncHash = NULL;
759 582 : return;
760 : }
761 :
762 : AssertPendingSyncs_RelationCache();
763 :
764 : /* Parallel worker -- just throw away all pending syncs */
765 11908 : if (isParallelWorker)
766 : {
767 398 : pendingSyncHash = NULL;
768 398 : return;
769 : }
770 :
771 : /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
772 45070 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
773 33560 : if (pending->atCommit)
774 7270 : (void) hash_search(pendingSyncHash, &pending->rlocator,
775 : HASH_REMOVE, NULL);
776 :
777 11510 : hash_seq_init(&scan, pendingSyncHash);
778 82922 : while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
779 : {
780 : ForkNumber fork;
781 : BlockNumber nblocks[MAX_FORKNUM + 1];
782 71412 : uint64 total_blocks = 0;
783 : SMgrRelation srel;
784 :
785 71412 : srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER);
786 :
787 : /*
788 : * We emit newpage WAL records for smaller relations.
789 : *
790 : * Small WAL records have a chance to be flushed along with other
791 : * backends' WAL records. We emit WAL records instead of syncing for
792 : * files that are smaller than a certain threshold, expecting faster
793 : * commit. The threshold is defined by the GUC wal_skip_threshold.
794 : */
795 71412 : if (!pendingsync->is_truncated)
796 : {
797 357060 : for (fork = 0; fork <= MAX_FORKNUM; fork++)
798 : {
799 285648 : if (smgrexists(srel, fork))
800 : {
801 86882 : BlockNumber n = smgrnblocks(srel, fork);
802 :
803 : /* we shouldn't come here for unlogged relations */
804 : Assert(fork != INIT_FORKNUM);
805 86882 : nblocks[fork] = n;
806 86882 : total_blocks += n;
807 : }
808 : else
809 198766 : nblocks[fork] = InvalidBlockNumber;
810 : }
811 : }
812 :
813 : /*
814 : * Sync file or emit WAL records for its contents.
815 : *
816 : * Although we emit WAL record if the file is small enough, do file
817 : * sync regardless of the size if the file has experienced a
818 : * truncation. It is because the file would be followed by trailing
819 : * garbage blocks after a crash recovery if, while a past longer file
820 : * had been flushed out, we omitted syncing-out of the file and
821 : * emitted WAL instead. You might think that we could choose WAL if
822 : * the current main fork is longer than ever, but there's a case where
823 : * main fork is longer than ever but FSM fork gets shorter.
824 : */
825 71412 : if (pendingsync->is_truncated ||
826 71412 : total_blocks >= wal_skip_threshold * (uint64) 1024 / BLCKSZ)
827 : {
828 : /* allocate the initial array, or extend it, if needed */
829 20 : if (maxrels == 0)
830 : {
831 20 : maxrels = 8;
832 20 : srels = palloc(sizeof(SMgrRelation) * maxrels);
833 : }
834 0 : else if (maxrels <= nrels)
835 : {
836 0 : maxrels *= 2;
837 0 : srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
838 : }
839 :
840 20 : srels[nrels++] = srel;
841 : }
842 : else
843 : {
844 : /* Emit WAL records for all blocks. The file is small enough. */
845 356960 : for (fork = 0; fork <= MAX_FORKNUM; fork++)
846 : {
847 285568 : int n = nblocks[fork];
848 : Relation rel;
849 :
850 285568 : if (!BlockNumberIsValid(n))
851 198708 : continue;
852 :
853 : /*
854 : * Emit WAL for the whole file. Unfortunately we don't know
855 : * what kind of a page this is, so we have to log the full
856 : * page including any unused space. ReadBufferExtended()
857 : * counts some pgstat events; unfortunately, we discard them.
858 : */
859 86860 : rel = CreateFakeRelcacheEntry(srel->smgr_rlocator.locator);
860 86860 : log_newpage_range(rel, fork, 0, n, false);
861 86860 : FreeFakeRelcacheEntry(rel);
862 : }
863 : }
864 : }
865 :
866 11510 : pendingSyncHash = NULL;
867 :
868 11510 : if (nrels > 0)
869 : {
870 20 : smgrdosyncall(srels, nrels);
871 20 : pfree(srels);
872 : }
873 : }
874 :
875 : /*
876 : * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
877 : *
878 : * The return value is the number of relations scheduled for termination.
879 : * *ptr is set to point to a freshly-palloc'd array of RelFileLocators.
880 : * If there are no relations to be deleted, *ptr is set to NULL.
881 : *
882 : * Only non-temporary relations are included in the returned list. This is OK
883 : * because the list is used only in contexts where temporary relations don't
884 : * matter: we're either writing to the two-phase state file (and transactions
885 : * that have touched temp tables can't be prepared) or we're writing to xlog
886 : * (and all temporary files will be zapped if we restart anyway, so no need
887 : * for redo to do it also).
888 : *
889 : * Note that the list does not include anything scheduled for termination
890 : * by upper-level transactions.
891 : */
892 : int
893 777590 : smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr)
894 : {
895 777590 : int nestLevel = GetCurrentTransactionNestLevel();
896 : int nrels;
897 : RelFileLocator *rptr;
898 : PendingRelDelete *pending;
899 :
900 777590 : nrels = 0;
901 969434 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
902 : {
903 191844 : if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
904 74666 : && pending->procNumber == INVALID_PROC_NUMBER)
905 68538 : nrels++;
906 : }
907 777590 : if (nrels == 0)
908 : {
909 757402 : *ptr = NULL;
910 757402 : return 0;
911 : }
912 20188 : rptr = (RelFileLocator *) palloc(nrels * sizeof(RelFileLocator));
913 20188 : *ptr = rptr;
914 105426 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
915 : {
916 85238 : if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
917 68686 : && pending->procNumber == INVALID_PROC_NUMBER)
918 : {
919 68538 : *rptr = pending->rlocator;
920 68538 : rptr++;
921 : }
922 : }
923 20188 : return nrels;
924 : }
925 :
926 : /*
927 : * PostPrepare_smgr -- Clean up after a successful PREPARE
928 : *
929 : * What we have to do here is throw away the in-memory state about pending
930 : * relation deletes. It's all been recorded in the 2PC state file and
931 : * it's no longer smgr's job to worry about it.
932 : */
933 : void
934 562 : PostPrepare_smgr(void)
935 : {
936 : PendingRelDelete *pending;
937 : PendingRelDelete *next;
938 :
939 682 : for (pending = pendingDeletes; pending != NULL; pending = next)
940 : {
941 120 : next = pending->next;
942 120 : pendingDeletes = next;
943 : /* must explicitly free the list entry */
944 120 : pfree(pending);
945 : }
946 562 : }
947 :
948 :
949 : /*
950 : * AtSubCommit_smgr() --- Take care of subtransaction commit.
951 : *
952 : * Reassign all items in the pending-deletes list to the parent transaction.
953 : */
954 : void
955 10618 : AtSubCommit_smgr(void)
956 : {
957 10618 : int nestLevel = GetCurrentTransactionNestLevel();
958 : PendingRelDelete *pending;
959 :
960 11074 : for (pending = pendingDeletes; pending != NULL; pending = pending->next)
961 : {
962 456 : if (pending->nestLevel >= nestLevel)
963 214 : pending->nestLevel = nestLevel - 1;
964 : }
965 10618 : }
966 :
967 : /*
968 : * AtSubAbort_smgr() --- Take care of subtransaction abort.
969 : *
970 : * Delete created relations and forget about deleted relations.
971 : * We can execute these operations immediately because we know this
972 : * subtransaction will not commit.
973 : */
974 : void
975 9292 : AtSubAbort_smgr(void)
976 : {
977 9292 : smgrDoPendingDeletes(false);
978 9292 : }
979 :
980 : void
981 31544 : smgr_redo(XLogReaderState *record)
982 : {
983 31544 : XLogRecPtr lsn = record->EndRecPtr;
984 31544 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
985 :
986 : /* Backup blocks are not used in smgr records */
987 : Assert(!XLogRecHasAnyBlockRefs(record));
988 :
989 31544 : if (info == XLOG_SMGR_CREATE)
990 : {
991 31448 : xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
992 : SMgrRelation reln;
993 :
994 31448 : reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
995 31448 : smgrcreate(reln, xlrec->forkNum, true);
996 : }
997 96 : else if (info == XLOG_SMGR_TRUNCATE)
998 : {
999 96 : xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
1000 : SMgrRelation reln;
1001 : Relation rel;
1002 : ForkNumber forks[MAX_FORKNUM];
1003 : BlockNumber blocks[MAX_FORKNUM];
1004 : BlockNumber old_blocks[MAX_FORKNUM];
1005 96 : int nforks = 0;
1006 96 : bool need_fsm_vacuum = false;
1007 :
1008 96 : reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
1009 :
1010 : /*
1011 : * Forcibly create relation if it doesn't exist (which suggests that
1012 : * it was dropped somewhere later in the WAL sequence). As in
1013 : * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
1014 : * log as best we can until the drop is seen.
1015 : */
1016 96 : smgrcreate(reln, MAIN_FORKNUM, true);
1017 :
1018 : /*
1019 : * Before we perform the truncation, update minimum recovery point to
1020 : * cover this WAL record. Once the relation is truncated, there's no
1021 : * going back. The buffer manager enforces the WAL-first rule for
1022 : * normal updates to relation files, so that the minimum recovery
1023 : * point is always updated before the corresponding change in the data
1024 : * file is flushed to disk. We have to do the same manually here.
1025 : *
1026 : * Doing this before the truncation means that if the truncation fails
1027 : * for some reason, you cannot start up the system even after restart,
1028 : * until you fix the underlying situation so that the truncation will
1029 : * succeed. Alternatively, we could update the minimum recovery point
1030 : * after truncation, but that would leave a small window where the
1031 : * WAL-first rule could be violated.
1032 : */
1033 96 : XLogFlush(lsn);
1034 :
1035 : /* Prepare for truncation of MAIN fork */
1036 96 : if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1037 : {
1038 96 : forks[nforks] = MAIN_FORKNUM;
1039 96 : old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
1040 96 : blocks[nforks] = xlrec->blkno;
1041 96 : nforks++;
1042 :
1043 : /* Also tell xlogutils.c about it */
1044 96 : XLogTruncateRelation(xlrec->rlocator, MAIN_FORKNUM, xlrec->blkno);
1045 : }
1046 :
1047 : /* Prepare for truncation of FSM and VM too */
1048 96 : rel = CreateFakeRelcacheEntry(xlrec->rlocator);
1049 :
1050 192 : if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
1051 96 : smgrexists(reln, FSM_FORKNUM))
1052 : {
1053 60 : blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
1054 60 : if (BlockNumberIsValid(blocks[nforks]))
1055 : {
1056 60 : forks[nforks] = FSM_FORKNUM;
1057 60 : old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
1058 60 : nforks++;
1059 60 : need_fsm_vacuum = true;
1060 : }
1061 : }
1062 192 : if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
1063 96 : smgrexists(reln, VISIBILITYMAP_FORKNUM))
1064 : {
1065 50 : blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
1066 50 : if (BlockNumberIsValid(blocks[nforks]))
1067 : {
1068 22 : forks[nforks] = VISIBILITYMAP_FORKNUM;
1069 22 : old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
1070 22 : nforks++;
1071 : }
1072 : }
1073 :
1074 : /* Do the real work to truncate relation forks */
1075 96 : if (nforks > 0)
1076 : {
1077 96 : START_CRIT_SECTION();
1078 96 : smgrtruncate(reln, forks, nforks, old_blocks, blocks);
1079 96 : END_CRIT_SECTION();
1080 : }
1081 :
1082 : /*
1083 : * Update upper-level FSM pages to account for the truncation. This is
1084 : * important because the just-truncated pages were likely marked as
1085 : * all-free, and would be preferentially selected.
1086 : */
1087 96 : if (need_fsm_vacuum)
1088 60 : FreeSpaceMapVacuumRange(rel, xlrec->blkno,
1089 : InvalidBlockNumber);
1090 :
1091 96 : FreeFakeRelcacheEntry(rel);
1092 : }
1093 : else
1094 0 : elog(PANIC, "smgr_redo: unknown op code %u", info);
1095 31544 : }
|